diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,71955 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10275, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.732360097323601e-05, + "grad_norm": 16.13226202148005, + "learning_rate": 3.2362459546925574e-08, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00019464720194647202, + "grad_norm": 15.765097511926365, + "learning_rate": 6.472491909385115e-08, + "loss": 1.384, + "step": 2 + }, + { + "epoch": 0.00029197080291970805, + "grad_norm": 16.64113665586635, + "learning_rate": 9.70873786407767e-08, + "loss": 1.2291, + "step": 3 + }, + { + "epoch": 0.00038929440389294404, + "grad_norm": 20.34864047521242, + "learning_rate": 1.294498381877023e-07, + "loss": 0.9025, + "step": 4 + }, + { + "epoch": 0.00048661800486618007, + "grad_norm": 28.710711096046108, + "learning_rate": 1.6181229773462782e-07, + "loss": 1.0305, + "step": 5 + }, + { + "epoch": 0.0005839416058394161, + "grad_norm": 21.945801992582915, + "learning_rate": 1.941747572815534e-07, + "loss": 1.0979, + "step": 6 + }, + { + "epoch": 0.0006812652068126521, + "grad_norm": 23.947905905644966, + "learning_rate": 2.26537216828479e-07, + "loss": 1.1909, + "step": 7 + }, + { + "epoch": 0.0007785888077858881, + "grad_norm": 19.835016686730835, + "learning_rate": 2.588996763754046e-07, + "loss": 1.2083, + "step": 8 + }, + { + "epoch": 0.0008759124087591241, + "grad_norm": 16.926846352507788, + "learning_rate": 2.9126213592233014e-07, + "loss": 1.2369, + "step": 9 + }, + { + "epoch": 0.0009732360097323601, + "grad_norm": 21.349924470647284, + "learning_rate": 3.2362459546925565e-07, + "loss": 1.0052, + "step": 10 + }, + { + "epoch": 0.0010705596107055961, + "grad_norm": 25.127579741628022, + "learning_rate": 3.5598705501618125e-07, + "loss": 1.2631, + "step": 11 + }, + { + "epoch": 0.0011678832116788322, + "grad_norm": 12.524049131196549, + "learning_rate": 3.883495145631068e-07, + "loss": 1.0884, + "step": 12 + }, + { + "epoch": 0.001265206812652068, + "grad_norm": 20.706648432487587, + "learning_rate": 4.207119741100324e-07, + "loss": 1.1469, + "step": 13 + }, + { + "epoch": 0.0013625304136253042, + "grad_norm": 17.655230197318655, + "learning_rate": 4.53074433656958e-07, + "loss": 1.2922, + "step": 14 + }, + { + "epoch": 0.00145985401459854, + "grad_norm": 16.550170455008725, + "learning_rate": 4.854368932038835e-07, + "loss": 1.1792, + "step": 15 + }, + { + "epoch": 0.0015571776155717761, + "grad_norm": 24.456798845425887, + "learning_rate": 5.177993527508092e-07, + "loss": 1.0804, + "step": 16 + }, + { + "epoch": 0.0016545012165450122, + "grad_norm": 14.659117460865279, + "learning_rate": 5.501618122977346e-07, + "loss": 1.0973, + "step": 17 + }, + { + "epoch": 0.0017518248175182481, + "grad_norm": 15.324823146378344, + "learning_rate": 5.825242718446603e-07, + "loss": 0.9791, + "step": 18 + }, + { + "epoch": 0.0018491484184914842, + "grad_norm": 12.483869597287145, + "learning_rate": 6.148867313915858e-07, + "loss": 1.0829, + "step": 19 + }, + { + "epoch": 0.0019464720194647203, + "grad_norm": 11.921211994178957, + "learning_rate": 6.472491909385113e-07, + "loss": 0.6862, + "step": 20 + }, + { + "epoch": 0.0020437956204379564, + "grad_norm": 14.53279456676939, + "learning_rate": 6.79611650485437e-07, + "loss": 0.7814, + "step": 21 + }, + { + "epoch": 0.0021411192214111923, + "grad_norm": 15.68359520937104, + "learning_rate": 7.119741100323625e-07, + "loss": 0.883, + "step": 22 + }, + { + "epoch": 0.002238442822384428, + "grad_norm": 14.062468532950906, + "learning_rate": 7.443365695792882e-07, + "loss": 1.0087, + "step": 23 + }, + { + "epoch": 0.0023357664233576644, + "grad_norm": 11.150778403716444, + "learning_rate": 7.766990291262136e-07, + "loss": 0.4884, + "step": 24 + }, + { + "epoch": 0.0024330900243309003, + "grad_norm": 7.740982223602688, + "learning_rate": 8.090614886731392e-07, + "loss": 0.8543, + "step": 25 + }, + { + "epoch": 0.002530413625304136, + "grad_norm": 6.4338060169141915, + "learning_rate": 8.414239482200648e-07, + "loss": 0.7948, + "step": 26 + }, + { + "epoch": 0.002627737226277372, + "grad_norm": 6.227022582398367, + "learning_rate": 8.737864077669904e-07, + "loss": 0.7814, + "step": 27 + }, + { + "epoch": 0.0027250608272506084, + "grad_norm": 7.989531820662516, + "learning_rate": 9.06148867313916e-07, + "loss": 0.5645, + "step": 28 + }, + { + "epoch": 0.0028223844282238442, + "grad_norm": 6.4745089193753, + "learning_rate": 9.385113268608415e-07, + "loss": 0.6802, + "step": 29 + }, + { + "epoch": 0.00291970802919708, + "grad_norm": 8.23650018531745, + "learning_rate": 9.70873786407767e-07, + "loss": 0.6218, + "step": 30 + }, + { + "epoch": 0.0030170316301703164, + "grad_norm": 4.915479010119541, + "learning_rate": 1.0032362459546926e-06, + "loss": 0.8879, + "step": 31 + }, + { + "epoch": 0.0031143552311435523, + "grad_norm": 4.288138757396447, + "learning_rate": 1.0355987055016184e-06, + "loss": 0.5917, + "step": 32 + }, + { + "epoch": 0.003211678832116788, + "grad_norm": 4.230901102531741, + "learning_rate": 1.0679611650485437e-06, + "loss": 0.7373, + "step": 33 + }, + { + "epoch": 0.0033090024330900245, + "grad_norm": 4.714303656539792, + "learning_rate": 1.1003236245954693e-06, + "loss": 0.5886, + "step": 34 + }, + { + "epoch": 0.0034063260340632603, + "grad_norm": 4.1204943469600925, + "learning_rate": 1.132686084142395e-06, + "loss": 0.5991, + "step": 35 + }, + { + "epoch": 0.0035036496350364962, + "grad_norm": 3.124375547961107, + "learning_rate": 1.1650485436893206e-06, + "loss": 0.432, + "step": 36 + }, + { + "epoch": 0.0036009732360097325, + "grad_norm": 3.741153837090354, + "learning_rate": 1.197411003236246e-06, + "loss": 0.6379, + "step": 37 + }, + { + "epoch": 0.0036982968369829684, + "grad_norm": 3.7740270813504506, + "learning_rate": 1.2297734627831717e-06, + "loss": 0.5595, + "step": 38 + }, + { + "epoch": 0.0037956204379562043, + "grad_norm": 4.783986424289694, + "learning_rate": 1.2621359223300972e-06, + "loss": 0.8717, + "step": 39 + }, + { + "epoch": 0.0038929440389294406, + "grad_norm": 4.242597978097827, + "learning_rate": 1.2944983818770226e-06, + "loss": 0.6632, + "step": 40 + }, + { + "epoch": 0.0039902676399026765, + "grad_norm": 4.309602952976607, + "learning_rate": 1.3268608414239483e-06, + "loss": 0.7191, + "step": 41 + }, + { + "epoch": 0.004087591240875913, + "grad_norm": 4.136462382872819, + "learning_rate": 1.359223300970874e-06, + "loss": 0.6782, + "step": 42 + }, + { + "epoch": 0.004184914841849148, + "grad_norm": 4.2148643401229, + "learning_rate": 1.3915857605177997e-06, + "loss": 0.8932, + "step": 43 + }, + { + "epoch": 0.0042822384428223845, + "grad_norm": 3.829331188520966, + "learning_rate": 1.423948220064725e-06, + "loss": 0.4697, + "step": 44 + }, + { + "epoch": 0.004379562043795621, + "grad_norm": 3.4564347781684557, + "learning_rate": 1.4563106796116506e-06, + "loss": 0.3377, + "step": 45 + }, + { + "epoch": 0.004476885644768856, + "grad_norm": 3.319649807488789, + "learning_rate": 1.4886731391585763e-06, + "loss": 0.4589, + "step": 46 + }, + { + "epoch": 0.0045742092457420926, + "grad_norm": 3.8856546910308034, + "learning_rate": 1.5210355987055017e-06, + "loss": 0.8413, + "step": 47 + }, + { + "epoch": 0.004671532846715329, + "grad_norm": 3.7955924171570605, + "learning_rate": 1.5533980582524272e-06, + "loss": 0.588, + "step": 48 + }, + { + "epoch": 0.004768856447688564, + "grad_norm": 4.5762685715882805, + "learning_rate": 1.585760517799353e-06, + "loss": 0.6472, + "step": 49 + }, + { + "epoch": 0.004866180048661801, + "grad_norm": 4.284420204063246, + "learning_rate": 1.6181229773462783e-06, + "loss": 0.5233, + "step": 50 + }, + { + "epoch": 0.004963503649635037, + "grad_norm": 4.0399534913964645, + "learning_rate": 1.650485436893204e-06, + "loss": 0.6737, + "step": 51 + }, + { + "epoch": 0.005060827250608272, + "grad_norm": 4.850258079033273, + "learning_rate": 1.6828478964401297e-06, + "loss": 0.5017, + "step": 52 + }, + { + "epoch": 0.005158150851581509, + "grad_norm": 3.289730774319516, + "learning_rate": 1.715210355987055e-06, + "loss": 0.6378, + "step": 53 + }, + { + "epoch": 0.005255474452554744, + "grad_norm": 3.116783938182044, + "learning_rate": 1.7475728155339808e-06, + "loss": 0.5681, + "step": 54 + }, + { + "epoch": 0.00535279805352798, + "grad_norm": 3.5896487509946677, + "learning_rate": 1.7799352750809063e-06, + "loss": 0.5222, + "step": 55 + }, + { + "epoch": 0.005450121654501217, + "grad_norm": 3.3627737905222146, + "learning_rate": 1.812297734627832e-06, + "loss": 0.351, + "step": 56 + }, + { + "epoch": 0.005547445255474452, + "grad_norm": 3.405981770724818, + "learning_rate": 1.8446601941747574e-06, + "loss": 0.5832, + "step": 57 + }, + { + "epoch": 0.0056447688564476885, + "grad_norm": 3.231134680455488, + "learning_rate": 1.877022653721683e-06, + "loss": 0.5558, + "step": 58 + }, + { + "epoch": 0.005742092457420925, + "grad_norm": 4.2963387449464605, + "learning_rate": 1.9093851132686085e-06, + "loss": 0.7544, + "step": 59 + }, + { + "epoch": 0.00583941605839416, + "grad_norm": 3.3678084152804315, + "learning_rate": 1.941747572815534e-06, + "loss": 0.554, + "step": 60 + }, + { + "epoch": 0.0059367396593673965, + "grad_norm": 3.635756089652443, + "learning_rate": 1.9741100323624596e-06, + "loss": 0.5312, + "step": 61 + }, + { + "epoch": 0.006034063260340633, + "grad_norm": 3.91764256649437, + "learning_rate": 2.006472491909385e-06, + "loss": 0.4329, + "step": 62 + }, + { + "epoch": 0.006131386861313868, + "grad_norm": 3.4866607421863565, + "learning_rate": 2.0388349514563107e-06, + "loss": 0.4453, + "step": 63 + }, + { + "epoch": 0.006228710462287105, + "grad_norm": 2.9369425143161147, + "learning_rate": 2.0711974110032367e-06, + "loss": 0.467, + "step": 64 + }, + { + "epoch": 0.006326034063260341, + "grad_norm": 3.0906723589687024, + "learning_rate": 2.103559870550162e-06, + "loss": 0.3917, + "step": 65 + }, + { + "epoch": 0.006423357664233576, + "grad_norm": 3.5121616512799747, + "learning_rate": 2.1359223300970874e-06, + "loss": 0.6428, + "step": 66 + }, + { + "epoch": 0.006520681265206813, + "grad_norm": 3.470270871630247, + "learning_rate": 2.1682847896440134e-06, + "loss": 0.586, + "step": 67 + }, + { + "epoch": 0.006618004866180049, + "grad_norm": 2.8689679430782498, + "learning_rate": 2.2006472491909385e-06, + "loss": 0.2938, + "step": 68 + }, + { + "epoch": 0.006715328467153284, + "grad_norm": 4.115573400175418, + "learning_rate": 2.2330097087378645e-06, + "loss": 0.3855, + "step": 69 + }, + { + "epoch": 0.006812652068126521, + "grad_norm": 3.903319335204406, + "learning_rate": 2.26537216828479e-06, + "loss": 0.6272, + "step": 70 + }, + { + "epoch": 0.006909975669099757, + "grad_norm": 2.649165320750572, + "learning_rate": 2.297734627831715e-06, + "loss": 0.5229, + "step": 71 + }, + { + "epoch": 0.0070072992700729924, + "grad_norm": 2.8543884488184235, + "learning_rate": 2.330097087378641e-06, + "loss": 0.4006, + "step": 72 + }, + { + "epoch": 0.007104622871046229, + "grad_norm": 2.9817247056794134, + "learning_rate": 2.3624595469255667e-06, + "loss": 0.2331, + "step": 73 + }, + { + "epoch": 0.007201946472019465, + "grad_norm": 3.592880940053797, + "learning_rate": 2.394822006472492e-06, + "loss": 0.4889, + "step": 74 + }, + { + "epoch": 0.0072992700729927005, + "grad_norm": 2.89844013224274, + "learning_rate": 2.427184466019418e-06, + "loss": 0.4711, + "step": 75 + }, + { + "epoch": 0.007396593673965937, + "grad_norm": 2.6071345596032134, + "learning_rate": 2.4595469255663434e-06, + "loss": 0.4844, + "step": 76 + }, + { + "epoch": 0.007493917274939173, + "grad_norm": 2.9053930844585776, + "learning_rate": 2.491909385113269e-06, + "loss": 0.5163, + "step": 77 + }, + { + "epoch": 0.0075912408759124085, + "grad_norm": 3.4016540038418115, + "learning_rate": 2.5242718446601945e-06, + "loss": 0.5852, + "step": 78 + }, + { + "epoch": 0.007688564476885645, + "grad_norm": 2.7133170026932887, + "learning_rate": 2.55663430420712e-06, + "loss": 0.4934, + "step": 79 + }, + { + "epoch": 0.007785888077858881, + "grad_norm": 3.2321439410345585, + "learning_rate": 2.588996763754045e-06, + "loss": 0.62, + "step": 80 + }, + { + "epoch": 0.007883211678832117, + "grad_norm": 2.6835948161160545, + "learning_rate": 2.621359223300971e-06, + "loss": 0.4689, + "step": 81 + }, + { + "epoch": 0.007980535279805353, + "grad_norm": 4.716894934604404, + "learning_rate": 2.6537216828478967e-06, + "loss": 0.3364, + "step": 82 + }, + { + "epoch": 0.00807785888077859, + "grad_norm": 2.6507857723180646, + "learning_rate": 2.686084142394822e-06, + "loss": 0.3785, + "step": 83 + }, + { + "epoch": 0.008175182481751826, + "grad_norm": 2.356714630861861, + "learning_rate": 2.718446601941748e-06, + "loss": 0.2591, + "step": 84 + }, + { + "epoch": 0.00827250608272506, + "grad_norm": 2.755477478688418, + "learning_rate": 2.7508090614886734e-06, + "loss": 0.4762, + "step": 85 + }, + { + "epoch": 0.008369829683698296, + "grad_norm": 3.7771581783688837, + "learning_rate": 2.7831715210355993e-06, + "loss": 0.4627, + "step": 86 + }, + { + "epoch": 0.008467153284671533, + "grad_norm": 2.8568450908810257, + "learning_rate": 2.8155339805825245e-06, + "loss": 0.4322, + "step": 87 + }, + { + "epoch": 0.008564476885644769, + "grad_norm": 2.914756058289183, + "learning_rate": 2.84789644012945e-06, + "loss": 0.4835, + "step": 88 + }, + { + "epoch": 0.008661800486618005, + "grad_norm": 2.414182197047686, + "learning_rate": 2.880258899676376e-06, + "loss": 0.493, + "step": 89 + }, + { + "epoch": 0.008759124087591242, + "grad_norm": 2.8597853736106975, + "learning_rate": 2.912621359223301e-06, + "loss": 0.6063, + "step": 90 + }, + { + "epoch": 0.008856447688564476, + "grad_norm": 2.4567808863650007, + "learning_rate": 2.9449838187702267e-06, + "loss": 0.5874, + "step": 91 + }, + { + "epoch": 0.008953771289537713, + "grad_norm": 2.819434031784131, + "learning_rate": 2.9773462783171527e-06, + "loss": 0.552, + "step": 92 + }, + { + "epoch": 0.009051094890510949, + "grad_norm": 1.9840396387462764, + "learning_rate": 3.0097087378640778e-06, + "loss": 0.3736, + "step": 93 + }, + { + "epoch": 0.009148418491484185, + "grad_norm": 2.52047300259283, + "learning_rate": 3.0420711974110033e-06, + "loss": 0.407, + "step": 94 + }, + { + "epoch": 0.009245742092457421, + "grad_norm": 3.140839526692518, + "learning_rate": 3.0744336569579293e-06, + "loss": 0.6513, + "step": 95 + }, + { + "epoch": 0.009343065693430658, + "grad_norm": 3.1368865731879554, + "learning_rate": 3.1067961165048544e-06, + "loss": 0.4804, + "step": 96 + }, + { + "epoch": 0.009440389294403892, + "grad_norm": 2.6987222968513196, + "learning_rate": 3.13915857605178e-06, + "loss": 0.4228, + "step": 97 + }, + { + "epoch": 0.009537712895377129, + "grad_norm": 2.5779408707034026, + "learning_rate": 3.171521035598706e-06, + "loss": 0.4654, + "step": 98 + }, + { + "epoch": 0.009635036496350365, + "grad_norm": 2.5189587792888934, + "learning_rate": 3.2038834951456315e-06, + "loss": 0.5465, + "step": 99 + }, + { + "epoch": 0.009732360097323601, + "grad_norm": 2.457408493992738, + "learning_rate": 3.2362459546925567e-06, + "loss": 0.5077, + "step": 100 + }, + { + "epoch": 0.009829683698296838, + "grad_norm": 2.445932328031196, + "learning_rate": 3.2686084142394826e-06, + "loss": 0.492, + "step": 101 + }, + { + "epoch": 0.009927007299270074, + "grad_norm": 2.3199141960061915, + "learning_rate": 3.300970873786408e-06, + "loss": 0.4432, + "step": 102 + }, + { + "epoch": 0.010024330900243308, + "grad_norm": 3.88769555780582, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3684, + "step": 103 + }, + { + "epoch": 0.010121654501216545, + "grad_norm": 2.63905676146042, + "learning_rate": 3.3656957928802593e-06, + "loss": 0.4238, + "step": 104 + }, + { + "epoch": 0.010218978102189781, + "grad_norm": 3.0073749174392885, + "learning_rate": 3.398058252427185e-06, + "loss": 0.4655, + "step": 105 + }, + { + "epoch": 0.010316301703163017, + "grad_norm": 2.613524831872459, + "learning_rate": 3.43042071197411e-06, + "loss": 0.4948, + "step": 106 + }, + { + "epoch": 0.010413625304136254, + "grad_norm": 2.4293628733346764, + "learning_rate": 3.462783171521036e-06, + "loss": 0.3717, + "step": 107 + }, + { + "epoch": 0.010510948905109488, + "grad_norm": 3.3036504610837016, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.4939, + "step": 108 + }, + { + "epoch": 0.010608272506082725, + "grad_norm": 2.6808221933664846, + "learning_rate": 3.5275080906148866e-06, + "loss": 0.4809, + "step": 109 + }, + { + "epoch": 0.01070559610705596, + "grad_norm": 2.853958419293739, + "learning_rate": 3.5598705501618126e-06, + "loss": 0.4066, + "step": 110 + }, + { + "epoch": 0.010802919708029197, + "grad_norm": 5.3412930378250145, + "learning_rate": 3.592233009708738e-06, + "loss": 0.3599, + "step": 111 + }, + { + "epoch": 0.010900243309002433, + "grad_norm": 2.983669976646381, + "learning_rate": 3.624595469255664e-06, + "loss": 0.6187, + "step": 112 + }, + { + "epoch": 0.01099756690997567, + "grad_norm": 3.388543821878077, + "learning_rate": 3.6569579288025893e-06, + "loss": 0.717, + "step": 113 + }, + { + "epoch": 0.011094890510948904, + "grad_norm": 3.0720120062792127, + "learning_rate": 3.689320388349515e-06, + "loss": 0.5057, + "step": 114 + }, + { + "epoch": 0.01119221411192214, + "grad_norm": 2.521868238475485, + "learning_rate": 3.721682847896441e-06, + "loss": 0.4308, + "step": 115 + }, + { + "epoch": 0.011289537712895377, + "grad_norm": 2.641085251645149, + "learning_rate": 3.754045307443366e-06, + "loss": 0.4047, + "step": 116 + }, + { + "epoch": 0.011386861313868613, + "grad_norm": 2.6936547530255828, + "learning_rate": 3.7864077669902915e-06, + "loss": 0.5548, + "step": 117 + }, + { + "epoch": 0.01148418491484185, + "grad_norm": 5.599830434139348, + "learning_rate": 3.818770226537217e-06, + "loss": 0.5338, + "step": 118 + }, + { + "epoch": 0.011581508515815086, + "grad_norm": 2.6372065340185378, + "learning_rate": 3.851132686084142e-06, + "loss": 0.4833, + "step": 119 + }, + { + "epoch": 0.01167883211678832, + "grad_norm": 2.555049765563167, + "learning_rate": 3.883495145631068e-06, + "loss": 0.4295, + "step": 120 + }, + { + "epoch": 0.011776155717761557, + "grad_norm": 2.22725048478721, + "learning_rate": 3.915857605177994e-06, + "loss": 0.4074, + "step": 121 + }, + { + "epoch": 0.011873479318734793, + "grad_norm": 3.0093045583939984, + "learning_rate": 3.948220064724919e-06, + "loss": 0.7168, + "step": 122 + }, + { + "epoch": 0.01197080291970803, + "grad_norm": 2.8800338131191223, + "learning_rate": 3.980582524271845e-06, + "loss": 0.3826, + "step": 123 + }, + { + "epoch": 0.012068126520681266, + "grad_norm": 2.3197904571086974, + "learning_rate": 4.01294498381877e-06, + "loss": 0.2584, + "step": 124 + }, + { + "epoch": 0.012165450121654502, + "grad_norm": 2.929540360888414, + "learning_rate": 4.045307443365696e-06, + "loss": 0.4617, + "step": 125 + }, + { + "epoch": 0.012262773722627737, + "grad_norm": 2.5602803735383137, + "learning_rate": 4.0776699029126215e-06, + "loss": 0.2561, + "step": 126 + }, + { + "epoch": 0.012360097323600973, + "grad_norm": 2.676345297957673, + "learning_rate": 4.1100323624595475e-06, + "loss": 0.2996, + "step": 127 + }, + { + "epoch": 0.01245742092457421, + "grad_norm": 1.9047794610871986, + "learning_rate": 4.1423948220064734e-06, + "loss": 0.3475, + "step": 128 + }, + { + "epoch": 0.012554744525547445, + "grad_norm": 2.9014607006450555, + "learning_rate": 4.1747572815533986e-06, + "loss": 0.4748, + "step": 129 + }, + { + "epoch": 0.012652068126520682, + "grad_norm": 2.2992367182815987, + "learning_rate": 4.207119741100324e-06, + "loss": 0.3465, + "step": 130 + }, + { + "epoch": 0.012749391727493918, + "grad_norm": 2.668874383033437, + "learning_rate": 4.23948220064725e-06, + "loss": 0.6119, + "step": 131 + }, + { + "epoch": 0.012846715328467153, + "grad_norm": 2.69106703615133, + "learning_rate": 4.271844660194175e-06, + "loss": 0.4743, + "step": 132 + }, + { + "epoch": 0.012944038929440389, + "grad_norm": 2.972314561813759, + "learning_rate": 4.304207119741101e-06, + "loss": 0.5766, + "step": 133 + }, + { + "epoch": 0.013041362530413625, + "grad_norm": 2.7487017428059635, + "learning_rate": 4.336569579288027e-06, + "loss": 0.5818, + "step": 134 + }, + { + "epoch": 0.013138686131386862, + "grad_norm": 3.1117207482379663, + "learning_rate": 4.368932038834952e-06, + "loss": 0.6468, + "step": 135 + }, + { + "epoch": 0.013236009732360098, + "grad_norm": 2.781796948090657, + "learning_rate": 4.401294498381877e-06, + "loss": 0.7209, + "step": 136 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 2.5480533986327556, + "learning_rate": 4.433656957928803e-06, + "loss": 0.5907, + "step": 137 + }, + { + "epoch": 0.013430656934306569, + "grad_norm": 2.054397852683208, + "learning_rate": 4.466019417475729e-06, + "loss": 0.4079, + "step": 138 + }, + { + "epoch": 0.013527980535279805, + "grad_norm": 2.2564046621809037, + "learning_rate": 4.498381877022654e-06, + "loss": 0.4, + "step": 139 + }, + { + "epoch": 0.013625304136253041, + "grad_norm": 2.8739841159071022, + "learning_rate": 4.53074433656958e-06, + "loss": 0.5819, + "step": 140 + }, + { + "epoch": 0.013722627737226278, + "grad_norm": 2.6418540847993657, + "learning_rate": 4.563106796116505e-06, + "loss": 0.589, + "step": 141 + }, + { + "epoch": 0.013819951338199514, + "grad_norm": 2.431908870746442, + "learning_rate": 4.59546925566343e-06, + "loss": 0.5468, + "step": 142 + }, + { + "epoch": 0.013917274939172749, + "grad_norm": 4.44933942542394, + "learning_rate": 4.627831715210356e-06, + "loss": 0.3846, + "step": 143 + }, + { + "epoch": 0.014014598540145985, + "grad_norm": 2.2469929628351126, + "learning_rate": 4.660194174757282e-06, + "loss": 0.3047, + "step": 144 + }, + { + "epoch": 0.014111922141119221, + "grad_norm": 2.8361034502388205, + "learning_rate": 4.6925566343042074e-06, + "loss": 0.4186, + "step": 145 + }, + { + "epoch": 0.014209245742092457, + "grad_norm": 2.485184255788147, + "learning_rate": 4.724919093851133e-06, + "loss": 0.455, + "step": 146 + }, + { + "epoch": 0.014306569343065694, + "grad_norm": 2.677307495548506, + "learning_rate": 4.7572815533980585e-06, + "loss": 0.6346, + "step": 147 + }, + { + "epoch": 0.01440389294403893, + "grad_norm": 2.9440091029213034, + "learning_rate": 4.789644012944984e-06, + "loss": 0.4961, + "step": 148 + }, + { + "epoch": 0.014501216545012165, + "grad_norm": 2.6810327828724723, + "learning_rate": 4.82200647249191e-06, + "loss": 0.3754, + "step": 149 + }, + { + "epoch": 0.014598540145985401, + "grad_norm": 2.519257002697837, + "learning_rate": 4.854368932038836e-06, + "loss": 0.249, + "step": 150 + }, + { + "epoch": 0.014695863746958637, + "grad_norm": 2.8041238457488578, + "learning_rate": 4.886731391585761e-06, + "loss": 0.3117, + "step": 151 + }, + { + "epoch": 0.014793187347931874, + "grad_norm": 2.363481194731433, + "learning_rate": 4.919093851132687e-06, + "loss": 0.3325, + "step": 152 + }, + { + "epoch": 0.01489051094890511, + "grad_norm": 3.078347599868747, + "learning_rate": 4.951456310679612e-06, + "loss": 0.3569, + "step": 153 + }, + { + "epoch": 0.014987834549878346, + "grad_norm": 3.2926461094535515, + "learning_rate": 4.983818770226538e-06, + "loss": 0.716, + "step": 154 + }, + { + "epoch": 0.01508515815085158, + "grad_norm": 2.340052421830345, + "learning_rate": 5.016181229773464e-06, + "loss": 0.2642, + "step": 155 + }, + { + "epoch": 0.015182481751824817, + "grad_norm": 1.8915730140906823, + "learning_rate": 5.048543689320389e-06, + "loss": 0.3523, + "step": 156 + }, + { + "epoch": 0.015279805352798053, + "grad_norm": 4.2448533254564484, + "learning_rate": 5.080906148867314e-06, + "loss": 0.5185, + "step": 157 + }, + { + "epoch": 0.01537712895377129, + "grad_norm": 2.1172922256300333, + "learning_rate": 5.11326860841424e-06, + "loss": 0.3341, + "step": 158 + }, + { + "epoch": 0.015474452554744526, + "grad_norm": 2.7414250631657113, + "learning_rate": 5.145631067961165e-06, + "loss": 0.5965, + "step": 159 + }, + { + "epoch": 0.015571776155717762, + "grad_norm": 1.977804344185745, + "learning_rate": 5.17799352750809e-06, + "loss": 0.239, + "step": 160 + }, + { + "epoch": 0.015669099756690997, + "grad_norm": 2.771807640315191, + "learning_rate": 5.210355987055017e-06, + "loss": 0.4122, + "step": 161 + }, + { + "epoch": 0.015766423357664233, + "grad_norm": 1.9977073642008174, + "learning_rate": 5.242718446601942e-06, + "loss": 0.3423, + "step": 162 + }, + { + "epoch": 0.01586374695863747, + "grad_norm": 3.222730527079622, + "learning_rate": 5.275080906148867e-06, + "loss": 0.5647, + "step": 163 + }, + { + "epoch": 0.015961070559610706, + "grad_norm": 2.95441646694508, + "learning_rate": 5.307443365695793e-06, + "loss": 0.5198, + "step": 164 + }, + { + "epoch": 0.016058394160583942, + "grad_norm": 2.3346384576429116, + "learning_rate": 5.3398058252427185e-06, + "loss": 0.3516, + "step": 165 + }, + { + "epoch": 0.01615571776155718, + "grad_norm": 2.089159587923689, + "learning_rate": 5.372168284789644e-06, + "loss": 0.3704, + "step": 166 + }, + { + "epoch": 0.016253041362530415, + "grad_norm": 2.8135820638465088, + "learning_rate": 5.4045307443365705e-06, + "loss": 0.3729, + "step": 167 + }, + { + "epoch": 0.01635036496350365, + "grad_norm": 2.991259557993277, + "learning_rate": 5.436893203883496e-06, + "loss": 0.5622, + "step": 168 + }, + { + "epoch": 0.016447688564476887, + "grad_norm": 3.1512644455187857, + "learning_rate": 5.4692556634304216e-06, + "loss": 0.5915, + "step": 169 + }, + { + "epoch": 0.01654501216545012, + "grad_norm": 2.616126184062516, + "learning_rate": 5.501618122977347e-06, + "loss": 0.4252, + "step": 170 + }, + { + "epoch": 0.016642335766423356, + "grad_norm": 1.9958281517625203, + "learning_rate": 5.533980582524272e-06, + "loss": 0.3704, + "step": 171 + }, + { + "epoch": 0.016739659367396593, + "grad_norm": 2.470731302334384, + "learning_rate": 5.566343042071199e-06, + "loss": 0.4373, + "step": 172 + }, + { + "epoch": 0.01683698296836983, + "grad_norm": 2.583270308023139, + "learning_rate": 5.598705501618124e-06, + "loss": 0.4125, + "step": 173 + }, + { + "epoch": 0.016934306569343065, + "grad_norm": 1.9644684632241667, + "learning_rate": 5.631067961165049e-06, + "loss": 0.3522, + "step": 174 + }, + { + "epoch": 0.0170316301703163, + "grad_norm": 2.4207097357376046, + "learning_rate": 5.663430420711975e-06, + "loss": 0.3579, + "step": 175 + }, + { + "epoch": 0.017128953771289538, + "grad_norm": 2.3511041847292034, + "learning_rate": 5.6957928802589e-06, + "loss": 0.5412, + "step": 176 + }, + { + "epoch": 0.017226277372262774, + "grad_norm": 2.274427899539275, + "learning_rate": 5.728155339805825e-06, + "loss": 0.5353, + "step": 177 + }, + { + "epoch": 0.01732360097323601, + "grad_norm": 2.133749284526256, + "learning_rate": 5.760517799352752e-06, + "loss": 0.4392, + "step": 178 + }, + { + "epoch": 0.017420924574209247, + "grad_norm": 2.3097462109285787, + "learning_rate": 5.792880258899677e-06, + "loss": 0.4442, + "step": 179 + }, + { + "epoch": 0.017518248175182483, + "grad_norm": 2.2128802818602056, + "learning_rate": 5.825242718446602e-06, + "loss": 0.5635, + "step": 180 + }, + { + "epoch": 0.017615571776155716, + "grad_norm": 2.103405792854256, + "learning_rate": 5.857605177993528e-06, + "loss": 0.4533, + "step": 181 + }, + { + "epoch": 0.017712895377128952, + "grad_norm": 2.0565661990183597, + "learning_rate": 5.889967637540453e-06, + "loss": 0.3806, + "step": 182 + }, + { + "epoch": 0.01781021897810219, + "grad_norm": 2.179649872267064, + "learning_rate": 5.9223300970873785e-06, + "loss": 0.3842, + "step": 183 + }, + { + "epoch": 0.017907542579075425, + "grad_norm": 3.8333244047199146, + "learning_rate": 5.954692556634305e-06, + "loss": 0.3876, + "step": 184 + }, + { + "epoch": 0.01800486618004866, + "grad_norm": 2.2893517217095716, + "learning_rate": 5.9870550161812304e-06, + "loss": 0.4781, + "step": 185 + }, + { + "epoch": 0.018102189781021898, + "grad_norm": 1.6022498167897639, + "learning_rate": 6.0194174757281556e-06, + "loss": 0.2306, + "step": 186 + }, + { + "epoch": 0.018199513381995134, + "grad_norm": 2.32863493589546, + "learning_rate": 6.0517799352750815e-06, + "loss": 0.5139, + "step": 187 + }, + { + "epoch": 0.01829683698296837, + "grad_norm": 2.0789478938631314, + "learning_rate": 6.084142394822007e-06, + "loss": 0.2824, + "step": 188 + }, + { + "epoch": 0.018394160583941607, + "grad_norm": 1.7544615955949223, + "learning_rate": 6.116504854368932e-06, + "loss": 0.4172, + "step": 189 + }, + { + "epoch": 0.018491484184914843, + "grad_norm": 1.931043696572374, + "learning_rate": 6.148867313915859e-06, + "loss": 0.3584, + "step": 190 + }, + { + "epoch": 0.01858880778588808, + "grad_norm": 2.467258437370788, + "learning_rate": 6.181229773462784e-06, + "loss": 0.462, + "step": 191 + }, + { + "epoch": 0.018686131386861315, + "grad_norm": 2.1541091684996965, + "learning_rate": 6.213592233009709e-06, + "loss": 0.3967, + "step": 192 + }, + { + "epoch": 0.01878345498783455, + "grad_norm": 2.2330486922808395, + "learning_rate": 6.245954692556635e-06, + "loss": 0.5316, + "step": 193 + }, + { + "epoch": 0.018880778588807785, + "grad_norm": 2.3498262097642395, + "learning_rate": 6.27831715210356e-06, + "loss": 0.4815, + "step": 194 + }, + { + "epoch": 0.01897810218978102, + "grad_norm": 1.7045092076002246, + "learning_rate": 6.310679611650487e-06, + "loss": 0.3, + "step": 195 + }, + { + "epoch": 0.019075425790754257, + "grad_norm": 2.5703331850837023, + "learning_rate": 6.343042071197412e-06, + "loss": 0.4143, + "step": 196 + }, + { + "epoch": 0.019172749391727494, + "grad_norm": 2.6940646171495133, + "learning_rate": 6.375404530744337e-06, + "loss": 0.5463, + "step": 197 + }, + { + "epoch": 0.01927007299270073, + "grad_norm": 2.4185580273524847, + "learning_rate": 6.407766990291263e-06, + "loss": 0.5215, + "step": 198 + }, + { + "epoch": 0.019367396593673966, + "grad_norm": 2.6509824694985946, + "learning_rate": 6.440129449838188e-06, + "loss": 0.5286, + "step": 199 + }, + { + "epoch": 0.019464720194647202, + "grad_norm": 2.4807219128312767, + "learning_rate": 6.472491909385113e-06, + "loss": 0.3996, + "step": 200 + }, + { + "epoch": 0.01956204379562044, + "grad_norm": 2.651883834043772, + "learning_rate": 6.50485436893204e-06, + "loss": 0.3499, + "step": 201 + }, + { + "epoch": 0.019659367396593675, + "grad_norm": 2.670759179984812, + "learning_rate": 6.537216828478965e-06, + "loss": 0.552, + "step": 202 + }, + { + "epoch": 0.01975669099756691, + "grad_norm": 2.51305850829245, + "learning_rate": 6.56957928802589e-06, + "loss": 0.3806, + "step": 203 + }, + { + "epoch": 0.019854014598540148, + "grad_norm": 2.435954851305265, + "learning_rate": 6.601941747572816e-06, + "loss": 0.6093, + "step": 204 + }, + { + "epoch": 0.01995133819951338, + "grad_norm": 2.091315833022872, + "learning_rate": 6.6343042071197415e-06, + "loss": 0.3573, + "step": 205 + }, + { + "epoch": 0.020048661800486617, + "grad_norm": 2.205515437184344, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2892, + "step": 206 + }, + { + "epoch": 0.020145985401459853, + "grad_norm": 2.314981932930035, + "learning_rate": 6.6990291262135935e-06, + "loss": 0.4184, + "step": 207 + }, + { + "epoch": 0.02024330900243309, + "grad_norm": 1.9102474885146974, + "learning_rate": 6.731391585760519e-06, + "loss": 0.2287, + "step": 208 + }, + { + "epoch": 0.020340632603406326, + "grad_norm": 1.9408029275065433, + "learning_rate": 6.763754045307444e-06, + "loss": 0.3958, + "step": 209 + }, + { + "epoch": 0.020437956204379562, + "grad_norm": 2.1006467731485823, + "learning_rate": 6.79611650485437e-06, + "loss": 0.3764, + "step": 210 + }, + { + "epoch": 0.0205352798053528, + "grad_norm": 2.0927447282795146, + "learning_rate": 6.828478964401295e-06, + "loss": 0.531, + "step": 211 + }, + { + "epoch": 0.020632603406326035, + "grad_norm": 3.4830081465453633, + "learning_rate": 6.86084142394822e-06, + "loss": 0.4887, + "step": 212 + }, + { + "epoch": 0.02072992700729927, + "grad_norm": 2.253360993066953, + "learning_rate": 6.893203883495147e-06, + "loss": 0.4587, + "step": 213 + }, + { + "epoch": 0.020827250608272507, + "grad_norm": 3.3751096354443852, + "learning_rate": 6.925566343042072e-06, + "loss": 0.3427, + "step": 214 + }, + { + "epoch": 0.020924574209245744, + "grad_norm": 1.9729713112803993, + "learning_rate": 6.957928802588997e-06, + "loss": 0.384, + "step": 215 + }, + { + "epoch": 0.021021897810218976, + "grad_norm": 2.761285167796522, + "learning_rate": 6.990291262135923e-06, + "loss": 0.3512, + "step": 216 + }, + { + "epoch": 0.021119221411192213, + "grad_norm": 2.431882400442612, + "learning_rate": 7.022653721682848e-06, + "loss": 0.3971, + "step": 217 + }, + { + "epoch": 0.02121654501216545, + "grad_norm": 3.659254877088116, + "learning_rate": 7.055016181229773e-06, + "loss": 0.4115, + "step": 218 + }, + { + "epoch": 0.021313868613138685, + "grad_norm": 2.5501534359714655, + "learning_rate": 7.0873786407767e-06, + "loss": 0.4963, + "step": 219 + }, + { + "epoch": 0.02141119221411192, + "grad_norm": 4.296894309260591, + "learning_rate": 7.119741100323625e-06, + "loss": 0.5203, + "step": 220 + }, + { + "epoch": 0.021508515815085158, + "grad_norm": 2.5489854552137237, + "learning_rate": 7.152103559870551e-06, + "loss": 0.4343, + "step": 221 + }, + { + "epoch": 0.021605839416058394, + "grad_norm": 2.00955207958064, + "learning_rate": 7.184466019417476e-06, + "loss": 0.3603, + "step": 222 + }, + { + "epoch": 0.02170316301703163, + "grad_norm": 2.2675038932590224, + "learning_rate": 7.2168284789644015e-06, + "loss": 0.3968, + "step": 223 + }, + { + "epoch": 0.021800486618004867, + "grad_norm": 2.4690586331753277, + "learning_rate": 7.249190938511328e-06, + "loss": 0.5883, + "step": 224 + }, + { + "epoch": 0.021897810218978103, + "grad_norm": 2.141328682063472, + "learning_rate": 7.2815533980582534e-06, + "loss": 0.3547, + "step": 225 + }, + { + "epoch": 0.02199513381995134, + "grad_norm": 2.223927434368622, + "learning_rate": 7.3139158576051786e-06, + "loss": 0.5031, + "step": 226 + }, + { + "epoch": 0.022092457420924576, + "grad_norm": 2.8602320319532346, + "learning_rate": 7.3462783171521046e-06, + "loss": 0.4226, + "step": 227 + }, + { + "epoch": 0.02218978102189781, + "grad_norm": 2.8852449405031835, + "learning_rate": 7.37864077669903e-06, + "loss": 0.4298, + "step": 228 + }, + { + "epoch": 0.022287104622871045, + "grad_norm": 1.7370522944561966, + "learning_rate": 7.411003236245955e-06, + "loss": 0.3827, + "step": 229 + }, + { + "epoch": 0.02238442822384428, + "grad_norm": 2.3907908463140584, + "learning_rate": 7.443365695792882e-06, + "loss": 0.4139, + "step": 230 + }, + { + "epoch": 0.022481751824817518, + "grad_norm": 2.27581306432663, + "learning_rate": 7.475728155339807e-06, + "loss": 0.4736, + "step": 231 + }, + { + "epoch": 0.022579075425790754, + "grad_norm": 2.1861094823645675, + "learning_rate": 7.508090614886732e-06, + "loss": 0.4809, + "step": 232 + }, + { + "epoch": 0.02267639902676399, + "grad_norm": 1.9626208371421419, + "learning_rate": 7.540453074433658e-06, + "loss": 0.3436, + "step": 233 + }, + { + "epoch": 0.022773722627737226, + "grad_norm": 1.7092390993202267, + "learning_rate": 7.572815533980583e-06, + "loss": 0.3224, + "step": 234 + }, + { + "epoch": 0.022871046228710463, + "grad_norm": 3.0168693228526546, + "learning_rate": 7.605177993527508e-06, + "loss": 0.6366, + "step": 235 + }, + { + "epoch": 0.0229683698296837, + "grad_norm": 2.424919921496664, + "learning_rate": 7.637540453074434e-06, + "loss": 0.4483, + "step": 236 + }, + { + "epoch": 0.023065693430656935, + "grad_norm": 2.4586833984787626, + "learning_rate": 7.66990291262136e-06, + "loss": 0.4031, + "step": 237 + }, + { + "epoch": 0.02316301703163017, + "grad_norm": 2.092010230715883, + "learning_rate": 7.702265372168284e-06, + "loss": 0.4257, + "step": 238 + }, + { + "epoch": 0.023260340632603408, + "grad_norm": 2.3360188447701655, + "learning_rate": 7.734627831715211e-06, + "loss": 0.4684, + "step": 239 + }, + { + "epoch": 0.02335766423357664, + "grad_norm": 2.087175894606599, + "learning_rate": 7.766990291262136e-06, + "loss": 0.4272, + "step": 240 + }, + { + "epoch": 0.023454987834549877, + "grad_norm": 2.598684557686617, + "learning_rate": 7.799352750809061e-06, + "loss": 0.5401, + "step": 241 + }, + { + "epoch": 0.023552311435523113, + "grad_norm": 2.025117037181364, + "learning_rate": 7.831715210355988e-06, + "loss": 0.372, + "step": 242 + }, + { + "epoch": 0.02364963503649635, + "grad_norm": 2.2467324584398405, + "learning_rate": 7.864077669902913e-06, + "loss": 0.5891, + "step": 243 + }, + { + "epoch": 0.023746958637469586, + "grad_norm": 2.38036373195977, + "learning_rate": 7.896440129449839e-06, + "loss": 0.5133, + "step": 244 + }, + { + "epoch": 0.023844282238442822, + "grad_norm": 2.052700924442009, + "learning_rate": 7.928802588996765e-06, + "loss": 0.5161, + "step": 245 + }, + { + "epoch": 0.02394160583941606, + "grad_norm": 3.4299018810240254, + "learning_rate": 7.96116504854369e-06, + "loss": 0.5314, + "step": 246 + }, + { + "epoch": 0.024038929440389295, + "grad_norm": 1.3903956706369247, + "learning_rate": 7.993527508090616e-06, + "loss": 0.3539, + "step": 247 + }, + { + "epoch": 0.02413625304136253, + "grad_norm": 2.4599878810180873, + "learning_rate": 8.02588996763754e-06, + "loss": 0.4876, + "step": 248 + }, + { + "epoch": 0.024233576642335768, + "grad_norm": 2.4053308291912083, + "learning_rate": 8.058252427184466e-06, + "loss": 0.5185, + "step": 249 + }, + { + "epoch": 0.024330900243309004, + "grad_norm": 1.6624263546342495, + "learning_rate": 8.090614886731393e-06, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.024428223844282237, + "grad_norm": 2.4091367373679597, + "learning_rate": 8.122977346278318e-06, + "loss": 0.6192, + "step": 251 + }, + { + "epoch": 0.024525547445255473, + "grad_norm": 2.4595313520548427, + "learning_rate": 8.155339805825243e-06, + "loss": 0.3444, + "step": 252 + }, + { + "epoch": 0.02462287104622871, + "grad_norm": 2.3200411140153174, + "learning_rate": 8.18770226537217e-06, + "loss": 0.6112, + "step": 253 + }, + { + "epoch": 0.024720194647201946, + "grad_norm": 2.029624875741936, + "learning_rate": 8.220064724919095e-06, + "loss": 0.4524, + "step": 254 + }, + { + "epoch": 0.024817518248175182, + "grad_norm": 1.8862765408033388, + "learning_rate": 8.25242718446602e-06, + "loss": 0.2173, + "step": 255 + }, + { + "epoch": 0.02491484184914842, + "grad_norm": 2.575687620331568, + "learning_rate": 8.284789644012947e-06, + "loss": 0.4599, + "step": 256 + }, + { + "epoch": 0.025012165450121655, + "grad_norm": 2.373530485379713, + "learning_rate": 8.317152103559872e-06, + "loss": 0.5326, + "step": 257 + }, + { + "epoch": 0.02510948905109489, + "grad_norm": 2.4086353319447262, + "learning_rate": 8.349514563106797e-06, + "loss": 0.6275, + "step": 258 + }, + { + "epoch": 0.025206812652068127, + "grad_norm": 2.1075725625285697, + "learning_rate": 8.381877022653722e-06, + "loss": 0.44, + "step": 259 + }, + { + "epoch": 0.025304136253041364, + "grad_norm": 2.0285700798989614, + "learning_rate": 8.414239482200647e-06, + "loss": 0.3489, + "step": 260 + }, + { + "epoch": 0.0254014598540146, + "grad_norm": 2.5592973746241, + "learning_rate": 8.446601941747573e-06, + "loss": 0.4403, + "step": 261 + }, + { + "epoch": 0.025498783454987836, + "grad_norm": 2.470930078509074, + "learning_rate": 8.4789644012945e-06, + "loss": 0.4985, + "step": 262 + }, + { + "epoch": 0.02559610705596107, + "grad_norm": 2.099638103909556, + "learning_rate": 8.511326860841424e-06, + "loss": 0.4194, + "step": 263 + }, + { + "epoch": 0.025693430656934305, + "grad_norm": 1.6030834140551835, + "learning_rate": 8.54368932038835e-06, + "loss": 0.3382, + "step": 264 + }, + { + "epoch": 0.02579075425790754, + "grad_norm": 1.8960928547169034, + "learning_rate": 8.576051779935276e-06, + "loss": 0.2838, + "step": 265 + }, + { + "epoch": 0.025888077858880778, + "grad_norm": 2.4306930963261966, + "learning_rate": 8.608414239482202e-06, + "loss": 0.4956, + "step": 266 + }, + { + "epoch": 0.025985401459854014, + "grad_norm": 2.374430136325354, + "learning_rate": 8.640776699029127e-06, + "loss": 0.5083, + "step": 267 + }, + { + "epoch": 0.02608272506082725, + "grad_norm": 2.410095115145934, + "learning_rate": 8.673139158576054e-06, + "loss": 0.4247, + "step": 268 + }, + { + "epoch": 0.026180048661800487, + "grad_norm": 2.41271065696519, + "learning_rate": 8.705501618122979e-06, + "loss": 0.6946, + "step": 269 + }, + { + "epoch": 0.026277372262773723, + "grad_norm": 1.752688930628829, + "learning_rate": 8.737864077669904e-06, + "loss": 0.2662, + "step": 270 + }, + { + "epoch": 0.02637469586374696, + "grad_norm": 1.9842034213162434, + "learning_rate": 8.770226537216829e-06, + "loss": 0.3611, + "step": 271 + }, + { + "epoch": 0.026472019464720196, + "grad_norm": 2.4137979998327497, + "learning_rate": 8.802588996763754e-06, + "loss": 0.501, + "step": 272 + }, + { + "epoch": 0.026569343065693432, + "grad_norm": 2.929650064864996, + "learning_rate": 8.834951456310681e-06, + "loss": 0.6153, + "step": 273 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 2.281738020025263, + "learning_rate": 8.867313915857606e-06, + "loss": 0.5395, + "step": 274 + }, + { + "epoch": 0.0267639902676399, + "grad_norm": 2.1406726692627975, + "learning_rate": 8.899676375404531e-06, + "loss": 0.4039, + "step": 275 + }, + { + "epoch": 0.026861313868613138, + "grad_norm": 3.2366954201371523, + "learning_rate": 8.932038834951458e-06, + "loss": 0.5414, + "step": 276 + }, + { + "epoch": 0.026958637469586374, + "grad_norm": 2.1900667662872513, + "learning_rate": 8.964401294498383e-06, + "loss": 0.3815, + "step": 277 + }, + { + "epoch": 0.02705596107055961, + "grad_norm": 2.5301939091612216, + "learning_rate": 8.996763754045308e-06, + "loss": 0.8016, + "step": 278 + }, + { + "epoch": 0.027153284671532846, + "grad_norm": 2.2552758985680907, + "learning_rate": 9.029126213592233e-06, + "loss": 0.4133, + "step": 279 + }, + { + "epoch": 0.027250608272506083, + "grad_norm": 2.309545536997134, + "learning_rate": 9.06148867313916e-06, + "loss": 0.5346, + "step": 280 + }, + { + "epoch": 0.02734793187347932, + "grad_norm": 2.585578916644781, + "learning_rate": 9.093851132686085e-06, + "loss": 0.407, + "step": 281 + }, + { + "epoch": 0.027445255474452555, + "grad_norm": 1.8503464194025006, + "learning_rate": 9.12621359223301e-06, + "loss": 0.4674, + "step": 282 + }, + { + "epoch": 0.02754257907542579, + "grad_norm": 2.431490115980846, + "learning_rate": 9.158576051779936e-06, + "loss": 0.6026, + "step": 283 + }, + { + "epoch": 0.027639902676399028, + "grad_norm": 1.916233248702735, + "learning_rate": 9.19093851132686e-06, + "loss": 0.4949, + "step": 284 + }, + { + "epoch": 0.027737226277372264, + "grad_norm": 2.2160236640245072, + "learning_rate": 9.223300970873788e-06, + "loss": 0.4765, + "step": 285 + }, + { + "epoch": 0.027834549878345497, + "grad_norm": 2.0764827118780143, + "learning_rate": 9.255663430420713e-06, + "loss": 0.472, + "step": 286 + }, + { + "epoch": 0.027931873479318733, + "grad_norm": 2.638286661284288, + "learning_rate": 9.288025889967638e-06, + "loss": 0.6312, + "step": 287 + }, + { + "epoch": 0.02802919708029197, + "grad_norm": 1.940011273577467, + "learning_rate": 9.320388349514565e-06, + "loss": 0.4555, + "step": 288 + }, + { + "epoch": 0.028126520681265206, + "grad_norm": 1.8760624736314784, + "learning_rate": 9.35275080906149e-06, + "loss": 0.3625, + "step": 289 + }, + { + "epoch": 0.028223844282238442, + "grad_norm": 1.3468692859077058, + "learning_rate": 9.385113268608415e-06, + "loss": 0.2442, + "step": 290 + }, + { + "epoch": 0.02832116788321168, + "grad_norm": 2.1497394847504014, + "learning_rate": 9.41747572815534e-06, + "loss": 0.5227, + "step": 291 + }, + { + "epoch": 0.028418491484184915, + "grad_norm": 2.1233743171190014, + "learning_rate": 9.449838187702267e-06, + "loss": 0.6184, + "step": 292 + }, + { + "epoch": 0.02851581508515815, + "grad_norm": 2.337806183860394, + "learning_rate": 9.482200647249192e-06, + "loss": 0.5491, + "step": 293 + }, + { + "epoch": 0.028613138686131388, + "grad_norm": 2.015000594070385, + "learning_rate": 9.514563106796117e-06, + "loss": 0.5137, + "step": 294 + }, + { + "epoch": 0.028710462287104624, + "grad_norm": 2.0267324830753766, + "learning_rate": 9.546925566343042e-06, + "loss": 0.4117, + "step": 295 + }, + { + "epoch": 0.02880778588807786, + "grad_norm": 1.732639028192012, + "learning_rate": 9.579288025889967e-06, + "loss": 0.3156, + "step": 296 + }, + { + "epoch": 0.028905109489051097, + "grad_norm": 2.1204056159243923, + "learning_rate": 9.611650485436894e-06, + "loss": 0.6056, + "step": 297 + }, + { + "epoch": 0.02900243309002433, + "grad_norm": 1.7868071753968195, + "learning_rate": 9.64401294498382e-06, + "loss": 0.3417, + "step": 298 + }, + { + "epoch": 0.029099756690997566, + "grad_norm": 1.9477439300595292, + "learning_rate": 9.676375404530746e-06, + "loss": 0.3631, + "step": 299 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 1.7688147839655162, + "learning_rate": 9.708737864077671e-06, + "loss": 0.3605, + "step": 300 + }, + { + "epoch": 0.029294403892944038, + "grad_norm": 1.9162335597538034, + "learning_rate": 9.741100323624596e-06, + "loss": 0.2498, + "step": 301 + }, + { + "epoch": 0.029391727493917275, + "grad_norm": 2.9282579520055756, + "learning_rate": 9.773462783171522e-06, + "loss": 0.4286, + "step": 302 + }, + { + "epoch": 0.02948905109489051, + "grad_norm": 1.9744499285549086, + "learning_rate": 9.805825242718447e-06, + "loss": 0.3391, + "step": 303 + }, + { + "epoch": 0.029586374695863747, + "grad_norm": 2.2116032868392455, + "learning_rate": 9.838187702265373e-06, + "loss": 0.3414, + "step": 304 + }, + { + "epoch": 0.029683698296836983, + "grad_norm": 1.9159144570242486, + "learning_rate": 9.870550161812299e-06, + "loss": 0.2915, + "step": 305 + }, + { + "epoch": 0.02978102189781022, + "grad_norm": 2.671718838238437, + "learning_rate": 9.902912621359224e-06, + "loss": 0.79, + "step": 306 + }, + { + "epoch": 0.029878345498783456, + "grad_norm": 2.093937424199301, + "learning_rate": 9.935275080906149e-06, + "loss": 0.576, + "step": 307 + }, + { + "epoch": 0.029975669099756692, + "grad_norm": 1.895574286512308, + "learning_rate": 9.967637540453076e-06, + "loss": 0.4223, + "step": 308 + }, + { + "epoch": 0.03007299270072993, + "grad_norm": 2.142643554578675, + "learning_rate": 1e-05, + "loss": 0.4719, + "step": 309 + }, + { + "epoch": 0.03017031630170316, + "grad_norm": 2.2548613483238378, + "learning_rate": 9.999999751573464e-06, + "loss": 0.5547, + "step": 310 + }, + { + "epoch": 0.030267639902676398, + "grad_norm": 2.375146158639999, + "learning_rate": 9.99999900629388e-06, + "loss": 0.3864, + "step": 311 + }, + { + "epoch": 0.030364963503649634, + "grad_norm": 1.558937895217452, + "learning_rate": 9.99999776416132e-06, + "loss": 0.3409, + "step": 312 + }, + { + "epoch": 0.03046228710462287, + "grad_norm": 2.7508940543848115, + "learning_rate": 9.99999602517591e-06, + "loss": 0.3641, + "step": 313 + }, + { + "epoch": 0.030559610705596107, + "grad_norm": 2.228096737889712, + "learning_rate": 9.99999378933782e-06, + "loss": 0.6464, + "step": 314 + }, + { + "epoch": 0.030656934306569343, + "grad_norm": 1.5612763763472, + "learning_rate": 9.999991056647274e-06, + "loss": 0.3124, + "step": 315 + }, + { + "epoch": 0.03075425790754258, + "grad_norm": 2.3203527787434104, + "learning_rate": 9.999987827104544e-06, + "loss": 0.5893, + "step": 316 + }, + { + "epoch": 0.030851581508515816, + "grad_norm": 1.8472611567410342, + "learning_rate": 9.999984100709951e-06, + "loss": 0.3732, + "step": 317 + }, + { + "epoch": 0.030948905109489052, + "grad_norm": 2.269778108549014, + "learning_rate": 9.999979877463866e-06, + "loss": 0.5537, + "step": 318 + }, + { + "epoch": 0.03104622871046229, + "grad_norm": 2.381498581134022, + "learning_rate": 9.999975157366705e-06, + "loss": 0.7179, + "step": 319 + }, + { + "epoch": 0.031143552311435525, + "grad_norm": 1.7030655036823346, + "learning_rate": 9.99996994041894e-06, + "loss": 0.4256, + "step": 320 + }, + { + "epoch": 0.031240875912408757, + "grad_norm": 1.8361141038730153, + "learning_rate": 9.999964226621089e-06, + "loss": 0.4648, + "step": 321 + }, + { + "epoch": 0.031338199513381994, + "grad_norm": 1.7985459229558753, + "learning_rate": 9.99995801597372e-06, + "loss": 0.3031, + "step": 322 + }, + { + "epoch": 0.031435523114355234, + "grad_norm": 2.4309020119442915, + "learning_rate": 9.99995130847745e-06, + "loss": 0.5011, + "step": 323 + }, + { + "epoch": 0.031532846715328466, + "grad_norm": 2.048514022969095, + "learning_rate": 9.999944104132944e-06, + "loss": 0.6152, + "step": 324 + }, + { + "epoch": 0.031630170316301706, + "grad_norm": 1.8892667320795724, + "learning_rate": 9.99993640294092e-06, + "loss": 0.4738, + "step": 325 + }, + { + "epoch": 0.03172749391727494, + "grad_norm": 2.081179331819785, + "learning_rate": 9.999928204902141e-06, + "loss": 0.5192, + "step": 326 + }, + { + "epoch": 0.03182481751824817, + "grad_norm": 2.410280889073595, + "learning_rate": 9.999919510017424e-06, + "loss": 0.3314, + "step": 327 + }, + { + "epoch": 0.03192214111922141, + "grad_norm": 1.663034255724975, + "learning_rate": 9.999910318287632e-06, + "loss": 0.3342, + "step": 328 + }, + { + "epoch": 0.032019464720194644, + "grad_norm": 1.7874391345352068, + "learning_rate": 9.999900629713679e-06, + "loss": 0.3189, + "step": 329 + }, + { + "epoch": 0.032116788321167884, + "grad_norm": 2.1098429973097805, + "learning_rate": 9.999890444296528e-06, + "loss": 0.4561, + "step": 330 + }, + { + "epoch": 0.03221411192214112, + "grad_norm": 2.4678279265353558, + "learning_rate": 9.999879762037187e-06, + "loss": 0.5831, + "step": 331 + }, + { + "epoch": 0.03231143552311436, + "grad_norm": 1.6643716587630457, + "learning_rate": 9.999868582936726e-06, + "loss": 0.4371, + "step": 332 + }, + { + "epoch": 0.03240875912408759, + "grad_norm": 2.088466639768523, + "learning_rate": 9.999856906996246e-06, + "loss": 0.3904, + "step": 333 + }, + { + "epoch": 0.03250608272506083, + "grad_norm": 2.0023651443392256, + "learning_rate": 9.999844734216914e-06, + "loss": 0.4802, + "step": 334 + }, + { + "epoch": 0.03260340632603406, + "grad_norm": 2.161282844007076, + "learning_rate": 9.99983206459994e-06, + "loss": 0.5187, + "step": 335 + }, + { + "epoch": 0.0327007299270073, + "grad_norm": 2.10212671583593, + "learning_rate": 9.999818898146576e-06, + "loss": 0.4618, + "step": 336 + }, + { + "epoch": 0.032798053527980535, + "grad_norm": 2.2142899508809286, + "learning_rate": 9.999805234858137e-06, + "loss": 0.2387, + "step": 337 + }, + { + "epoch": 0.032895377128953775, + "grad_norm": 2.1084763484693023, + "learning_rate": 9.999791074735981e-06, + "loss": 0.5652, + "step": 338 + }, + { + "epoch": 0.03299270072992701, + "grad_norm": 2.261838498017328, + "learning_rate": 9.99977641778151e-06, + "loss": 0.7224, + "step": 339 + }, + { + "epoch": 0.03309002433090024, + "grad_norm": 1.612816030006559, + "learning_rate": 9.999761263996184e-06, + "loss": 0.377, + "step": 340 + }, + { + "epoch": 0.03318734793187348, + "grad_norm": 2.1209830295615832, + "learning_rate": 9.999745613381507e-06, + "loss": 0.614, + "step": 341 + }, + { + "epoch": 0.03328467153284671, + "grad_norm": 1.7938764015879674, + "learning_rate": 9.999729465939036e-06, + "loss": 0.3983, + "step": 342 + }, + { + "epoch": 0.03338199513381995, + "grad_norm": 1.943418875698731, + "learning_rate": 9.999712821670375e-06, + "loss": 0.4708, + "step": 343 + }, + { + "epoch": 0.033479318734793186, + "grad_norm": 1.9787546900237571, + "learning_rate": 9.99969568057718e-06, + "loss": 0.578, + "step": 344 + }, + { + "epoch": 0.033576642335766425, + "grad_norm": 1.4798263726328331, + "learning_rate": 9.99967804266115e-06, + "loss": 0.394, + "step": 345 + }, + { + "epoch": 0.03367396593673966, + "grad_norm": 2.1936298043995484, + "learning_rate": 9.99965990792404e-06, + "loss": 0.6316, + "step": 346 + }, + { + "epoch": 0.0337712895377129, + "grad_norm": 2.2799650780195133, + "learning_rate": 9.99964127636765e-06, + "loss": 0.3985, + "step": 347 + }, + { + "epoch": 0.03386861313868613, + "grad_norm": 1.8519049219191819, + "learning_rate": 9.999622147993837e-06, + "loss": 0.3853, + "step": 348 + }, + { + "epoch": 0.03396593673965937, + "grad_norm": 1.5111895282974241, + "learning_rate": 9.999602522804497e-06, + "loss": 0.4201, + "step": 349 + }, + { + "epoch": 0.0340632603406326, + "grad_norm": 1.8605769784283237, + "learning_rate": 9.99958240080158e-06, + "loss": 0.5225, + "step": 350 + }, + { + "epoch": 0.034160583941605836, + "grad_norm": 1.6063240538866903, + "learning_rate": 9.999561781987087e-06, + "loss": 0.3165, + "step": 351 + }, + { + "epoch": 0.034257907542579076, + "grad_norm": 1.4751976204077173, + "learning_rate": 9.999540666363068e-06, + "loss": 0.3156, + "step": 352 + }, + { + "epoch": 0.03435523114355231, + "grad_norm": 2.1029966771511757, + "learning_rate": 9.99951905393162e-06, + "loss": 0.5336, + "step": 353 + }, + { + "epoch": 0.03445255474452555, + "grad_norm": 2.1419054642874267, + "learning_rate": 9.99949694469489e-06, + "loss": 0.5253, + "step": 354 + }, + { + "epoch": 0.03454987834549878, + "grad_norm": 2.169397271826959, + "learning_rate": 9.999474338655075e-06, + "loss": 0.5567, + "step": 355 + }, + { + "epoch": 0.03464720194647202, + "grad_norm": 2.2972412855327797, + "learning_rate": 9.999451235814422e-06, + "loss": 0.5233, + "step": 356 + }, + { + "epoch": 0.034744525547445254, + "grad_norm": 1.830377999961128, + "learning_rate": 9.999427636175228e-06, + "loss": 0.4297, + "step": 357 + }, + { + "epoch": 0.034841849148418494, + "grad_norm": 2.1217123292302875, + "learning_rate": 9.999403539739837e-06, + "loss": 0.3605, + "step": 358 + }, + { + "epoch": 0.03493917274939173, + "grad_norm": 2.001599625802253, + "learning_rate": 9.999378946510642e-06, + "loss": 0.5237, + "step": 359 + }, + { + "epoch": 0.035036496350364967, + "grad_norm": 1.6719956399048532, + "learning_rate": 9.99935385649009e-06, + "loss": 0.424, + "step": 360 + }, + { + "epoch": 0.0351338199513382, + "grad_norm": 1.5962062682133515, + "learning_rate": 9.99932826968067e-06, + "loss": 0.4228, + "step": 361 + }, + { + "epoch": 0.03523114355231143, + "grad_norm": 1.9754274750693919, + "learning_rate": 9.999302186084929e-06, + "loss": 0.4333, + "step": 362 + }, + { + "epoch": 0.03532846715328467, + "grad_norm": 1.8248617929879183, + "learning_rate": 9.999275605705457e-06, + "loss": 0.4985, + "step": 363 + }, + { + "epoch": 0.035425790754257905, + "grad_norm": 2.5923075514224982, + "learning_rate": 9.999248528544895e-06, + "loss": 0.4829, + "step": 364 + }, + { + "epoch": 0.035523114355231145, + "grad_norm": 1.9900801938638135, + "learning_rate": 9.999220954605932e-06, + "loss": 0.587, + "step": 365 + }, + { + "epoch": 0.03562043795620438, + "grad_norm": 1.731558772897005, + "learning_rate": 9.999192883891314e-06, + "loss": 0.3299, + "step": 366 + }, + { + "epoch": 0.03571776155717762, + "grad_norm": 2.339577788711278, + "learning_rate": 9.999164316403823e-06, + "loss": 0.4845, + "step": 367 + }, + { + "epoch": 0.03581508515815085, + "grad_norm": 1.9784113864985955, + "learning_rate": 9.999135252146302e-06, + "loss": 0.5776, + "step": 368 + }, + { + "epoch": 0.03591240875912409, + "grad_norm": 1.5555461256937277, + "learning_rate": 9.999105691121638e-06, + "loss": 0.3563, + "step": 369 + }, + { + "epoch": 0.03600973236009732, + "grad_norm": 1.7905677559908044, + "learning_rate": 9.99907563333277e-06, + "loss": 0.546, + "step": 370 + }, + { + "epoch": 0.03610705596107056, + "grad_norm": 2.0490894714600287, + "learning_rate": 9.999045078782684e-06, + "loss": 0.6836, + "step": 371 + }, + { + "epoch": 0.036204379562043795, + "grad_norm": 2.216601446334751, + "learning_rate": 9.999014027474413e-06, + "loss": 0.5237, + "step": 372 + }, + { + "epoch": 0.036301703163017035, + "grad_norm": 1.5937926342815392, + "learning_rate": 9.998982479411047e-06, + "loss": 0.3539, + "step": 373 + }, + { + "epoch": 0.03639902676399027, + "grad_norm": 2.3941848280266864, + "learning_rate": 9.99895043459572e-06, + "loss": 0.6249, + "step": 374 + }, + { + "epoch": 0.0364963503649635, + "grad_norm": 2.072859669066288, + "learning_rate": 9.998917893031615e-06, + "loss": 0.5415, + "step": 375 + }, + { + "epoch": 0.03659367396593674, + "grad_norm": 1.670908711065728, + "learning_rate": 9.998884854721968e-06, + "loss": 0.3034, + "step": 376 + }, + { + "epoch": 0.03669099756690997, + "grad_norm": 1.9880303784818283, + "learning_rate": 9.998851319670057e-06, + "loss": 0.5025, + "step": 377 + }, + { + "epoch": 0.03678832116788321, + "grad_norm": 1.3517666701087396, + "learning_rate": 9.99881728787922e-06, + "loss": 0.2775, + "step": 378 + }, + { + "epoch": 0.036885644768856446, + "grad_norm": 1.8952553535268069, + "learning_rate": 9.998782759352839e-06, + "loss": 0.5306, + "step": 379 + }, + { + "epoch": 0.036982968369829686, + "grad_norm": 1.8730537486024816, + "learning_rate": 9.998747734094338e-06, + "loss": 0.386, + "step": 380 + }, + { + "epoch": 0.03708029197080292, + "grad_norm": 2.058996056292158, + "learning_rate": 9.998712212107205e-06, + "loss": 0.5641, + "step": 381 + }, + { + "epoch": 0.03717761557177616, + "grad_norm": 1.9837834234853275, + "learning_rate": 9.998676193394966e-06, + "loss": 0.2628, + "step": 382 + }, + { + "epoch": 0.03727493917274939, + "grad_norm": 2.189700953999047, + "learning_rate": 9.998639677961203e-06, + "loss": 0.6024, + "step": 383 + }, + { + "epoch": 0.03737226277372263, + "grad_norm": 2.060696593716547, + "learning_rate": 9.99860266580954e-06, + "loss": 0.5377, + "step": 384 + }, + { + "epoch": 0.037469586374695864, + "grad_norm": 2.0831966609629227, + "learning_rate": 9.99856515694366e-06, + "loss": 0.5063, + "step": 385 + }, + { + "epoch": 0.0375669099756691, + "grad_norm": 2.2950496556846227, + "learning_rate": 9.998527151367288e-06, + "loss": 0.6484, + "step": 386 + }, + { + "epoch": 0.037664233576642336, + "grad_norm": 2.2597922123273873, + "learning_rate": 9.9984886490842e-06, + "loss": 0.6617, + "step": 387 + }, + { + "epoch": 0.03776155717761557, + "grad_norm": 2.071575887731456, + "learning_rate": 9.99844965009822e-06, + "loss": 0.5405, + "step": 388 + }, + { + "epoch": 0.03785888077858881, + "grad_norm": 2.004249587957457, + "learning_rate": 9.99841015441323e-06, + "loss": 0.4306, + "step": 389 + }, + { + "epoch": 0.03795620437956204, + "grad_norm": 1.9297023880727862, + "learning_rate": 9.99837016203315e-06, + "loss": 0.4083, + "step": 390 + }, + { + "epoch": 0.03805352798053528, + "grad_norm": 2.001337081282171, + "learning_rate": 9.998329672961952e-06, + "loss": 0.4999, + "step": 391 + }, + { + "epoch": 0.038150851581508514, + "grad_norm": 1.7630230797021285, + "learning_rate": 9.998288687203665e-06, + "loss": 0.4267, + "step": 392 + }, + { + "epoch": 0.038248175182481754, + "grad_norm": 1.4413546421147376, + "learning_rate": 9.998247204762358e-06, + "loss": 0.3028, + "step": 393 + }, + { + "epoch": 0.03834549878345499, + "grad_norm": 2.032450629241147, + "learning_rate": 9.998205225642154e-06, + "loss": 0.4216, + "step": 394 + }, + { + "epoch": 0.03844282238442823, + "grad_norm": 1.8288270303352272, + "learning_rate": 9.998162749847224e-06, + "loss": 0.451, + "step": 395 + }, + { + "epoch": 0.03854014598540146, + "grad_norm": 1.5869427581540143, + "learning_rate": 9.998119777381791e-06, + "loss": 0.4896, + "step": 396 + }, + { + "epoch": 0.03863746958637469, + "grad_norm": 1.9312614168983935, + "learning_rate": 9.998076308250122e-06, + "loss": 0.351, + "step": 397 + }, + { + "epoch": 0.03873479318734793, + "grad_norm": 2.182734939846557, + "learning_rate": 9.99803234245654e-06, + "loss": 0.4456, + "step": 398 + }, + { + "epoch": 0.038832116788321165, + "grad_norm": 1.6075130172605856, + "learning_rate": 9.997987880005412e-06, + "loss": 0.3333, + "step": 399 + }, + { + "epoch": 0.038929440389294405, + "grad_norm": 2.0206579020801048, + "learning_rate": 9.997942920901154e-06, + "loss": 0.4662, + "step": 400 + }, + { + "epoch": 0.03902676399026764, + "grad_norm": 2.0019154912621246, + "learning_rate": 9.997897465148236e-06, + "loss": 0.588, + "step": 401 + }, + { + "epoch": 0.03912408759124088, + "grad_norm": 1.9556688755730123, + "learning_rate": 9.997851512751178e-06, + "loss": 0.5364, + "step": 402 + }, + { + "epoch": 0.03922141119221411, + "grad_norm": 2.1735940620422687, + "learning_rate": 9.997805063714541e-06, + "loss": 0.4155, + "step": 403 + }, + { + "epoch": 0.03931873479318735, + "grad_norm": 1.893104755523836, + "learning_rate": 9.997758118042945e-06, + "loss": 0.2835, + "step": 404 + }, + { + "epoch": 0.03941605839416058, + "grad_norm": 1.892857392200546, + "learning_rate": 9.99771067574105e-06, + "loss": 0.317, + "step": 405 + }, + { + "epoch": 0.03951338199513382, + "grad_norm": 2.194365925195629, + "learning_rate": 9.997662736813575e-06, + "loss": 0.5972, + "step": 406 + }, + { + "epoch": 0.039610705596107056, + "grad_norm": 2.3359516870584547, + "learning_rate": 9.997614301265281e-06, + "loss": 0.3505, + "step": 407 + }, + { + "epoch": 0.039708029197080295, + "grad_norm": 1.8041349283411827, + "learning_rate": 9.997565369100983e-06, + "loss": 0.4003, + "step": 408 + }, + { + "epoch": 0.03980535279805353, + "grad_norm": 2.2199870140108273, + "learning_rate": 9.997515940325542e-06, + "loss": 0.4428, + "step": 409 + }, + { + "epoch": 0.03990267639902676, + "grad_norm": 2.193796849633566, + "learning_rate": 9.997466014943871e-06, + "loss": 0.3906, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 2.7309920828616168, + "learning_rate": 9.99741559296093e-06, + "loss": 0.6283, + "step": 411 + }, + { + "epoch": 0.040097323600973234, + "grad_norm": 2.220745639846989, + "learning_rate": 9.99736467438173e-06, + "loss": 0.4568, + "step": 412 + }, + { + "epoch": 0.04019464720194647, + "grad_norm": 1.905067765139487, + "learning_rate": 9.99731325921133e-06, + "loss": 0.3198, + "step": 413 + }, + { + "epoch": 0.040291970802919706, + "grad_norm": 2.0461180940034116, + "learning_rate": 9.997261347454841e-06, + "loss": 0.3783, + "step": 414 + }, + { + "epoch": 0.040389294403892946, + "grad_norm": 1.9732614929529544, + "learning_rate": 9.99720893911742e-06, + "loss": 0.5211, + "step": 415 + }, + { + "epoch": 0.04048661800486618, + "grad_norm": 2.341156401873798, + "learning_rate": 9.997156034204276e-06, + "loss": 0.5094, + "step": 416 + }, + { + "epoch": 0.04058394160583942, + "grad_norm": 2.2588135503158138, + "learning_rate": 9.997102632720664e-06, + "loss": 0.591, + "step": 417 + }, + { + "epoch": 0.04068126520681265, + "grad_norm": 2.187795564574772, + "learning_rate": 9.997048734671893e-06, + "loss": 0.3811, + "step": 418 + }, + { + "epoch": 0.04077858880778589, + "grad_norm": 2.2570398189900938, + "learning_rate": 9.996994340063314e-06, + "loss": 0.4494, + "step": 419 + }, + { + "epoch": 0.040875912408759124, + "grad_norm": 2.3267878846596597, + "learning_rate": 9.996939448900341e-06, + "loss": 0.5254, + "step": 420 + }, + { + "epoch": 0.04097323600973236, + "grad_norm": 1.9149387144635641, + "learning_rate": 9.99688406118842e-06, + "loss": 0.4281, + "step": 421 + }, + { + "epoch": 0.0410705596107056, + "grad_norm": 2.4052095021382285, + "learning_rate": 9.996828176933062e-06, + "loss": 0.61, + "step": 422 + }, + { + "epoch": 0.04116788321167883, + "grad_norm": 2.8744864627123237, + "learning_rate": 9.996771796139814e-06, + "loss": 0.4708, + "step": 423 + }, + { + "epoch": 0.04126520681265207, + "grad_norm": 2.0334953222734513, + "learning_rate": 9.996714918814284e-06, + "loss": 0.2697, + "step": 424 + }, + { + "epoch": 0.0413625304136253, + "grad_norm": 2.1314093477075486, + "learning_rate": 9.996657544962119e-06, + "loss": 0.3026, + "step": 425 + }, + { + "epoch": 0.04145985401459854, + "grad_norm": 1.7241742631767316, + "learning_rate": 9.996599674589022e-06, + "loss": 0.3624, + "step": 426 + }, + { + "epoch": 0.041557177615571775, + "grad_norm": 2.417754377928955, + "learning_rate": 9.996541307700746e-06, + "loss": 0.6682, + "step": 427 + }, + { + "epoch": 0.041654501216545015, + "grad_norm": 2.2126055245100256, + "learning_rate": 9.99648244430309e-06, + "loss": 0.3705, + "step": 428 + }, + { + "epoch": 0.04175182481751825, + "grad_norm": 1.8224510106748588, + "learning_rate": 9.996423084401901e-06, + "loss": 0.4318, + "step": 429 + }, + { + "epoch": 0.04184914841849149, + "grad_norm": 1.6786428352287364, + "learning_rate": 9.996363228003079e-06, + "loss": 0.4662, + "step": 430 + }, + { + "epoch": 0.04194647201946472, + "grad_norm": 1.9342922605897592, + "learning_rate": 9.99630287511257e-06, + "loss": 0.4874, + "step": 431 + }, + { + "epoch": 0.04204379562043795, + "grad_norm": 1.9444011100602645, + "learning_rate": 9.996242025736377e-06, + "loss": 0.3711, + "step": 432 + }, + { + "epoch": 0.04214111922141119, + "grad_norm": 3.114184163688958, + "learning_rate": 9.99618067988054e-06, + "loss": 0.5342, + "step": 433 + }, + { + "epoch": 0.042238442822384425, + "grad_norm": 1.993932460938173, + "learning_rate": 9.99611883755116e-06, + "loss": 0.465, + "step": 434 + }, + { + "epoch": 0.042335766423357665, + "grad_norm": 1.5062408953506277, + "learning_rate": 9.99605649875438e-06, + "loss": 0.3862, + "step": 435 + }, + { + "epoch": 0.0424330900243309, + "grad_norm": 2.5287447175721733, + "learning_rate": 9.995993663496394e-06, + "loss": 0.5638, + "step": 436 + }, + { + "epoch": 0.04253041362530414, + "grad_norm": 1.7215400937807486, + "learning_rate": 9.995930331783448e-06, + "loss": 0.3507, + "step": 437 + }, + { + "epoch": 0.04262773722627737, + "grad_norm": 1.5105936757865817, + "learning_rate": 9.995866503621834e-06, + "loss": 0.4086, + "step": 438 + }, + { + "epoch": 0.04272506082725061, + "grad_norm": 1.828501540310894, + "learning_rate": 9.995802179017893e-06, + "loss": 0.3477, + "step": 439 + }, + { + "epoch": 0.04282238442822384, + "grad_norm": 1.6658361590948114, + "learning_rate": 9.995737357978022e-06, + "loss": 0.4006, + "step": 440 + }, + { + "epoch": 0.04291970802919708, + "grad_norm": 1.6434395036324305, + "learning_rate": 9.995672040508656e-06, + "loss": 0.4349, + "step": 441 + }, + { + "epoch": 0.043017031630170316, + "grad_norm": 1.9913424027071125, + "learning_rate": 9.99560622661629e-06, + "loss": 0.3415, + "step": 442 + }, + { + "epoch": 0.043114355231143556, + "grad_norm": 1.6487474195389296, + "learning_rate": 9.995539916307463e-06, + "loss": 0.4804, + "step": 443 + }, + { + "epoch": 0.04321167883211679, + "grad_norm": 1.4861266391850032, + "learning_rate": 9.995473109588764e-06, + "loss": 0.411, + "step": 444 + }, + { + "epoch": 0.04330900243309002, + "grad_norm": 1.4390762643228305, + "learning_rate": 9.995405806466831e-06, + "loss": 0.3806, + "step": 445 + }, + { + "epoch": 0.04340632603406326, + "grad_norm": 1.7775332171720517, + "learning_rate": 9.995338006948353e-06, + "loss": 0.3332, + "step": 446 + }, + { + "epoch": 0.043503649635036494, + "grad_norm": 1.7312883283317864, + "learning_rate": 9.995269711040067e-06, + "loss": 0.2736, + "step": 447 + }, + { + "epoch": 0.043600973236009734, + "grad_norm": 1.7973901424872405, + "learning_rate": 9.995200918748759e-06, + "loss": 0.5597, + "step": 448 + }, + { + "epoch": 0.04369829683698297, + "grad_norm": 2.0409413301370334, + "learning_rate": 9.995131630081265e-06, + "loss": 0.6045, + "step": 449 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 3.2708903670147347, + "learning_rate": 9.995061845044473e-06, + "loss": 0.6245, + "step": 450 + }, + { + "epoch": 0.04389294403892944, + "grad_norm": 1.744466889932859, + "learning_rate": 9.994991563645314e-06, + "loss": 0.4129, + "step": 451 + }, + { + "epoch": 0.04399026763990268, + "grad_norm": 1.8775864246251477, + "learning_rate": 9.994920785890771e-06, + "loss": 0.414, + "step": 452 + }, + { + "epoch": 0.04408759124087591, + "grad_norm": 1.3868286948878126, + "learning_rate": 9.994849511787881e-06, + "loss": 0.3164, + "step": 453 + }, + { + "epoch": 0.04418491484184915, + "grad_norm": 1.6888257223301795, + "learning_rate": 9.994777741343727e-06, + "loss": 0.3241, + "step": 454 + }, + { + "epoch": 0.044282238442822384, + "grad_norm": 1.5029594314338663, + "learning_rate": 9.994705474565436e-06, + "loss": 0.4148, + "step": 455 + }, + { + "epoch": 0.04437956204379562, + "grad_norm": 1.7159996915963702, + "learning_rate": 9.994632711460193e-06, + "loss": 0.3387, + "step": 456 + }, + { + "epoch": 0.04447688564476886, + "grad_norm": 1.7717997513120352, + "learning_rate": 9.994559452035228e-06, + "loss": 0.4547, + "step": 457 + }, + { + "epoch": 0.04457420924574209, + "grad_norm": 1.887765282184233, + "learning_rate": 9.99448569629782e-06, + "loss": 0.5919, + "step": 458 + }, + { + "epoch": 0.04467153284671533, + "grad_norm": 2.0151049512314585, + "learning_rate": 9.994411444255298e-06, + "loss": 0.4556, + "step": 459 + }, + { + "epoch": 0.04476885644768856, + "grad_norm": 1.5706463359289826, + "learning_rate": 9.994336695915041e-06, + "loss": 0.3443, + "step": 460 + }, + { + "epoch": 0.0448661800486618, + "grad_norm": 1.9067884841542395, + "learning_rate": 9.994261451284477e-06, + "loss": 0.5862, + "step": 461 + }, + { + "epoch": 0.044963503649635035, + "grad_norm": 1.7346846845298518, + "learning_rate": 9.994185710371083e-06, + "loss": 0.3588, + "step": 462 + }, + { + "epoch": 0.045060827250608275, + "grad_norm": 1.5593715629463312, + "learning_rate": 9.994109473182385e-06, + "loss": 0.2891, + "step": 463 + }, + { + "epoch": 0.04515815085158151, + "grad_norm": 2.326736753149576, + "learning_rate": 9.994032739725959e-06, + "loss": 0.6517, + "step": 464 + }, + { + "epoch": 0.04525547445255475, + "grad_norm": 2.2142852132770305, + "learning_rate": 9.99395551000943e-06, + "loss": 0.3571, + "step": 465 + }, + { + "epoch": 0.04535279805352798, + "grad_norm": 1.7351954813390544, + "learning_rate": 9.993877784040474e-06, + "loss": 0.3849, + "step": 466 + }, + { + "epoch": 0.04545012165450121, + "grad_norm": 1.3962336815381617, + "learning_rate": 9.993799561826811e-06, + "loss": 0.311, + "step": 467 + }, + { + "epoch": 0.04554744525547445, + "grad_norm": 1.878958465421645, + "learning_rate": 9.993720843376216e-06, + "loss": 0.5602, + "step": 468 + }, + { + "epoch": 0.045644768856447686, + "grad_norm": 1.519160992933857, + "learning_rate": 9.993641628696513e-06, + "loss": 0.2379, + "step": 469 + }, + { + "epoch": 0.045742092457420926, + "grad_norm": 2.5345930464298885, + "learning_rate": 9.99356191779557e-06, + "loss": 0.4239, + "step": 470 + }, + { + "epoch": 0.04583941605839416, + "grad_norm": 1.3153911718041251, + "learning_rate": 9.993481710681314e-06, + "loss": 0.3454, + "step": 471 + }, + { + "epoch": 0.0459367396593674, + "grad_norm": 2.16208125563947, + "learning_rate": 9.993401007361707e-06, + "loss": 0.5386, + "step": 472 + }, + { + "epoch": 0.04603406326034063, + "grad_norm": 1.8150842593472827, + "learning_rate": 9.993319807844775e-06, + "loss": 0.3077, + "step": 473 + }, + { + "epoch": 0.04613138686131387, + "grad_norm": 1.6656864462678063, + "learning_rate": 9.993238112138584e-06, + "loss": 0.4927, + "step": 474 + }, + { + "epoch": 0.046228710462287104, + "grad_norm": 1.3429917702468868, + "learning_rate": 9.993155920251252e-06, + "loss": 0.2433, + "step": 475 + }, + { + "epoch": 0.04632603406326034, + "grad_norm": 1.3651155739367906, + "learning_rate": 9.993073232190949e-06, + "loss": 0.2947, + "step": 476 + }, + { + "epoch": 0.046423357664233576, + "grad_norm": 1.7815516701613203, + "learning_rate": 9.992990047965887e-06, + "loss": 0.5372, + "step": 477 + }, + { + "epoch": 0.046520681265206816, + "grad_norm": 1.846696342179327, + "learning_rate": 9.992906367584337e-06, + "loss": 0.5127, + "step": 478 + }, + { + "epoch": 0.04661800486618005, + "grad_norm": 1.7511253825578088, + "learning_rate": 9.992822191054612e-06, + "loss": 0.4074, + "step": 479 + }, + { + "epoch": 0.04671532846715328, + "grad_norm": 1.8105635986872588, + "learning_rate": 9.992737518385076e-06, + "loss": 0.4998, + "step": 480 + }, + { + "epoch": 0.04681265206812652, + "grad_norm": 2.2743597617900746, + "learning_rate": 9.992652349584147e-06, + "loss": 0.6249, + "step": 481 + }, + { + "epoch": 0.046909975669099754, + "grad_norm": 1.93948496382319, + "learning_rate": 9.992566684660282e-06, + "loss": 0.5411, + "step": 482 + }, + { + "epoch": 0.047007299270072994, + "grad_norm": 1.4073760716303516, + "learning_rate": 9.992480523621999e-06, + "loss": 0.3506, + "step": 483 + }, + { + "epoch": 0.04710462287104623, + "grad_norm": 1.388293079160528, + "learning_rate": 9.992393866477856e-06, + "loss": 0.3304, + "step": 484 + }, + { + "epoch": 0.04720194647201947, + "grad_norm": 2.082643572745618, + "learning_rate": 9.992306713236467e-06, + "loss": 0.5653, + "step": 485 + }, + { + "epoch": 0.0472992700729927, + "grad_norm": 1.7104664332606834, + "learning_rate": 9.992219063906492e-06, + "loss": 0.3317, + "step": 486 + }, + { + "epoch": 0.04739659367396594, + "grad_norm": 1.7575848919482624, + "learning_rate": 9.992130918496638e-06, + "loss": 0.4109, + "step": 487 + }, + { + "epoch": 0.04749391727493917, + "grad_norm": 1.7351379091271637, + "learning_rate": 9.992042277015668e-06, + "loss": 0.5065, + "step": 488 + }, + { + "epoch": 0.04759124087591241, + "grad_norm": 1.4444570948381004, + "learning_rate": 9.991953139472387e-06, + "loss": 0.4023, + "step": 489 + }, + { + "epoch": 0.047688564476885645, + "grad_norm": 1.4697709289140384, + "learning_rate": 9.991863505875656e-06, + "loss": 0.3364, + "step": 490 + }, + { + "epoch": 0.04778588807785888, + "grad_norm": 1.9428205960506804, + "learning_rate": 9.99177337623438e-06, + "loss": 0.4303, + "step": 491 + }, + { + "epoch": 0.04788321167883212, + "grad_norm": 1.931152158561148, + "learning_rate": 9.991682750557516e-06, + "loss": 0.2857, + "step": 492 + }, + { + "epoch": 0.04798053527980535, + "grad_norm": 1.9301394655308035, + "learning_rate": 9.991591628854067e-06, + "loss": 0.5998, + "step": 493 + }, + { + "epoch": 0.04807785888077859, + "grad_norm": 1.7788293016868693, + "learning_rate": 9.99150001113309e-06, + "loss": 0.4595, + "step": 494 + }, + { + "epoch": 0.04817518248175182, + "grad_norm": 2.0641225732440134, + "learning_rate": 9.99140789740369e-06, + "loss": 0.3848, + "step": 495 + }, + { + "epoch": 0.04827250608272506, + "grad_norm": 2.2832955373527044, + "learning_rate": 9.99131528767502e-06, + "loss": 0.6396, + "step": 496 + }, + { + "epoch": 0.048369829683698295, + "grad_norm": 1.6658790952812916, + "learning_rate": 9.99122218195628e-06, + "loss": 0.5429, + "step": 497 + }, + { + "epoch": 0.048467153284671535, + "grad_norm": 1.6568038302360257, + "learning_rate": 9.991128580256725e-06, + "loss": 0.4532, + "step": 498 + }, + { + "epoch": 0.04856447688564477, + "grad_norm": 1.8451659374514144, + "learning_rate": 9.991034482585656e-06, + "loss": 0.5845, + "step": 499 + }, + { + "epoch": 0.04866180048661801, + "grad_norm": 1.9103948838029656, + "learning_rate": 9.99093988895242e-06, + "loss": 0.5508, + "step": 500 + }, + { + "epoch": 0.04875912408759124, + "grad_norm": 1.9691733858712537, + "learning_rate": 9.990844799366422e-06, + "loss": 0.6374, + "step": 501 + }, + { + "epoch": 0.048856447688564474, + "grad_norm": 2.1278472226161846, + "learning_rate": 9.990749213837108e-06, + "loss": 0.572, + "step": 502 + }, + { + "epoch": 0.04895377128953771, + "grad_norm": 1.9704028865885994, + "learning_rate": 9.990653132373977e-06, + "loss": 0.6282, + "step": 503 + }, + { + "epoch": 0.049051094890510946, + "grad_norm": 1.8965741341561362, + "learning_rate": 9.990556554986577e-06, + "loss": 0.5749, + "step": 504 + }, + { + "epoch": 0.049148418491484186, + "grad_norm": 1.5425018763105707, + "learning_rate": 9.990459481684504e-06, + "loss": 0.4236, + "step": 505 + }, + { + "epoch": 0.04924574209245742, + "grad_norm": 1.736669998068125, + "learning_rate": 9.990361912477405e-06, + "loss": 0.4275, + "step": 506 + }, + { + "epoch": 0.04934306569343066, + "grad_norm": 2.049335776858506, + "learning_rate": 9.990263847374976e-06, + "loss": 0.6897, + "step": 507 + }, + { + "epoch": 0.04944038929440389, + "grad_norm": 1.8544975871268152, + "learning_rate": 9.990165286386961e-06, + "loss": 0.4811, + "step": 508 + }, + { + "epoch": 0.04953771289537713, + "grad_norm": 1.5709178763522822, + "learning_rate": 9.990066229523155e-06, + "loss": 0.4585, + "step": 509 + }, + { + "epoch": 0.049635036496350364, + "grad_norm": 2.1410068811754153, + "learning_rate": 9.989966676793399e-06, + "loss": 0.4773, + "step": 510 + }, + { + "epoch": 0.049732360097323604, + "grad_norm": 1.760724042734433, + "learning_rate": 9.989866628207589e-06, + "loss": 0.3144, + "step": 511 + }, + { + "epoch": 0.04982968369829684, + "grad_norm": 1.8521560168370175, + "learning_rate": 9.989766083775662e-06, + "loss": 0.4656, + "step": 512 + }, + { + "epoch": 0.049927007299270076, + "grad_norm": 1.544987615640627, + "learning_rate": 9.989665043507616e-06, + "loss": 0.4089, + "step": 513 + }, + { + "epoch": 0.05002433090024331, + "grad_norm": 1.9122960249889975, + "learning_rate": 9.989563507413487e-06, + "loss": 0.4535, + "step": 514 + }, + { + "epoch": 0.05012165450121654, + "grad_norm": 1.5187134098621655, + "learning_rate": 9.989461475503363e-06, + "loss": 0.31, + "step": 515 + }, + { + "epoch": 0.05021897810218978, + "grad_norm": 1.562160455050312, + "learning_rate": 9.989358947787389e-06, + "loss": 0.4009, + "step": 516 + }, + { + "epoch": 0.050316301703163015, + "grad_norm": 1.738084966314413, + "learning_rate": 9.989255924275746e-06, + "loss": 0.4723, + "step": 517 + }, + { + "epoch": 0.050413625304136254, + "grad_norm": 2.156580581755068, + "learning_rate": 9.989152404978678e-06, + "loss": 0.4407, + "step": 518 + }, + { + "epoch": 0.05051094890510949, + "grad_norm": 1.8652302207700793, + "learning_rate": 9.989048389906469e-06, + "loss": 0.587, + "step": 519 + }, + { + "epoch": 0.05060827250608273, + "grad_norm": 1.5934369396830426, + "learning_rate": 9.988943879069452e-06, + "loss": 0.3961, + "step": 520 + }, + { + "epoch": 0.05070559610705596, + "grad_norm": 1.4294562647861604, + "learning_rate": 9.988838872478017e-06, + "loss": 0.3382, + "step": 521 + }, + { + "epoch": 0.0508029197080292, + "grad_norm": 1.5693240874435923, + "learning_rate": 9.988733370142598e-06, + "loss": 0.3876, + "step": 522 + }, + { + "epoch": 0.05090024330900243, + "grad_norm": 1.6720738515514542, + "learning_rate": 9.988627372073678e-06, + "loss": 0.448, + "step": 523 + }, + { + "epoch": 0.05099756690997567, + "grad_norm": 2.0438207304961367, + "learning_rate": 9.988520878281787e-06, + "loss": 0.5724, + "step": 524 + }, + { + "epoch": 0.051094890510948905, + "grad_norm": 2.0003463921985456, + "learning_rate": 9.988413888777512e-06, + "loss": 0.4506, + "step": 525 + }, + { + "epoch": 0.05119221411192214, + "grad_norm": 2.11812759304704, + "learning_rate": 9.988306403571482e-06, + "loss": 0.757, + "step": 526 + }, + { + "epoch": 0.05128953771289538, + "grad_norm": 1.5594386055307068, + "learning_rate": 9.98819842267438e-06, + "loss": 0.4145, + "step": 527 + }, + { + "epoch": 0.05138686131386861, + "grad_norm": 1.917978943216931, + "learning_rate": 9.988089946096933e-06, + "loss": 0.5363, + "step": 528 + }, + { + "epoch": 0.05148418491484185, + "grad_norm": 1.3212282063862113, + "learning_rate": 9.987980973849924e-06, + "loss": 0.3132, + "step": 529 + }, + { + "epoch": 0.05158150851581508, + "grad_norm": 1.2285769982465171, + "learning_rate": 9.987871505944177e-06, + "loss": 0.2287, + "step": 530 + }, + { + "epoch": 0.05167883211678832, + "grad_norm": 1.849610792922833, + "learning_rate": 9.987761542390574e-06, + "loss": 0.6487, + "step": 531 + }, + { + "epoch": 0.051776155717761556, + "grad_norm": 1.158461389164102, + "learning_rate": 9.987651083200044e-06, + "loss": 0.2111, + "step": 532 + }, + { + "epoch": 0.051873479318734796, + "grad_norm": 1.8450520976911682, + "learning_rate": 9.987540128383556e-06, + "loss": 0.5579, + "step": 533 + }, + { + "epoch": 0.05197080291970803, + "grad_norm": 1.9047794610871986, + "learning_rate": 9.98742867795214e-06, + "loss": 0.4542, + "step": 534 + }, + { + "epoch": 0.05206812652068127, + "grad_norm": 1.5564676952152843, + "learning_rate": 9.987316731916872e-06, + "loss": 0.4467, + "step": 535 + }, + { + "epoch": 0.0521654501216545, + "grad_norm": 1.403952395827601, + "learning_rate": 9.987204290288876e-06, + "loss": 0.3761, + "step": 536 + }, + { + "epoch": 0.052262773722627734, + "grad_norm": 1.948151749349848, + "learning_rate": 9.987091353079323e-06, + "loss": 0.5782, + "step": 537 + }, + { + "epoch": 0.052360097323600974, + "grad_norm": 1.6211222818460531, + "learning_rate": 9.986977920299437e-06, + "loss": 0.4047, + "step": 538 + }, + { + "epoch": 0.052457420924574207, + "grad_norm": 1.4911900726837217, + "learning_rate": 9.986863991960491e-06, + "loss": 0.3817, + "step": 539 + }, + { + "epoch": 0.052554744525547446, + "grad_norm": 1.530872687739145, + "learning_rate": 9.986749568073804e-06, + "loss": 0.4639, + "step": 540 + }, + { + "epoch": 0.05265206812652068, + "grad_norm": 1.766399180057757, + "learning_rate": 9.986634648650746e-06, + "loss": 0.5132, + "step": 541 + }, + { + "epoch": 0.05274939172749392, + "grad_norm": 1.7318370911583716, + "learning_rate": 9.98651923370274e-06, + "loss": 0.5845, + "step": 542 + }, + { + "epoch": 0.05284671532846715, + "grad_norm": 1.4523428175637472, + "learning_rate": 9.986403323241252e-06, + "loss": 0.3817, + "step": 543 + }, + { + "epoch": 0.05294403892944039, + "grad_norm": 1.3085205057626972, + "learning_rate": 9.9862869172778e-06, + "loss": 0.294, + "step": 544 + }, + { + "epoch": 0.053041362530413624, + "grad_norm": 1.749260064779093, + "learning_rate": 9.986170015823953e-06, + "loss": 0.3885, + "step": 545 + }, + { + "epoch": 0.053138686131386864, + "grad_norm": 1.9224820302612053, + "learning_rate": 9.986052618891326e-06, + "loss": 0.5841, + "step": 546 + }, + { + "epoch": 0.0532360097323601, + "grad_norm": 1.6019594770490224, + "learning_rate": 9.985934726491587e-06, + "loss": 0.5602, + "step": 547 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 1.63788543125369, + "learning_rate": 9.98581633863645e-06, + "loss": 0.4913, + "step": 548 + }, + { + "epoch": 0.05343065693430657, + "grad_norm": 1.7751230304686407, + "learning_rate": 9.985697455337677e-06, + "loss": 0.4575, + "step": 549 + }, + { + "epoch": 0.0535279805352798, + "grad_norm": 1.4813830287768246, + "learning_rate": 9.985578076607086e-06, + "loss": 0.2811, + "step": 550 + }, + { + "epoch": 0.05362530413625304, + "grad_norm": 1.8047180833743464, + "learning_rate": 9.985458202456534e-06, + "loss": 0.5564, + "step": 551 + }, + { + "epoch": 0.053722627737226275, + "grad_norm": 1.4776771818705197, + "learning_rate": 9.985337832897938e-06, + "loss": 0.2842, + "step": 552 + }, + { + "epoch": 0.053819951338199515, + "grad_norm": 1.800973083472876, + "learning_rate": 9.985216967943256e-06, + "loss": 0.4017, + "step": 553 + }, + { + "epoch": 0.05391727493917275, + "grad_norm": 1.4167019147788764, + "learning_rate": 9.985095607604502e-06, + "loss": 0.2676, + "step": 554 + }, + { + "epoch": 0.05401459854014599, + "grad_norm": 1.462279330828973, + "learning_rate": 9.984973751893732e-06, + "loss": 0.342, + "step": 555 + }, + { + "epoch": 0.05411192214111922, + "grad_norm": 1.7941608662857766, + "learning_rate": 9.984851400823056e-06, + "loss": 0.4851, + "step": 556 + }, + { + "epoch": 0.05420924574209246, + "grad_norm": 1.865163176610701, + "learning_rate": 9.984728554404632e-06, + "loss": 0.5938, + "step": 557 + }, + { + "epoch": 0.05430656934306569, + "grad_norm": 1.9578700904261006, + "learning_rate": 9.984605212650669e-06, + "loss": 0.5846, + "step": 558 + }, + { + "epoch": 0.05440389294403893, + "grad_norm": 1.7615345522382602, + "learning_rate": 9.98448137557342e-06, + "loss": 0.5517, + "step": 559 + }, + { + "epoch": 0.054501216545012166, + "grad_norm": 1.7987507193579173, + "learning_rate": 9.984357043185195e-06, + "loss": 0.4511, + "step": 560 + }, + { + "epoch": 0.0545985401459854, + "grad_norm": 1.8966136067258859, + "learning_rate": 9.984232215498347e-06, + "loss": 0.3339, + "step": 561 + }, + { + "epoch": 0.05469586374695864, + "grad_norm": 1.760439118311743, + "learning_rate": 9.98410689252528e-06, + "loss": 0.4797, + "step": 562 + }, + { + "epoch": 0.05479318734793187, + "grad_norm": 1.7467534741216573, + "learning_rate": 9.983981074278448e-06, + "loss": 0.3854, + "step": 563 + }, + { + "epoch": 0.05489051094890511, + "grad_norm": 1.638747457914032, + "learning_rate": 9.983854760770353e-06, + "loss": 0.3215, + "step": 564 + }, + { + "epoch": 0.054987834549878344, + "grad_norm": 1.565721167011275, + "learning_rate": 9.983727952013546e-06, + "loss": 0.3573, + "step": 565 + }, + { + "epoch": 0.05508515815085158, + "grad_norm": 1.819373023432736, + "learning_rate": 9.98360064802063e-06, + "loss": 0.304, + "step": 566 + }, + { + "epoch": 0.055182481751824816, + "grad_norm": 2.219648367380945, + "learning_rate": 9.983472848804254e-06, + "loss": 0.7398, + "step": 567 + }, + { + "epoch": 0.055279805352798056, + "grad_norm": 1.7935096739228122, + "learning_rate": 9.98334455437712e-06, + "loss": 0.3257, + "step": 568 + }, + { + "epoch": 0.05537712895377129, + "grad_norm": 2.085379879601924, + "learning_rate": 9.983215764751971e-06, + "loss": 0.3477, + "step": 569 + }, + { + "epoch": 0.05547445255474453, + "grad_norm": 1.528881264990704, + "learning_rate": 9.98308647994161e-06, + "loss": 0.4173, + "step": 570 + }, + { + "epoch": 0.05557177615571776, + "grad_norm": 1.282510416609492, + "learning_rate": 9.982956699958883e-06, + "loss": 0.3513, + "step": 571 + }, + { + "epoch": 0.055669099756690994, + "grad_norm": 1.6035600811723405, + "learning_rate": 9.982826424816688e-06, + "loss": 0.3318, + "step": 572 + }, + { + "epoch": 0.055766423357664234, + "grad_norm": 1.9455996381881653, + "learning_rate": 9.982695654527966e-06, + "loss": 0.4991, + "step": 573 + }, + { + "epoch": 0.05586374695863747, + "grad_norm": 1.8397262762514839, + "learning_rate": 9.982564389105714e-06, + "loss": 0.345, + "step": 574 + }, + { + "epoch": 0.05596107055961071, + "grad_norm": 1.7997461351876956, + "learning_rate": 9.982432628562978e-06, + "loss": 0.5384, + "step": 575 + }, + { + "epoch": 0.05605839416058394, + "grad_norm": 1.6246101205121968, + "learning_rate": 9.982300372912848e-06, + "loss": 0.5499, + "step": 576 + }, + { + "epoch": 0.05615571776155718, + "grad_norm": 1.9184631207748861, + "learning_rate": 9.982167622168467e-06, + "loss": 0.449, + "step": 577 + }, + { + "epoch": 0.05625304136253041, + "grad_norm": 1.5368079698239796, + "learning_rate": 9.982034376343029e-06, + "loss": 0.3311, + "step": 578 + }, + { + "epoch": 0.05635036496350365, + "grad_norm": 1.9061539422519105, + "learning_rate": 9.98190063544977e-06, + "loss": 0.4182, + "step": 579 + }, + { + "epoch": 0.056447688564476885, + "grad_norm": 1.6727227174184238, + "learning_rate": 9.981766399501984e-06, + "loss": 0.482, + "step": 580 + }, + { + "epoch": 0.056545012165450124, + "grad_norm": 1.8546055763617424, + "learning_rate": 9.98163166851301e-06, + "loss": 0.5758, + "step": 581 + }, + { + "epoch": 0.05664233576642336, + "grad_norm": 2.0350303098403706, + "learning_rate": 9.981496442496234e-06, + "loss": 0.5236, + "step": 582 + }, + { + "epoch": 0.0567396593673966, + "grad_norm": 1.3907379790284926, + "learning_rate": 9.981360721465095e-06, + "loss": 0.3375, + "step": 583 + }, + { + "epoch": 0.05683698296836983, + "grad_norm": 2.0168702766261486, + "learning_rate": 9.98122450543308e-06, + "loss": 0.595, + "step": 584 + }, + { + "epoch": 0.05693430656934306, + "grad_norm": 1.7248754760467295, + "learning_rate": 9.981087794413722e-06, + "loss": 0.3747, + "step": 585 + }, + { + "epoch": 0.0570316301703163, + "grad_norm": 1.8918865818240052, + "learning_rate": 9.98095058842061e-06, + "loss": 0.5805, + "step": 586 + }, + { + "epoch": 0.057128953771289535, + "grad_norm": 1.8691153689026438, + "learning_rate": 9.980812887467377e-06, + "loss": 0.3451, + "step": 587 + }, + { + "epoch": 0.057226277372262775, + "grad_norm": 1.7475224395533677, + "learning_rate": 9.980674691567705e-06, + "loss": 0.2789, + "step": 588 + }, + { + "epoch": 0.05732360097323601, + "grad_norm": 1.876124489873064, + "learning_rate": 9.980536000735328e-06, + "loss": 0.5917, + "step": 589 + }, + { + "epoch": 0.05742092457420925, + "grad_norm": 1.6438847446693803, + "learning_rate": 9.980396814984025e-06, + "loss": 0.3063, + "step": 590 + }, + { + "epoch": 0.05751824817518248, + "grad_norm": 1.7609146888426583, + "learning_rate": 9.980257134327634e-06, + "loss": 0.4177, + "step": 591 + }, + { + "epoch": 0.05761557177615572, + "grad_norm": 3.1047413099950445, + "learning_rate": 9.980116958780027e-06, + "loss": 0.2793, + "step": 592 + }, + { + "epoch": 0.05771289537712895, + "grad_norm": 1.3365913263494138, + "learning_rate": 9.979976288355137e-06, + "loss": 0.2754, + "step": 593 + }, + { + "epoch": 0.05781021897810219, + "grad_norm": 1.7378721977452198, + "learning_rate": 9.979835123066943e-06, + "loss": 0.4156, + "step": 594 + }, + { + "epoch": 0.057907542579075426, + "grad_norm": 1.7652517953930271, + "learning_rate": 9.979693462929472e-06, + "loss": 0.3768, + "step": 595 + }, + { + "epoch": 0.05800486618004866, + "grad_norm": 2.4155692425963675, + "learning_rate": 9.979551307956801e-06, + "loss": 0.6409, + "step": 596 + }, + { + "epoch": 0.0581021897810219, + "grad_norm": 2.2339995809091913, + "learning_rate": 9.979408658163055e-06, + "loss": 0.3134, + "step": 597 + }, + { + "epoch": 0.05819951338199513, + "grad_norm": 1.9788468018769068, + "learning_rate": 9.97926551356241e-06, + "loss": 0.2509, + "step": 598 + }, + { + "epoch": 0.05829683698296837, + "grad_norm": 4.0668515887714385, + "learning_rate": 9.979121874169091e-06, + "loss": 0.3322, + "step": 599 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 2.0552497355613264, + "learning_rate": 9.97897773999737e-06, + "loss": 0.2732, + "step": 600 + }, + { + "epoch": 0.058491484184914844, + "grad_norm": 1.7372746328291984, + "learning_rate": 9.978833111061573e-06, + "loss": 0.3021, + "step": 601 + }, + { + "epoch": 0.058588807785888077, + "grad_norm": 1.8426989129405926, + "learning_rate": 9.978687987376067e-06, + "loss": 0.3147, + "step": 602 + }, + { + "epoch": 0.058686131386861316, + "grad_norm": 1.456816302033054, + "learning_rate": 9.978542368955278e-06, + "loss": 0.3669, + "step": 603 + }, + { + "epoch": 0.05878345498783455, + "grad_norm": 2.1398878847147973, + "learning_rate": 9.978396255813672e-06, + "loss": 0.457, + "step": 604 + }, + { + "epoch": 0.05888077858880779, + "grad_norm": 1.860652260495742, + "learning_rate": 9.978249647965769e-06, + "loss": 0.5567, + "step": 605 + }, + { + "epoch": 0.05897810218978102, + "grad_norm": 1.7559525207322428, + "learning_rate": 9.97810254542614e-06, + "loss": 0.439, + "step": 606 + }, + { + "epoch": 0.059075425790754255, + "grad_norm": 1.4912680944094816, + "learning_rate": 9.977954948209402e-06, + "loss": 0.4431, + "step": 607 + }, + { + "epoch": 0.059172749391727494, + "grad_norm": 1.766690700595448, + "learning_rate": 9.97780685633022e-06, + "loss": 0.3187, + "step": 608 + }, + { + "epoch": 0.05927007299270073, + "grad_norm": 2.169180646458804, + "learning_rate": 9.977658269803312e-06, + "loss": 0.5042, + "step": 609 + }, + { + "epoch": 0.05936739659367397, + "grad_norm": 1.623119439845207, + "learning_rate": 9.977509188643441e-06, + "loss": 0.3632, + "step": 610 + }, + { + "epoch": 0.0594647201946472, + "grad_norm": 2.0976883017366226, + "learning_rate": 9.977359612865424e-06, + "loss": 0.6465, + "step": 611 + }, + { + "epoch": 0.05956204379562044, + "grad_norm": 1.59126192242755, + "learning_rate": 9.977209542484123e-06, + "loss": 0.4335, + "step": 612 + }, + { + "epoch": 0.05965936739659367, + "grad_norm": 1.6532378246551842, + "learning_rate": 9.97705897751445e-06, + "loss": 0.3462, + "step": 613 + }, + { + "epoch": 0.05975669099756691, + "grad_norm": 1.6478059833585124, + "learning_rate": 9.976907917971365e-06, + "loss": 0.4063, + "step": 614 + }, + { + "epoch": 0.059854014598540145, + "grad_norm": 1.750559308727237, + "learning_rate": 9.976756363869884e-06, + "loss": 0.5062, + "step": 615 + }, + { + "epoch": 0.059951338199513385, + "grad_norm": 1.6400113365898012, + "learning_rate": 9.976604315225063e-06, + "loss": 0.3699, + "step": 616 + }, + { + "epoch": 0.06004866180048662, + "grad_norm": 1.5449283565959169, + "learning_rate": 9.976451772052013e-06, + "loss": 0.3635, + "step": 617 + }, + { + "epoch": 0.06014598540145986, + "grad_norm": 1.3799772345005799, + "learning_rate": 9.97629873436589e-06, + "loss": 0.2747, + "step": 618 + }, + { + "epoch": 0.06024330900243309, + "grad_norm": 1.9454941262632244, + "learning_rate": 9.976145202181905e-06, + "loss": 0.4963, + "step": 619 + }, + { + "epoch": 0.06034063260340632, + "grad_norm": 1.5274916477255973, + "learning_rate": 9.975991175515311e-06, + "loss": 0.3348, + "step": 620 + }, + { + "epoch": 0.06043795620437956, + "grad_norm": 1.9623540496009142, + "learning_rate": 9.975836654381416e-06, + "loss": 0.5373, + "step": 621 + }, + { + "epoch": 0.060535279805352796, + "grad_norm": 1.4248144765181632, + "learning_rate": 9.975681638795575e-06, + "loss": 0.3137, + "step": 622 + }, + { + "epoch": 0.060632603406326036, + "grad_norm": 1.4366236793713136, + "learning_rate": 9.975526128773192e-06, + "loss": 0.3519, + "step": 623 + }, + { + "epoch": 0.06072992700729927, + "grad_norm": 1.8458441140553945, + "learning_rate": 9.97537012432972e-06, + "loss": 0.3937, + "step": 624 + }, + { + "epoch": 0.06082725060827251, + "grad_norm": 1.868271580826056, + "learning_rate": 9.975213625480658e-06, + "loss": 0.4567, + "step": 625 + }, + { + "epoch": 0.06092457420924574, + "grad_norm": 2.4613001964869223, + "learning_rate": 9.97505663224156e-06, + "loss": 0.5607, + "step": 626 + }, + { + "epoch": 0.06102189781021898, + "grad_norm": 1.6709839772769468, + "learning_rate": 9.974899144628027e-06, + "loss": 0.3233, + "step": 627 + }, + { + "epoch": 0.061119221411192214, + "grad_norm": 1.8046591620263965, + "learning_rate": 9.97474116265571e-06, + "loss": 0.3929, + "step": 628 + }, + { + "epoch": 0.06121654501216545, + "grad_norm": 1.7182161369033975, + "learning_rate": 9.974582686340304e-06, + "loss": 0.3804, + "step": 629 + }, + { + "epoch": 0.061313868613138686, + "grad_norm": 2.435940855169524, + "learning_rate": 9.974423715697558e-06, + "loss": 0.7453, + "step": 630 + }, + { + "epoch": 0.06141119221411192, + "grad_norm": 1.401143104634322, + "learning_rate": 9.974264250743272e-06, + "loss": 0.306, + "step": 631 + }, + { + "epoch": 0.06150851581508516, + "grad_norm": 1.540550326071636, + "learning_rate": 9.97410429149329e-06, + "loss": 0.3582, + "step": 632 + }, + { + "epoch": 0.06160583941605839, + "grad_norm": 4.038520112503673, + "learning_rate": 9.973943837963507e-06, + "loss": 0.2688, + "step": 633 + }, + { + "epoch": 0.06170316301703163, + "grad_norm": 2.032927304778425, + "learning_rate": 9.973782890169867e-06, + "loss": 0.6952, + "step": 634 + }, + { + "epoch": 0.061800486618004864, + "grad_norm": 1.5242884680104736, + "learning_rate": 9.973621448128364e-06, + "loss": 0.3957, + "step": 635 + }, + { + "epoch": 0.061897810218978104, + "grad_norm": 1.599953340803732, + "learning_rate": 9.973459511855042e-06, + "loss": 0.3783, + "step": 636 + }, + { + "epoch": 0.06199513381995134, + "grad_norm": 2.1886899708740697, + "learning_rate": 9.973297081365988e-06, + "loss": 0.5426, + "step": 637 + }, + { + "epoch": 0.06209245742092458, + "grad_norm": 1.363421719809718, + "learning_rate": 9.973134156677349e-06, + "loss": 0.2707, + "step": 638 + }, + { + "epoch": 0.06218978102189781, + "grad_norm": 1.883218491971664, + "learning_rate": 9.972970737805312e-06, + "loss": 0.543, + "step": 639 + }, + { + "epoch": 0.06228710462287105, + "grad_norm": 1.6336178778276322, + "learning_rate": 9.972806824766117e-06, + "loss": 0.4833, + "step": 640 + }, + { + "epoch": 0.06238442822384428, + "grad_norm": 1.74145478719615, + "learning_rate": 9.972642417576049e-06, + "loss": 0.5456, + "step": 641 + }, + { + "epoch": 0.062481751824817515, + "grad_norm": 1.3939447959630629, + "learning_rate": 9.972477516251448e-06, + "loss": 0.2935, + "step": 642 + }, + { + "epoch": 0.06257907542579075, + "grad_norm": 1.9741261661680443, + "learning_rate": 9.9723121208087e-06, + "loss": 0.4377, + "step": 643 + }, + { + "epoch": 0.06267639902676399, + "grad_norm": 2.214700253529172, + "learning_rate": 9.972146231264242e-06, + "loss": 0.6711, + "step": 644 + }, + { + "epoch": 0.06277372262773723, + "grad_norm": 1.7399845992974294, + "learning_rate": 9.971979847634554e-06, + "loss": 0.5327, + "step": 645 + }, + { + "epoch": 0.06287104622871047, + "grad_norm": 1.3552365502663122, + "learning_rate": 9.971812969936174e-06, + "loss": 0.3553, + "step": 646 + }, + { + "epoch": 0.06296836982968369, + "grad_norm": 1.8378075997453163, + "learning_rate": 9.971645598185685e-06, + "loss": 0.3709, + "step": 647 + }, + { + "epoch": 0.06306569343065693, + "grad_norm": 1.7441350204189767, + "learning_rate": 9.971477732399714e-06, + "loss": 0.489, + "step": 648 + }, + { + "epoch": 0.06316301703163017, + "grad_norm": 2.083031963167252, + "learning_rate": 9.971309372594947e-06, + "loss": 0.6196, + "step": 649 + }, + { + "epoch": 0.06326034063260341, + "grad_norm": 1.5678236487001533, + "learning_rate": 9.971140518788112e-06, + "loss": 0.3202, + "step": 650 + }, + { + "epoch": 0.06335766423357664, + "grad_norm": 1.7281008810115812, + "learning_rate": 9.970971170995988e-06, + "loss": 0.4169, + "step": 651 + }, + { + "epoch": 0.06345498783454988, + "grad_norm": 1.5626981990993993, + "learning_rate": 9.970801329235402e-06, + "loss": 0.4238, + "step": 652 + }, + { + "epoch": 0.06355231143552312, + "grad_norm": 1.5338214380715702, + "learning_rate": 9.970630993523234e-06, + "loss": 0.278, + "step": 653 + }, + { + "epoch": 0.06364963503649634, + "grad_norm": 1.7806299033721755, + "learning_rate": 9.970460163876409e-06, + "loss": 0.5649, + "step": 654 + }, + { + "epoch": 0.06374695863746958, + "grad_norm": 1.9349681554929028, + "learning_rate": 9.9702888403119e-06, + "loss": 0.3297, + "step": 655 + }, + { + "epoch": 0.06384428223844282, + "grad_norm": 1.4947723050696704, + "learning_rate": 9.970117022846736e-06, + "loss": 0.4077, + "step": 656 + }, + { + "epoch": 0.06394160583941606, + "grad_norm": 1.5696774237596223, + "learning_rate": 9.96994471149799e-06, + "loss": 0.4681, + "step": 657 + }, + { + "epoch": 0.06403892944038929, + "grad_norm": 1.7662095984112474, + "learning_rate": 9.969771906282781e-06, + "loss": 0.539, + "step": 658 + }, + { + "epoch": 0.06413625304136253, + "grad_norm": 2.926336951253308, + "learning_rate": 9.969598607218285e-06, + "loss": 0.4196, + "step": 659 + }, + { + "epoch": 0.06423357664233577, + "grad_norm": 3.148192138198314, + "learning_rate": 9.96942481432172e-06, + "loss": 0.4827, + "step": 660 + }, + { + "epoch": 0.06433090024330901, + "grad_norm": 1.790436662552377, + "learning_rate": 9.969250527610356e-06, + "loss": 0.4972, + "step": 661 + }, + { + "epoch": 0.06442822384428223, + "grad_norm": 1.4712739725679773, + "learning_rate": 9.969075747101514e-06, + "loss": 0.4112, + "step": 662 + }, + { + "epoch": 0.06452554744525547, + "grad_norm": 1.4521996617982842, + "learning_rate": 9.96890047281256e-06, + "loss": 0.3729, + "step": 663 + }, + { + "epoch": 0.06462287104622871, + "grad_norm": 1.5457088814513262, + "learning_rate": 9.96872470476091e-06, + "loss": 0.4294, + "step": 664 + }, + { + "epoch": 0.06472019464720194, + "grad_norm": 1.7644033340951866, + "learning_rate": 9.968548442964034e-06, + "loss": 0.4487, + "step": 665 + }, + { + "epoch": 0.06481751824817518, + "grad_norm": 1.632555708701406, + "learning_rate": 9.968371687439446e-06, + "loss": 0.3929, + "step": 666 + }, + { + "epoch": 0.06491484184914842, + "grad_norm": 1.8990302396780172, + "learning_rate": 9.968194438204708e-06, + "loss": 0.4101, + "step": 667 + }, + { + "epoch": 0.06501216545012166, + "grad_norm": 2.092762728551112, + "learning_rate": 9.968016695277436e-06, + "loss": 0.5712, + "step": 668 + }, + { + "epoch": 0.06510948905109488, + "grad_norm": 1.5876668887386824, + "learning_rate": 9.967838458675292e-06, + "loss": 0.494, + "step": 669 + }, + { + "epoch": 0.06520681265206812, + "grad_norm": 1.7536517597940893, + "learning_rate": 9.967659728415985e-06, + "loss": 0.6121, + "step": 670 + }, + { + "epoch": 0.06530413625304136, + "grad_norm": 1.9021294255711243, + "learning_rate": 9.96748050451728e-06, + "loss": 0.3634, + "step": 671 + }, + { + "epoch": 0.0654014598540146, + "grad_norm": 1.4457078547633553, + "learning_rate": 9.96730078699698e-06, + "loss": 0.4586, + "step": 672 + }, + { + "epoch": 0.06549878345498783, + "grad_norm": 1.6474950184261972, + "learning_rate": 9.967120575872952e-06, + "loss": 0.5028, + "step": 673 + }, + { + "epoch": 0.06559610705596107, + "grad_norm": 1.9901979572232373, + "learning_rate": 9.966939871163098e-06, + "loss": 0.6986, + "step": 674 + }, + { + "epoch": 0.06569343065693431, + "grad_norm": 1.3671458210722949, + "learning_rate": 9.966758672885375e-06, + "loss": 0.3945, + "step": 675 + }, + { + "epoch": 0.06579075425790755, + "grad_norm": 1.8371332697903162, + "learning_rate": 9.96657698105779e-06, + "loss": 0.6782, + "step": 676 + }, + { + "epoch": 0.06588807785888078, + "grad_norm": 1.1955013749239556, + "learning_rate": 9.966394795698397e-06, + "loss": 0.242, + "step": 677 + }, + { + "epoch": 0.06598540145985402, + "grad_norm": 1.5330975344313047, + "learning_rate": 9.966212116825302e-06, + "loss": 0.4351, + "step": 678 + }, + { + "epoch": 0.06608272506082725, + "grad_norm": 1.539581985713935, + "learning_rate": 9.966028944456657e-06, + "loss": 0.3512, + "step": 679 + }, + { + "epoch": 0.06618004866180048, + "grad_norm": 1.9573455375443363, + "learning_rate": 9.965845278610661e-06, + "loss": 0.4859, + "step": 680 + }, + { + "epoch": 0.06627737226277372, + "grad_norm": 1.8387055004344444, + "learning_rate": 9.96566111930557e-06, + "loss": 0.3831, + "step": 681 + }, + { + "epoch": 0.06637469586374696, + "grad_norm": 1.7056154014174738, + "learning_rate": 9.96547646655968e-06, + "loss": 0.4675, + "step": 682 + }, + { + "epoch": 0.0664720194647202, + "grad_norm": 1.881602931580563, + "learning_rate": 9.965291320391342e-06, + "loss": 0.5955, + "step": 683 + }, + { + "epoch": 0.06656934306569343, + "grad_norm": 2.9885065529853416, + "learning_rate": 9.965105680818955e-06, + "loss": 0.393, + "step": 684 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.7363492709096229, + "learning_rate": 9.964919547860963e-06, + "loss": 0.4903, + "step": 685 + }, + { + "epoch": 0.0667639902676399, + "grad_norm": 1.8182376939684146, + "learning_rate": 9.964732921535863e-06, + "loss": 0.5443, + "step": 686 + }, + { + "epoch": 0.06686131386861315, + "grad_norm": 1.6914779965026407, + "learning_rate": 9.964545801862202e-06, + "loss": 0.5119, + "step": 687 + }, + { + "epoch": 0.06695863746958637, + "grad_norm": 1.2736843314571082, + "learning_rate": 9.964358188858573e-06, + "loss": 0.2495, + "step": 688 + }, + { + "epoch": 0.06705596107055961, + "grad_norm": 1.5831736266585599, + "learning_rate": 9.96417008254362e-06, + "loss": 0.4489, + "step": 689 + }, + { + "epoch": 0.06715328467153285, + "grad_norm": 2.2148297560046224, + "learning_rate": 9.963981482936034e-06, + "loss": 0.5415, + "step": 690 + }, + { + "epoch": 0.06725060827250608, + "grad_norm": 1.5025934211262992, + "learning_rate": 9.963792390054558e-06, + "loss": 0.3903, + "step": 691 + }, + { + "epoch": 0.06734793187347932, + "grad_norm": 1.4602374679322867, + "learning_rate": 9.96360280391798e-06, + "loss": 0.3199, + "step": 692 + }, + { + "epoch": 0.06744525547445256, + "grad_norm": 1.5813416284844282, + "learning_rate": 9.963412724545142e-06, + "loss": 0.3213, + "step": 693 + }, + { + "epoch": 0.0675425790754258, + "grad_norm": 1.246883512769049, + "learning_rate": 9.96322215195493e-06, + "loss": 0.2644, + "step": 694 + }, + { + "epoch": 0.06763990267639902, + "grad_norm": 1.7094335347253355, + "learning_rate": 9.963031086166282e-06, + "loss": 0.4761, + "step": 695 + }, + { + "epoch": 0.06773722627737226, + "grad_norm": 1.6516611118524773, + "learning_rate": 9.962839527198184e-06, + "loss": 0.4823, + "step": 696 + }, + { + "epoch": 0.0678345498783455, + "grad_norm": 1.3531669839243998, + "learning_rate": 9.962647475069672e-06, + "loss": 0.4272, + "step": 697 + }, + { + "epoch": 0.06793187347931874, + "grad_norm": 1.9430916606586504, + "learning_rate": 9.962454929799829e-06, + "loss": 0.5776, + "step": 698 + }, + { + "epoch": 0.06802919708029197, + "grad_norm": 1.8772536403383466, + "learning_rate": 9.962261891407792e-06, + "loss": 0.6338, + "step": 699 + }, + { + "epoch": 0.0681265206812652, + "grad_norm": 1.3972932620324034, + "learning_rate": 9.96206835991274e-06, + "loss": 0.3671, + "step": 700 + }, + { + "epoch": 0.06822384428223845, + "grad_norm": 1.287329601381866, + "learning_rate": 9.961874335333904e-06, + "loss": 0.2744, + "step": 701 + }, + { + "epoch": 0.06832116788321167, + "grad_norm": 1.5600519457751545, + "learning_rate": 9.961679817690566e-06, + "loss": 0.4433, + "step": 702 + }, + { + "epoch": 0.06841849148418491, + "grad_norm": 1.3898736874388666, + "learning_rate": 9.961484807002056e-06, + "loss": 0.4197, + "step": 703 + }, + { + "epoch": 0.06851581508515815, + "grad_norm": 1.672202746628868, + "learning_rate": 9.961289303287749e-06, + "loss": 0.4601, + "step": 704 + }, + { + "epoch": 0.06861313868613139, + "grad_norm": 1.7427655274680753, + "learning_rate": 9.961093306567076e-06, + "loss": 0.5845, + "step": 705 + }, + { + "epoch": 0.06871046228710462, + "grad_norm": 1.794570108008766, + "learning_rate": 9.960896816859512e-06, + "loss": 0.3459, + "step": 706 + }, + { + "epoch": 0.06880778588807786, + "grad_norm": 1.6024314197975584, + "learning_rate": 9.960699834184582e-06, + "loss": 0.4441, + "step": 707 + }, + { + "epoch": 0.0689051094890511, + "grad_norm": 1.619306935418848, + "learning_rate": 9.960502358561858e-06, + "loss": 0.4647, + "step": 708 + }, + { + "epoch": 0.06900243309002434, + "grad_norm": 1.5009190604836247, + "learning_rate": 9.960304390010968e-06, + "loss": 0.373, + "step": 709 + }, + { + "epoch": 0.06909975669099756, + "grad_norm": 1.8613999824223078, + "learning_rate": 9.960105928551583e-06, + "loss": 0.3926, + "step": 710 + }, + { + "epoch": 0.0691970802919708, + "grad_norm": 2.8907340364253757, + "learning_rate": 9.959906974203422e-06, + "loss": 0.5451, + "step": 711 + }, + { + "epoch": 0.06929440389294404, + "grad_norm": 1.826374356881247, + "learning_rate": 9.959707526986256e-06, + "loss": 0.4341, + "step": 712 + }, + { + "epoch": 0.06939172749391727, + "grad_norm": 2.5001373253299133, + "learning_rate": 9.959507586919903e-06, + "loss": 0.6643, + "step": 713 + }, + { + "epoch": 0.06948905109489051, + "grad_norm": 1.769427365923108, + "learning_rate": 9.959307154024234e-06, + "loss": 0.5431, + "step": 714 + }, + { + "epoch": 0.06958637469586375, + "grad_norm": 2.3285358245695322, + "learning_rate": 9.959106228319166e-06, + "loss": 0.5274, + "step": 715 + }, + { + "epoch": 0.06968369829683699, + "grad_norm": 1.4070234926508725, + "learning_rate": 9.958904809824663e-06, + "loss": 0.3257, + "step": 716 + }, + { + "epoch": 0.06978102189781021, + "grad_norm": 1.9284568290872997, + "learning_rate": 9.958702898560742e-06, + "loss": 0.5648, + "step": 717 + }, + { + "epoch": 0.06987834549878345, + "grad_norm": 2.092543866644565, + "learning_rate": 9.958500494547465e-06, + "loss": 0.6256, + "step": 718 + }, + { + "epoch": 0.0699756690997567, + "grad_norm": 1.5948763588365042, + "learning_rate": 9.958297597804947e-06, + "loss": 0.4011, + "step": 719 + }, + { + "epoch": 0.07007299270072993, + "grad_norm": 1.2246362905267065, + "learning_rate": 9.958094208353348e-06, + "loss": 0.2444, + "step": 720 + }, + { + "epoch": 0.07017031630170316, + "grad_norm": 1.2302916868666773, + "learning_rate": 9.95789032621288e-06, + "loss": 0.3191, + "step": 721 + }, + { + "epoch": 0.0702676399026764, + "grad_norm": 1.5504396768673763, + "learning_rate": 9.957685951403803e-06, + "loss": 0.3112, + "step": 722 + }, + { + "epoch": 0.07036496350364964, + "grad_norm": 2.1205819146422438, + "learning_rate": 9.957481083946427e-06, + "loss": 0.3453, + "step": 723 + }, + { + "epoch": 0.07046228710462286, + "grad_norm": 2.048519725880563, + "learning_rate": 9.957275723861108e-06, + "loss": 0.5266, + "step": 724 + }, + { + "epoch": 0.0705596107055961, + "grad_norm": 1.4453693275620771, + "learning_rate": 9.957069871168253e-06, + "loss": 0.3082, + "step": 725 + }, + { + "epoch": 0.07065693430656934, + "grad_norm": 1.8824931146868138, + "learning_rate": 9.956863525888318e-06, + "loss": 0.588, + "step": 726 + }, + { + "epoch": 0.07075425790754258, + "grad_norm": 1.6143333569692804, + "learning_rate": 9.956656688041807e-06, + "loss": 0.4126, + "step": 727 + }, + { + "epoch": 0.07085158150851581, + "grad_norm": 1.7905307392122496, + "learning_rate": 9.956449357649276e-06, + "loss": 0.521, + "step": 728 + }, + { + "epoch": 0.07094890510948905, + "grad_norm": 1.3295021098228834, + "learning_rate": 9.956241534731325e-06, + "loss": 0.31, + "step": 729 + }, + { + "epoch": 0.07104622871046229, + "grad_norm": 1.5783278835300563, + "learning_rate": 9.956033219308607e-06, + "loss": 0.3091, + "step": 730 + }, + { + "epoch": 0.07114355231143553, + "grad_norm": 1.9905003004076265, + "learning_rate": 9.955824411401822e-06, + "loss": 0.3843, + "step": 731 + }, + { + "epoch": 0.07124087591240875, + "grad_norm": 1.7644558301646922, + "learning_rate": 9.955615111031717e-06, + "loss": 0.4288, + "step": 732 + }, + { + "epoch": 0.071338199513382, + "grad_norm": 1.5922207695027908, + "learning_rate": 9.955405318219096e-06, + "loss": 0.4767, + "step": 733 + }, + { + "epoch": 0.07143552311435523, + "grad_norm": 1.7054240956141933, + "learning_rate": 9.955195032984798e-06, + "loss": 0.4082, + "step": 734 + }, + { + "epoch": 0.07153284671532846, + "grad_norm": 1.3954970063738148, + "learning_rate": 9.954984255349729e-06, + "loss": 0.318, + "step": 735 + }, + { + "epoch": 0.0716301703163017, + "grad_norm": 1.7287069268697828, + "learning_rate": 9.954772985334825e-06, + "loss": 0.4998, + "step": 736 + }, + { + "epoch": 0.07172749391727494, + "grad_norm": 1.4535895804720915, + "learning_rate": 9.954561222961086e-06, + "loss": 0.2489, + "step": 737 + }, + { + "epoch": 0.07182481751824818, + "grad_norm": 1.7113518757446542, + "learning_rate": 9.954348968249552e-06, + "loss": 0.4578, + "step": 738 + }, + { + "epoch": 0.0719221411192214, + "grad_norm": 1.6741613993254088, + "learning_rate": 9.954136221221316e-06, + "loss": 0.4907, + "step": 739 + }, + { + "epoch": 0.07201946472019465, + "grad_norm": 1.590982465166657, + "learning_rate": 9.95392298189752e-06, + "loss": 0.4116, + "step": 740 + }, + { + "epoch": 0.07211678832116789, + "grad_norm": 1.422974716648181, + "learning_rate": 9.953709250299351e-06, + "loss": 0.3501, + "step": 741 + }, + { + "epoch": 0.07221411192214112, + "grad_norm": 1.8424007198547667, + "learning_rate": 9.953495026448048e-06, + "loss": 0.5647, + "step": 742 + }, + { + "epoch": 0.07231143552311435, + "grad_norm": 1.6572484299897867, + "learning_rate": 9.953280310364902e-06, + "loss": 0.3937, + "step": 743 + }, + { + "epoch": 0.07240875912408759, + "grad_norm": 1.6027770112754065, + "learning_rate": 9.953065102071245e-06, + "loss": 0.3845, + "step": 744 + }, + { + "epoch": 0.07250608272506083, + "grad_norm": 1.3618658637431431, + "learning_rate": 9.952849401588464e-06, + "loss": 0.3946, + "step": 745 + }, + { + "epoch": 0.07260340632603407, + "grad_norm": 1.63075572158439, + "learning_rate": 9.952633208937997e-06, + "loss": 0.4506, + "step": 746 + }, + { + "epoch": 0.0727007299270073, + "grad_norm": 1.483187632244976, + "learning_rate": 9.95241652414132e-06, + "loss": 0.3908, + "step": 747 + }, + { + "epoch": 0.07279805352798054, + "grad_norm": 2.147960263046311, + "learning_rate": 9.952199347219972e-06, + "loss": 0.5249, + "step": 748 + }, + { + "epoch": 0.07289537712895378, + "grad_norm": 1.5046941105429004, + "learning_rate": 9.951981678195529e-06, + "loss": 0.3592, + "step": 749 + }, + { + "epoch": 0.072992700729927, + "grad_norm": 1.1457618113072725, + "learning_rate": 9.951763517089624e-06, + "loss": 0.2197, + "step": 750 + }, + { + "epoch": 0.07309002433090024, + "grad_norm": 1.9275946136488011, + "learning_rate": 9.951544863923934e-06, + "loss": 0.5692, + "step": 751 + }, + { + "epoch": 0.07318734793187348, + "grad_norm": 1.9590929330277462, + "learning_rate": 9.95132571872019e-06, + "loss": 0.7243, + "step": 752 + }, + { + "epoch": 0.07328467153284672, + "grad_norm": 2.1368780826391283, + "learning_rate": 9.951106081500162e-06, + "loss": 0.7601, + "step": 753 + }, + { + "epoch": 0.07338199513381995, + "grad_norm": 2.0085695969306396, + "learning_rate": 9.950885952285682e-06, + "loss": 0.5541, + "step": 754 + }, + { + "epoch": 0.07347931873479319, + "grad_norm": 1.9283983503616706, + "learning_rate": 9.950665331098622e-06, + "loss": 0.3832, + "step": 755 + }, + { + "epoch": 0.07357664233576643, + "grad_norm": 1.4173732379297153, + "learning_rate": 9.950444217960902e-06, + "loss": 0.379, + "step": 756 + }, + { + "epoch": 0.07367396593673967, + "grad_norm": 1.5015176407129935, + "learning_rate": 9.9502226128945e-06, + "loss": 0.4696, + "step": 757 + }, + { + "epoch": 0.07377128953771289, + "grad_norm": 1.6746905852394565, + "learning_rate": 9.950000515921434e-06, + "loss": 0.2984, + "step": 758 + }, + { + "epoch": 0.07386861313868613, + "grad_norm": 1.4429847737048944, + "learning_rate": 9.949777927063776e-06, + "loss": 0.3748, + "step": 759 + }, + { + "epoch": 0.07396593673965937, + "grad_norm": 1.1895632638034424, + "learning_rate": 9.94955484634364e-06, + "loss": 0.3014, + "step": 760 + }, + { + "epoch": 0.0740632603406326, + "grad_norm": 1.5497241513071458, + "learning_rate": 9.949331273783198e-06, + "loss": 0.5458, + "step": 761 + }, + { + "epoch": 0.07416058394160584, + "grad_norm": 1.5531214201672936, + "learning_rate": 9.949107209404664e-06, + "loss": 0.4575, + "step": 762 + }, + { + "epoch": 0.07425790754257908, + "grad_norm": 1.3336107839559097, + "learning_rate": 9.948882653230306e-06, + "loss": 0.4227, + "step": 763 + }, + { + "epoch": 0.07435523114355232, + "grad_norm": 1.7418209768074853, + "learning_rate": 9.948657605282437e-06, + "loss": 0.659, + "step": 764 + }, + { + "epoch": 0.07445255474452554, + "grad_norm": 1.462439433090815, + "learning_rate": 9.94843206558342e-06, + "loss": 0.445, + "step": 765 + }, + { + "epoch": 0.07454987834549878, + "grad_norm": 1.0856086178050317, + "learning_rate": 9.948206034155666e-06, + "loss": 0.2245, + "step": 766 + }, + { + "epoch": 0.07464720194647202, + "grad_norm": 1.458503858496447, + "learning_rate": 9.947979511021638e-06, + "loss": 0.3009, + "step": 767 + }, + { + "epoch": 0.07474452554744526, + "grad_norm": 1.1921292471996519, + "learning_rate": 9.947752496203844e-06, + "loss": 0.2988, + "step": 768 + }, + { + "epoch": 0.07484184914841849, + "grad_norm": 1.6693024138876786, + "learning_rate": 9.947524989724844e-06, + "loss": 0.4783, + "step": 769 + }, + { + "epoch": 0.07493917274939173, + "grad_norm": 1.4928671202909605, + "learning_rate": 9.947296991607244e-06, + "loss": 0.4161, + "step": 770 + }, + { + "epoch": 0.07503649635036497, + "grad_norm": 1.4549005796935413, + "learning_rate": 9.947068501873702e-06, + "loss": 0.4186, + "step": 771 + }, + { + "epoch": 0.0751338199513382, + "grad_norm": 1.7544781744298734, + "learning_rate": 9.946839520546923e-06, + "loss": 0.5593, + "step": 772 + }, + { + "epoch": 0.07523114355231143, + "grad_norm": 1.561541454027553, + "learning_rate": 9.946610047649659e-06, + "loss": 0.5097, + "step": 773 + }, + { + "epoch": 0.07532846715328467, + "grad_norm": 1.598616630831168, + "learning_rate": 9.946380083204714e-06, + "loss": 0.3744, + "step": 774 + }, + { + "epoch": 0.07542579075425791, + "grad_norm": 1.6915556597188157, + "learning_rate": 9.94614962723494e-06, + "loss": 0.439, + "step": 775 + }, + { + "epoch": 0.07552311435523114, + "grad_norm": 1.220024420697048, + "learning_rate": 9.945918679763237e-06, + "loss": 0.2339, + "step": 776 + }, + { + "epoch": 0.07562043795620438, + "grad_norm": 1.6061445238682988, + "learning_rate": 9.945687240812556e-06, + "loss": 0.4493, + "step": 777 + }, + { + "epoch": 0.07571776155717762, + "grad_norm": 1.400813806243779, + "learning_rate": 9.945455310405895e-06, + "loss": 0.4513, + "step": 778 + }, + { + "epoch": 0.07581508515815086, + "grad_norm": 1.753751480308555, + "learning_rate": 9.945222888566298e-06, + "loss": 0.5379, + "step": 779 + }, + { + "epoch": 0.07591240875912408, + "grad_norm": 1.4421667558329163, + "learning_rate": 9.944989975316862e-06, + "loss": 0.4118, + "step": 780 + }, + { + "epoch": 0.07600973236009732, + "grad_norm": 1.4411974086247974, + "learning_rate": 9.944756570680733e-06, + "loss": 0.3295, + "step": 781 + }, + { + "epoch": 0.07610705596107056, + "grad_norm": 1.5545586767450623, + "learning_rate": 9.944522674681107e-06, + "loss": 0.4146, + "step": 782 + }, + { + "epoch": 0.07620437956204379, + "grad_norm": 2.0019900434858084, + "learning_rate": 9.944288287341222e-06, + "loss": 0.4945, + "step": 783 + }, + { + "epoch": 0.07630170316301703, + "grad_norm": 1.5834930071710975, + "learning_rate": 9.944053408684371e-06, + "loss": 0.3781, + "step": 784 + }, + { + "epoch": 0.07639902676399027, + "grad_norm": 1.5272521164667598, + "learning_rate": 9.943818038733894e-06, + "loss": 0.3865, + "step": 785 + }, + { + "epoch": 0.07649635036496351, + "grad_norm": 1.8005925077547513, + "learning_rate": 9.94358217751318e-06, + "loss": 0.3951, + "step": 786 + }, + { + "epoch": 0.07659367396593673, + "grad_norm": 2.0471085276865995, + "learning_rate": 9.943345825045664e-06, + "loss": 0.6391, + "step": 787 + }, + { + "epoch": 0.07669099756690997, + "grad_norm": 1.7893386028077656, + "learning_rate": 9.943108981354839e-06, + "loss": 0.6373, + "step": 788 + }, + { + "epoch": 0.07678832116788321, + "grad_norm": 1.6529186502183046, + "learning_rate": 9.942871646464234e-06, + "loss": 0.4901, + "step": 789 + }, + { + "epoch": 0.07688564476885645, + "grad_norm": 1.8449837387732961, + "learning_rate": 9.942633820397436e-06, + "loss": 0.4444, + "step": 790 + }, + { + "epoch": 0.07698296836982968, + "grad_norm": 1.5278738521461448, + "learning_rate": 9.942395503178077e-06, + "loss": 0.3701, + "step": 791 + }, + { + "epoch": 0.07708029197080292, + "grad_norm": 1.8197808533034088, + "learning_rate": 9.942156694829838e-06, + "loss": 0.6142, + "step": 792 + }, + { + "epoch": 0.07717761557177616, + "grad_norm": 1.8496691201700692, + "learning_rate": 9.941917395376452e-06, + "loss": 0.2021, + "step": 793 + }, + { + "epoch": 0.07727493917274939, + "grad_norm": 1.8762664332677217, + "learning_rate": 9.941677604841696e-06, + "loss": 0.6742, + "step": 794 + }, + { + "epoch": 0.07737226277372262, + "grad_norm": 1.5933514264940258, + "learning_rate": 9.9414373232494e-06, + "loss": 0.5156, + "step": 795 + }, + { + "epoch": 0.07746958637469586, + "grad_norm": 1.538651154827247, + "learning_rate": 9.94119655062344e-06, + "loss": 0.446, + "step": 796 + }, + { + "epoch": 0.0775669099756691, + "grad_norm": 3.7300878200470926, + "learning_rate": 9.94095528698774e-06, + "loss": 0.2745, + "step": 797 + }, + { + "epoch": 0.07766423357664233, + "grad_norm": 1.685774804326696, + "learning_rate": 9.940713532366277e-06, + "loss": 0.4236, + "step": 798 + }, + { + "epoch": 0.07776155717761557, + "grad_norm": 1.2528388212678458, + "learning_rate": 9.940471286783074e-06, + "loss": 0.308, + "step": 799 + }, + { + "epoch": 0.07785888077858881, + "grad_norm": 1.5082779398207746, + "learning_rate": 9.940228550262203e-06, + "loss": 0.4925, + "step": 800 + }, + { + "epoch": 0.07795620437956205, + "grad_norm": 1.544326069333433, + "learning_rate": 9.939985322827784e-06, + "loss": 0.4341, + "step": 801 + }, + { + "epoch": 0.07805352798053528, + "grad_norm": 1.4959220289677864, + "learning_rate": 9.939741604503987e-06, + "loss": 0.4548, + "step": 802 + }, + { + "epoch": 0.07815085158150852, + "grad_norm": 1.682287714178995, + "learning_rate": 9.93949739531503e-06, + "loss": 0.5277, + "step": 803 + }, + { + "epoch": 0.07824817518248176, + "grad_norm": 1.6519496438708445, + "learning_rate": 9.93925269528518e-06, + "loss": 0.3074, + "step": 804 + }, + { + "epoch": 0.07834549878345498, + "grad_norm": 1.4379883641500402, + "learning_rate": 9.939007504438756e-06, + "loss": 0.3069, + "step": 805 + }, + { + "epoch": 0.07844282238442822, + "grad_norm": 2.0644552037743793, + "learning_rate": 9.93876182280012e-06, + "loss": 0.4479, + "step": 806 + }, + { + "epoch": 0.07854014598540146, + "grad_norm": 1.4791313310441092, + "learning_rate": 9.938515650393685e-06, + "loss": 0.4255, + "step": 807 + }, + { + "epoch": 0.0786374695863747, + "grad_norm": 1.4280736600967436, + "learning_rate": 9.938268987243914e-06, + "loss": 0.466, + "step": 808 + }, + { + "epoch": 0.07873479318734793, + "grad_norm": 1.610976672135659, + "learning_rate": 9.93802183337532e-06, + "loss": 0.4327, + "step": 809 + }, + { + "epoch": 0.07883211678832117, + "grad_norm": 1.5447130604673693, + "learning_rate": 9.93777418881246e-06, + "loss": 0.4931, + "step": 810 + }, + { + "epoch": 0.0789294403892944, + "grad_norm": 1.3831325957946852, + "learning_rate": 9.937526053579944e-06, + "loss": 0.3877, + "step": 811 + }, + { + "epoch": 0.07902676399026765, + "grad_norm": 1.4247112282736865, + "learning_rate": 9.93727742770243e-06, + "loss": 0.4168, + "step": 812 + }, + { + "epoch": 0.07912408759124087, + "grad_norm": 1.5074130304911886, + "learning_rate": 9.937028311204624e-06, + "loss": 0.4747, + "step": 813 + }, + { + "epoch": 0.07922141119221411, + "grad_norm": 1.4955958242475926, + "learning_rate": 9.936778704111278e-06, + "loss": 0.2999, + "step": 814 + }, + { + "epoch": 0.07931873479318735, + "grad_norm": 1.6038468607718186, + "learning_rate": 9.9365286064472e-06, + "loss": 0.4897, + "step": 815 + }, + { + "epoch": 0.07941605839416059, + "grad_norm": 1.8040845780349017, + "learning_rate": 9.93627801823724e-06, + "loss": 0.6413, + "step": 816 + }, + { + "epoch": 0.07951338199513382, + "grad_norm": 1.4598215502284355, + "learning_rate": 9.936026939506298e-06, + "loss": 0.3687, + "step": 817 + }, + { + "epoch": 0.07961070559610706, + "grad_norm": 1.340412030499075, + "learning_rate": 9.935775370279324e-06, + "loss": 0.3833, + "step": 818 + }, + { + "epoch": 0.0797080291970803, + "grad_norm": 1.6913032059853774, + "learning_rate": 9.935523310581318e-06, + "loss": 0.5857, + "step": 819 + }, + { + "epoch": 0.07980535279805352, + "grad_norm": 1.9970663728185467, + "learning_rate": 9.93527076043733e-06, + "loss": 0.6843, + "step": 820 + }, + { + "epoch": 0.07990267639902676, + "grad_norm": 1.4408921562941295, + "learning_rate": 9.93501771987245e-06, + "loss": 0.4385, + "step": 821 + }, + { + "epoch": 0.08, + "grad_norm": 1.5184490203891443, + "learning_rate": 9.934764188911827e-06, + "loss": 0.4708, + "step": 822 + }, + { + "epoch": 0.08009732360097324, + "grad_norm": 1.8501562903086661, + "learning_rate": 9.934510167580654e-06, + "loss": 0.6431, + "step": 823 + }, + { + "epoch": 0.08019464720194647, + "grad_norm": 1.6997829158405129, + "learning_rate": 9.934255655904172e-06, + "loss": 0.5188, + "step": 824 + }, + { + "epoch": 0.08029197080291971, + "grad_norm": 1.8510241792275326, + "learning_rate": 9.934000653907674e-06, + "loss": 0.5457, + "step": 825 + }, + { + "epoch": 0.08038929440389295, + "grad_norm": 1.6853569692908912, + "learning_rate": 9.933745161616498e-06, + "loss": 0.5062, + "step": 826 + }, + { + "epoch": 0.08048661800486619, + "grad_norm": 1.3066104263898661, + "learning_rate": 9.93348917905603e-06, + "loss": 0.404, + "step": 827 + }, + { + "epoch": 0.08058394160583941, + "grad_norm": 1.2788244408859646, + "learning_rate": 9.933232706251712e-06, + "loss": 0.3253, + "step": 828 + }, + { + "epoch": 0.08068126520681265, + "grad_norm": 2.2690800072126325, + "learning_rate": 9.932975743229027e-06, + "loss": 0.3405, + "step": 829 + }, + { + "epoch": 0.08077858880778589, + "grad_norm": 1.9113871035353245, + "learning_rate": 9.932718290013512e-06, + "loss": 0.5989, + "step": 830 + }, + { + "epoch": 0.08087591240875912, + "grad_norm": 1.3655256798283997, + "learning_rate": 9.932460346630748e-06, + "loss": 0.2942, + "step": 831 + }, + { + "epoch": 0.08097323600973236, + "grad_norm": 1.5234864838378999, + "learning_rate": 9.932201913106366e-06, + "loss": 0.3913, + "step": 832 + }, + { + "epoch": 0.0810705596107056, + "grad_norm": 1.3752195876516826, + "learning_rate": 9.93194298946605e-06, + "loss": 0.3293, + "step": 833 + }, + { + "epoch": 0.08116788321167884, + "grad_norm": 1.4842622412969824, + "learning_rate": 9.931683575735527e-06, + "loss": 0.4157, + "step": 834 + }, + { + "epoch": 0.08126520681265206, + "grad_norm": 4.003685207313109, + "learning_rate": 9.931423671940577e-06, + "loss": 0.3276, + "step": 835 + }, + { + "epoch": 0.0813625304136253, + "grad_norm": 1.509943035011216, + "learning_rate": 9.931163278107023e-06, + "loss": 0.4045, + "step": 836 + }, + { + "epoch": 0.08145985401459854, + "grad_norm": 1.4382523765338775, + "learning_rate": 9.930902394260746e-06, + "loss": 0.2709, + "step": 837 + }, + { + "epoch": 0.08155717761557178, + "grad_norm": 1.4492711471586157, + "learning_rate": 9.930641020427665e-06, + "loss": 0.3957, + "step": 838 + }, + { + "epoch": 0.08165450121654501, + "grad_norm": 1.7428876214187694, + "learning_rate": 9.930379156633758e-06, + "loss": 0.5257, + "step": 839 + }, + { + "epoch": 0.08175182481751825, + "grad_norm": 1.5652514836380926, + "learning_rate": 9.930116802905042e-06, + "loss": 0.4948, + "step": 840 + }, + { + "epoch": 0.08184914841849149, + "grad_norm": 2.4133112951540494, + "learning_rate": 9.929853959267589e-06, + "loss": 0.5455, + "step": 841 + }, + { + "epoch": 0.08194647201946471, + "grad_norm": 1.4309460046419233, + "learning_rate": 9.929590625747518e-06, + "loss": 0.4057, + "step": 842 + }, + { + "epoch": 0.08204379562043795, + "grad_norm": 1.0450296792009146, + "learning_rate": 9.929326802370995e-06, + "loss": 0.2332, + "step": 843 + }, + { + "epoch": 0.0821411192214112, + "grad_norm": 1.1201933325217828, + "learning_rate": 9.92906248916424e-06, + "loss": 0.3264, + "step": 844 + }, + { + "epoch": 0.08223844282238443, + "grad_norm": 1.6243579769967154, + "learning_rate": 9.928797686153515e-06, + "loss": 0.5385, + "step": 845 + }, + { + "epoch": 0.08233576642335766, + "grad_norm": 1.3496069901220336, + "learning_rate": 9.928532393365136e-06, + "loss": 0.3875, + "step": 846 + }, + { + "epoch": 0.0824330900243309, + "grad_norm": 1.4862888245769246, + "learning_rate": 9.928266610825462e-06, + "loss": 0.4493, + "step": 847 + }, + { + "epoch": 0.08253041362530414, + "grad_norm": 1.8305160014899666, + "learning_rate": 9.928000338560906e-06, + "loss": 0.4582, + "step": 848 + }, + { + "epoch": 0.08262773722627738, + "grad_norm": 1.642584946989029, + "learning_rate": 9.927733576597926e-06, + "loss": 0.3347, + "step": 849 + }, + { + "epoch": 0.0827250608272506, + "grad_norm": 1.5413363162928122, + "learning_rate": 9.927466324963033e-06, + "loss": 0.4607, + "step": 850 + }, + { + "epoch": 0.08282238442822384, + "grad_norm": 1.7093263469236866, + "learning_rate": 9.927198583682784e-06, + "loss": 0.5706, + "step": 851 + }, + { + "epoch": 0.08291970802919708, + "grad_norm": 1.531714933227777, + "learning_rate": 9.926930352783781e-06, + "loss": 0.533, + "step": 852 + }, + { + "epoch": 0.08301703163017031, + "grad_norm": 1.8181822267445191, + "learning_rate": 9.926661632292683e-06, + "loss": 0.5946, + "step": 853 + }, + { + "epoch": 0.08311435523114355, + "grad_norm": 1.8304662465930317, + "learning_rate": 9.926392422236189e-06, + "loss": 0.3746, + "step": 854 + }, + { + "epoch": 0.08321167883211679, + "grad_norm": 1.3135536142885351, + "learning_rate": 9.926122722641051e-06, + "loss": 0.429, + "step": 855 + }, + { + "epoch": 0.08330900243309003, + "grad_norm": 1.714390027755308, + "learning_rate": 9.925852533534071e-06, + "loss": 0.6806, + "step": 856 + }, + { + "epoch": 0.08340632603406326, + "grad_norm": 1.3399957064659453, + "learning_rate": 9.925581854942099e-06, + "loss": 0.2824, + "step": 857 + }, + { + "epoch": 0.0835036496350365, + "grad_norm": 1.3705351036499993, + "learning_rate": 9.925310686892026e-06, + "loss": 0.3085, + "step": 858 + }, + { + "epoch": 0.08360097323600973, + "grad_norm": 1.5064665959171673, + "learning_rate": 9.925039029410807e-06, + "loss": 0.4445, + "step": 859 + }, + { + "epoch": 0.08369829683698297, + "grad_norm": 1.725614330530946, + "learning_rate": 9.924766882525433e-06, + "loss": 0.4704, + "step": 860 + }, + { + "epoch": 0.0837956204379562, + "grad_norm": 1.765372064078189, + "learning_rate": 9.924494246262944e-06, + "loss": 0.6383, + "step": 861 + }, + { + "epoch": 0.08389294403892944, + "grad_norm": 2.085503007877936, + "learning_rate": 9.924221120650434e-06, + "loss": 0.296, + "step": 862 + }, + { + "epoch": 0.08399026763990268, + "grad_norm": 1.7898541160892734, + "learning_rate": 9.923947505715046e-06, + "loss": 0.5991, + "step": 863 + }, + { + "epoch": 0.0840875912408759, + "grad_norm": 1.6476104975968628, + "learning_rate": 9.923673401483968e-06, + "loss": 0.4734, + "step": 864 + }, + { + "epoch": 0.08418491484184915, + "grad_norm": 1.5502768976775265, + "learning_rate": 9.923398807984439e-06, + "loss": 0.2764, + "step": 865 + }, + { + "epoch": 0.08428223844282239, + "grad_norm": 1.2398437846135097, + "learning_rate": 9.923123725243744e-06, + "loss": 0.2705, + "step": 866 + }, + { + "epoch": 0.08437956204379563, + "grad_norm": 1.5290591078236662, + "learning_rate": 9.922848153289217e-06, + "loss": 0.4228, + "step": 867 + }, + { + "epoch": 0.08447688564476885, + "grad_norm": 1.134889947118225, + "learning_rate": 9.922572092148244e-06, + "loss": 0.2953, + "step": 868 + }, + { + "epoch": 0.08457420924574209, + "grad_norm": 1.6307620082274505, + "learning_rate": 9.922295541848257e-06, + "loss": 0.3363, + "step": 869 + }, + { + "epoch": 0.08467153284671533, + "grad_norm": 1.373015271795792, + "learning_rate": 9.922018502416736e-06, + "loss": 0.3593, + "step": 870 + }, + { + "epoch": 0.08476885644768857, + "grad_norm": 1.7500724096304088, + "learning_rate": 9.921740973881211e-06, + "loss": 0.5236, + "step": 871 + }, + { + "epoch": 0.0848661800486618, + "grad_norm": 1.6167507595463353, + "learning_rate": 9.92146295626926e-06, + "loss": 0.5138, + "step": 872 + }, + { + "epoch": 0.08496350364963504, + "grad_norm": 1.0398007401901226, + "learning_rate": 9.92118444960851e-06, + "loss": 0.295, + "step": 873 + }, + { + "epoch": 0.08506082725060828, + "grad_norm": 1.4140920056378707, + "learning_rate": 9.920905453926637e-06, + "loss": 0.4192, + "step": 874 + }, + { + "epoch": 0.0851581508515815, + "grad_norm": 1.8785238213855096, + "learning_rate": 9.920625969251365e-06, + "loss": 0.4228, + "step": 875 + }, + { + "epoch": 0.08525547445255474, + "grad_norm": 1.719991686268608, + "learning_rate": 9.920345995610465e-06, + "loss": 0.5026, + "step": 876 + }, + { + "epoch": 0.08535279805352798, + "grad_norm": 1.7112372148926476, + "learning_rate": 9.92006553303176e-06, + "loss": 0.3157, + "step": 877 + }, + { + "epoch": 0.08545012165450122, + "grad_norm": 2.5105720144829116, + "learning_rate": 9.919784581543117e-06, + "loss": 0.4777, + "step": 878 + }, + { + "epoch": 0.08554744525547445, + "grad_norm": 1.42848630379055, + "learning_rate": 9.919503141172458e-06, + "loss": 0.3998, + "step": 879 + }, + { + "epoch": 0.08564476885644769, + "grad_norm": 1.4246136626839867, + "learning_rate": 9.919221211947748e-06, + "loss": 0.4415, + "step": 880 + }, + { + "epoch": 0.08574209245742093, + "grad_norm": 1.939970471855472, + "learning_rate": 9.918938793897002e-06, + "loss": 0.5887, + "step": 881 + }, + { + "epoch": 0.08583941605839417, + "grad_norm": 1.5467402852284964, + "learning_rate": 9.918655887048285e-06, + "loss": 0.3726, + "step": 882 + }, + { + "epoch": 0.08593673965936739, + "grad_norm": 1.6261636529000345, + "learning_rate": 9.918372491429708e-06, + "loss": 0.3382, + "step": 883 + }, + { + "epoch": 0.08603406326034063, + "grad_norm": 1.4859289768748727, + "learning_rate": 9.918088607069434e-06, + "loss": 0.4837, + "step": 884 + }, + { + "epoch": 0.08613138686131387, + "grad_norm": 1.8534453271170916, + "learning_rate": 9.917804233995673e-06, + "loss": 0.5948, + "step": 885 + }, + { + "epoch": 0.08622871046228711, + "grad_norm": 1.3491809126204122, + "learning_rate": 9.917519372236684e-06, + "loss": 0.381, + "step": 886 + }, + { + "epoch": 0.08632603406326034, + "grad_norm": 1.4913268478302555, + "learning_rate": 9.91723402182077e-06, + "loss": 0.2872, + "step": 887 + }, + { + "epoch": 0.08642335766423358, + "grad_norm": 1.5345667515291348, + "learning_rate": 9.916948182776289e-06, + "loss": 0.4426, + "step": 888 + }, + { + "epoch": 0.08652068126520682, + "grad_norm": 1.9142340135608018, + "learning_rate": 9.916661855131646e-06, + "loss": 0.467, + "step": 889 + }, + { + "epoch": 0.08661800486618004, + "grad_norm": 1.7451883652681546, + "learning_rate": 9.916375038915291e-06, + "loss": 0.3579, + "step": 890 + }, + { + "epoch": 0.08671532846715328, + "grad_norm": 3.3675828599824618, + "learning_rate": 9.916087734155728e-06, + "loss": 0.3965, + "step": 891 + }, + { + "epoch": 0.08681265206812652, + "grad_norm": 1.6430989821947144, + "learning_rate": 9.915799940881504e-06, + "loss": 0.5089, + "step": 892 + }, + { + "epoch": 0.08690997566909976, + "grad_norm": 1.8434153107573372, + "learning_rate": 9.915511659121219e-06, + "loss": 0.6513, + "step": 893 + }, + { + "epoch": 0.08700729927007299, + "grad_norm": 1.7259560464984558, + "learning_rate": 9.91522288890352e-06, + "loss": 0.5963, + "step": 894 + }, + { + "epoch": 0.08710462287104623, + "grad_norm": 1.4417036209809253, + "learning_rate": 9.9149336302571e-06, + "loss": 0.4076, + "step": 895 + }, + { + "epoch": 0.08720194647201947, + "grad_norm": 1.4565626930182671, + "learning_rate": 9.914643883210704e-06, + "loss": 0.3548, + "step": 896 + }, + { + "epoch": 0.08729927007299271, + "grad_norm": 1.8286482885292266, + "learning_rate": 9.914353647793126e-06, + "loss": 0.5158, + "step": 897 + }, + { + "epoch": 0.08739659367396593, + "grad_norm": 1.573235746781315, + "learning_rate": 9.914062924033204e-06, + "loss": 0.4804, + "step": 898 + }, + { + "epoch": 0.08749391727493917, + "grad_norm": 1.7725042500734154, + "learning_rate": 9.91377171195983e-06, + "loss": 0.4037, + "step": 899 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 1.5572801757524644, + "learning_rate": 9.913480011601939e-06, + "loss": 0.2757, + "step": 900 + }, + { + "epoch": 0.08768856447688564, + "grad_norm": 1.690990088453521, + "learning_rate": 9.91318782298852e-06, + "loss": 0.624, + "step": 901 + }, + { + "epoch": 0.08778588807785888, + "grad_norm": 1.5797017595834213, + "learning_rate": 9.912895146148609e-06, + "loss": 0.418, + "step": 902 + }, + { + "epoch": 0.08788321167883212, + "grad_norm": 1.722754374021215, + "learning_rate": 9.912601981111287e-06, + "loss": 0.5991, + "step": 903 + }, + { + "epoch": 0.08798053527980536, + "grad_norm": 1.2395740583484196, + "learning_rate": 9.912308327905683e-06, + "loss": 0.3632, + "step": 904 + }, + { + "epoch": 0.08807785888077858, + "grad_norm": 1.8637568028899596, + "learning_rate": 9.912014186560985e-06, + "loss": 0.5766, + "step": 905 + }, + { + "epoch": 0.08817518248175182, + "grad_norm": 1.8489319991981024, + "learning_rate": 9.911719557106418e-06, + "loss": 0.6834, + "step": 906 + }, + { + "epoch": 0.08827250608272506, + "grad_norm": 1.6692858460733677, + "learning_rate": 9.911424439571258e-06, + "loss": 0.5067, + "step": 907 + }, + { + "epoch": 0.0883698296836983, + "grad_norm": 1.4727605888984552, + "learning_rate": 9.911128833984834e-06, + "loss": 0.3141, + "step": 908 + }, + { + "epoch": 0.08846715328467153, + "grad_norm": 1.644393806422472, + "learning_rate": 9.910832740376518e-06, + "loss": 0.4599, + "step": 909 + }, + { + "epoch": 0.08856447688564477, + "grad_norm": 1.730275300452632, + "learning_rate": 9.910536158775734e-06, + "loss": 0.3908, + "step": 910 + }, + { + "epoch": 0.08866180048661801, + "grad_norm": 1.7281903494262714, + "learning_rate": 9.910239089211955e-06, + "loss": 0.5919, + "step": 911 + }, + { + "epoch": 0.08875912408759123, + "grad_norm": 1.7234172913238917, + "learning_rate": 9.909941531714699e-06, + "loss": 0.609, + "step": 912 + }, + { + "epoch": 0.08885644768856447, + "grad_norm": 1.4594702058569258, + "learning_rate": 9.909643486313533e-06, + "loss": 0.4399, + "step": 913 + }, + { + "epoch": 0.08895377128953771, + "grad_norm": 1.4625782448468165, + "learning_rate": 9.90934495303808e-06, + "loss": 0.4011, + "step": 914 + }, + { + "epoch": 0.08905109489051095, + "grad_norm": 1.7262645481609784, + "learning_rate": 9.909045931918e-06, + "loss": 0.4992, + "step": 915 + }, + { + "epoch": 0.08914841849148418, + "grad_norm": 1.6255222361700263, + "learning_rate": 9.908746422983007e-06, + "loss": 0.4909, + "step": 916 + }, + { + "epoch": 0.08924574209245742, + "grad_norm": 1.7512982185254946, + "learning_rate": 9.908446426262865e-06, + "loss": 0.5527, + "step": 917 + }, + { + "epoch": 0.08934306569343066, + "grad_norm": 1.617605772613541, + "learning_rate": 9.908145941787386e-06, + "loss": 0.3228, + "step": 918 + }, + { + "epoch": 0.0894403892944039, + "grad_norm": 1.489706963519404, + "learning_rate": 9.907844969586427e-06, + "loss": 0.4838, + "step": 919 + }, + { + "epoch": 0.08953771289537713, + "grad_norm": 1.193837371345013, + "learning_rate": 9.907543509689896e-06, + "loss": 0.284, + "step": 920 + }, + { + "epoch": 0.08963503649635036, + "grad_norm": 1.5855787651349198, + "learning_rate": 9.907241562127752e-06, + "loss": 0.4641, + "step": 921 + }, + { + "epoch": 0.0897323600973236, + "grad_norm": 1.2401284480478103, + "learning_rate": 9.906939126929998e-06, + "loss": 0.246, + "step": 922 + }, + { + "epoch": 0.08982968369829683, + "grad_norm": 1.503842201355298, + "learning_rate": 9.906636204126685e-06, + "loss": 0.4031, + "step": 923 + }, + { + "epoch": 0.08992700729927007, + "grad_norm": 1.9138265658958267, + "learning_rate": 9.906332793747917e-06, + "loss": 0.587, + "step": 924 + }, + { + "epoch": 0.09002433090024331, + "grad_norm": 1.5381184892388742, + "learning_rate": 9.906028895823844e-06, + "loss": 0.4119, + "step": 925 + }, + { + "epoch": 0.09012165450121655, + "grad_norm": 1.5769181877690257, + "learning_rate": 9.905724510384664e-06, + "loss": 0.4071, + "step": 926 + }, + { + "epoch": 0.09021897810218978, + "grad_norm": 1.4644408625641083, + "learning_rate": 9.905419637460625e-06, + "loss": 0.3656, + "step": 927 + }, + { + "epoch": 0.09031630170316302, + "grad_norm": 2.043739071504731, + "learning_rate": 9.90511427708202e-06, + "loss": 0.6317, + "step": 928 + }, + { + "epoch": 0.09041362530413626, + "grad_norm": 1.8397228419915481, + "learning_rate": 9.904808429279195e-06, + "loss": 0.6656, + "step": 929 + }, + { + "epoch": 0.0905109489051095, + "grad_norm": 1.6689588837493128, + "learning_rate": 9.904502094082542e-06, + "loss": 0.4603, + "step": 930 + }, + { + "epoch": 0.09060827250608272, + "grad_norm": 1.7157610479724803, + "learning_rate": 9.9041952715225e-06, + "loss": 0.3566, + "step": 931 + }, + { + "epoch": 0.09070559610705596, + "grad_norm": 1.5797548847560638, + "learning_rate": 9.90388796162956e-06, + "loss": 0.527, + "step": 932 + }, + { + "epoch": 0.0908029197080292, + "grad_norm": 1.3861944362556795, + "learning_rate": 9.903580164434262e-06, + "loss": 0.3555, + "step": 933 + }, + { + "epoch": 0.09090024330900243, + "grad_norm": 1.4873043668950738, + "learning_rate": 9.903271879967185e-06, + "loss": 0.3606, + "step": 934 + }, + { + "epoch": 0.09099756690997567, + "grad_norm": 1.5471770637050817, + "learning_rate": 9.90296310825897e-06, + "loss": 0.5407, + "step": 935 + }, + { + "epoch": 0.0910948905109489, + "grad_norm": 1.7410898214633266, + "learning_rate": 9.902653849340296e-06, + "loss": 0.5604, + "step": 936 + }, + { + "epoch": 0.09119221411192215, + "grad_norm": 1.490257412993615, + "learning_rate": 9.902344103241897e-06, + "loss": 0.4293, + "step": 937 + }, + { + "epoch": 0.09128953771289537, + "grad_norm": 1.3076716120407041, + "learning_rate": 9.90203386999455e-06, + "loss": 0.4311, + "step": 938 + }, + { + "epoch": 0.09138686131386861, + "grad_norm": 1.63883307554104, + "learning_rate": 9.901723149629085e-06, + "loss": 0.5026, + "step": 939 + }, + { + "epoch": 0.09148418491484185, + "grad_norm": 1.460694807977355, + "learning_rate": 9.901411942176377e-06, + "loss": 0.4449, + "step": 940 + }, + { + "epoch": 0.09158150851581509, + "grad_norm": 1.631318499416747, + "learning_rate": 9.901100247667352e-06, + "loss": 0.4762, + "step": 941 + }, + { + "epoch": 0.09167883211678832, + "grad_norm": 1.472942456024595, + "learning_rate": 9.900788066132982e-06, + "loss": 0.4208, + "step": 942 + }, + { + "epoch": 0.09177615571776156, + "grad_norm": 1.9471723252943203, + "learning_rate": 9.900475397604292e-06, + "loss": 0.4887, + "step": 943 + }, + { + "epoch": 0.0918734793187348, + "grad_norm": 1.4192635165617975, + "learning_rate": 9.900162242112348e-06, + "loss": 0.4753, + "step": 944 + }, + { + "epoch": 0.09197080291970802, + "grad_norm": 1.7864248496903834, + "learning_rate": 9.89984859968827e-06, + "loss": 0.6063, + "step": 945 + }, + { + "epoch": 0.09206812652068126, + "grad_norm": 1.402919088092856, + "learning_rate": 9.899534470363225e-06, + "loss": 0.3561, + "step": 946 + }, + { + "epoch": 0.0921654501216545, + "grad_norm": 1.15011785152118, + "learning_rate": 9.89921985416843e-06, + "loss": 0.2605, + "step": 947 + }, + { + "epoch": 0.09226277372262774, + "grad_norm": 1.2940536511249239, + "learning_rate": 9.898904751135145e-06, + "loss": 0.2503, + "step": 948 + }, + { + "epoch": 0.09236009732360097, + "grad_norm": 1.5093308152075566, + "learning_rate": 9.898589161294684e-06, + "loss": 0.4185, + "step": 949 + }, + { + "epoch": 0.09245742092457421, + "grad_norm": 1.5826010349075055, + "learning_rate": 9.898273084678406e-06, + "loss": 0.536, + "step": 950 + }, + { + "epoch": 0.09255474452554745, + "grad_norm": 1.5672518381317015, + "learning_rate": 9.897956521317724e-06, + "loss": 0.5068, + "step": 951 + }, + { + "epoch": 0.09265206812652069, + "grad_norm": 1.784767292144658, + "learning_rate": 9.89763947124409e-06, + "loss": 0.6601, + "step": 952 + }, + { + "epoch": 0.09274939172749391, + "grad_norm": 1.620681747107968, + "learning_rate": 9.897321934489011e-06, + "loss": 0.5402, + "step": 953 + }, + { + "epoch": 0.09284671532846715, + "grad_norm": 1.7479722673062432, + "learning_rate": 9.897003911084042e-06, + "loss": 0.6593, + "step": 954 + }, + { + "epoch": 0.09294403892944039, + "grad_norm": 1.6618363798373263, + "learning_rate": 9.896685401060783e-06, + "loss": 0.6086, + "step": 955 + }, + { + "epoch": 0.09304136253041363, + "grad_norm": 1.3782603882872615, + "learning_rate": 9.896366404450888e-06, + "loss": 0.3431, + "step": 956 + }, + { + "epoch": 0.09313868613138686, + "grad_norm": 1.6607836446620106, + "learning_rate": 9.896046921286053e-06, + "loss": 0.4015, + "step": 957 + }, + { + "epoch": 0.0932360097323601, + "grad_norm": 1.372535143543006, + "learning_rate": 9.895726951598026e-06, + "loss": 0.3627, + "step": 958 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 1.965175835699204, + "learning_rate": 9.895406495418602e-06, + "loss": 0.434, + "step": 959 + }, + { + "epoch": 0.09343065693430656, + "grad_norm": 1.6072227382486934, + "learning_rate": 9.895085552779626e-06, + "loss": 0.3666, + "step": 960 + }, + { + "epoch": 0.0935279805352798, + "grad_norm": 1.8680414138630521, + "learning_rate": 9.894764123712991e-06, + "loss": 0.6182, + "step": 961 + }, + { + "epoch": 0.09362530413625304, + "grad_norm": 1.7249394724081422, + "learning_rate": 9.894442208250636e-06, + "loss": 0.569, + "step": 962 + }, + { + "epoch": 0.09372262773722628, + "grad_norm": 1.7887658285510963, + "learning_rate": 9.894119806424549e-06, + "loss": 0.4825, + "step": 963 + }, + { + "epoch": 0.09381995133819951, + "grad_norm": 1.4470695743772581, + "learning_rate": 9.89379691826677e-06, + "loss": 0.4036, + "step": 964 + }, + { + "epoch": 0.09391727493917275, + "grad_norm": 1.739037372856574, + "learning_rate": 9.893473543809383e-06, + "loss": 0.3734, + "step": 965 + }, + { + "epoch": 0.09401459854014599, + "grad_norm": 1.2401623802615098, + "learning_rate": 9.893149683084522e-06, + "loss": 0.2892, + "step": 966 + }, + { + "epoch": 0.09411192214111923, + "grad_norm": 1.632367817316159, + "learning_rate": 9.892825336124369e-06, + "loss": 0.3324, + "step": 967 + }, + { + "epoch": 0.09420924574209245, + "grad_norm": 1.4553279790204596, + "learning_rate": 9.892500502961156e-06, + "loss": 0.4518, + "step": 968 + }, + { + "epoch": 0.0943065693430657, + "grad_norm": 2.0184949211791867, + "learning_rate": 9.892175183627161e-06, + "loss": 0.496, + "step": 969 + }, + { + "epoch": 0.09440389294403893, + "grad_norm": 1.3847811204395728, + "learning_rate": 9.89184937815471e-06, + "loss": 0.3908, + "step": 970 + }, + { + "epoch": 0.09450121654501216, + "grad_norm": 1.7325451795183482, + "learning_rate": 9.89152308657618e-06, + "loss": 0.5813, + "step": 971 + }, + { + "epoch": 0.0945985401459854, + "grad_norm": 1.3485480854398895, + "learning_rate": 9.891196308923994e-06, + "loss": 0.2773, + "step": 972 + }, + { + "epoch": 0.09469586374695864, + "grad_norm": 1.6137214411092917, + "learning_rate": 9.890869045230625e-06, + "loss": 0.573, + "step": 973 + }, + { + "epoch": 0.09479318734793188, + "grad_norm": 1.8098732560393935, + "learning_rate": 9.890541295528593e-06, + "loss": 0.5765, + "step": 974 + }, + { + "epoch": 0.0948905109489051, + "grad_norm": 1.7169741386061155, + "learning_rate": 9.890213059850467e-06, + "loss": 0.5463, + "step": 975 + }, + { + "epoch": 0.09498783454987834, + "grad_norm": 1.6226425677233698, + "learning_rate": 9.889884338228861e-06, + "loss": 0.459, + "step": 976 + }, + { + "epoch": 0.09508515815085158, + "grad_norm": 1.5712338302132318, + "learning_rate": 9.889555130696445e-06, + "loss": 0.2926, + "step": 977 + }, + { + "epoch": 0.09518248175182482, + "grad_norm": 2.368668096329164, + "learning_rate": 9.88922543728593e-06, + "loss": 0.4602, + "step": 978 + }, + { + "epoch": 0.09527980535279805, + "grad_norm": 1.5481463515619227, + "learning_rate": 9.888895258030077e-06, + "loss": 0.382, + "step": 979 + }, + { + "epoch": 0.09537712895377129, + "grad_norm": 1.5566394762827083, + "learning_rate": 9.888564592961698e-06, + "loss": 0.4432, + "step": 980 + }, + { + "epoch": 0.09547445255474453, + "grad_norm": 1.2929219586068095, + "learning_rate": 9.888233442113651e-06, + "loss": 0.2986, + "step": 981 + }, + { + "epoch": 0.09557177615571776, + "grad_norm": 1.7926346211976876, + "learning_rate": 9.887901805518841e-06, + "loss": 0.4536, + "step": 982 + }, + { + "epoch": 0.095669099756691, + "grad_norm": 1.5810862037952855, + "learning_rate": 9.887569683210225e-06, + "loss": 0.5143, + "step": 983 + }, + { + "epoch": 0.09576642335766423, + "grad_norm": 1.486412737689962, + "learning_rate": 9.887237075220805e-06, + "loss": 0.4422, + "step": 984 + }, + { + "epoch": 0.09586374695863747, + "grad_norm": 1.5634292890846626, + "learning_rate": 9.886903981583633e-06, + "loss": 0.5158, + "step": 985 + }, + { + "epoch": 0.0959610705596107, + "grad_norm": 1.4911106877832496, + "learning_rate": 9.88657040233181e-06, + "loss": 0.3584, + "step": 986 + }, + { + "epoch": 0.09605839416058394, + "grad_norm": 1.8920202230134835, + "learning_rate": 9.886236337498481e-06, + "loss": 0.7059, + "step": 987 + }, + { + "epoch": 0.09615571776155718, + "grad_norm": 1.9765830057761664, + "learning_rate": 9.885901787116844e-06, + "loss": 0.3363, + "step": 988 + }, + { + "epoch": 0.09625304136253042, + "grad_norm": 1.7412713212065478, + "learning_rate": 9.885566751220144e-06, + "loss": 0.6238, + "step": 989 + }, + { + "epoch": 0.09635036496350365, + "grad_norm": 1.4558500764026314, + "learning_rate": 9.885231229841675e-06, + "loss": 0.5033, + "step": 990 + }, + { + "epoch": 0.09644768856447689, + "grad_norm": 1.5722863237428275, + "learning_rate": 9.884895223014772e-06, + "loss": 0.3026, + "step": 991 + }, + { + "epoch": 0.09654501216545013, + "grad_norm": 1.7850396516814273, + "learning_rate": 9.88455873077283e-06, + "loss": 0.6797, + "step": 992 + }, + { + "epoch": 0.09664233576642335, + "grad_norm": 1.5907642595826164, + "learning_rate": 9.884221753149286e-06, + "loss": 0.5051, + "step": 993 + }, + { + "epoch": 0.09673965936739659, + "grad_norm": 1.383326117178851, + "learning_rate": 9.883884290177623e-06, + "loss": 0.394, + "step": 994 + }, + { + "epoch": 0.09683698296836983, + "grad_norm": 1.5330791836349085, + "learning_rate": 9.883546341891375e-06, + "loss": 0.4531, + "step": 995 + }, + { + "epoch": 0.09693430656934307, + "grad_norm": 1.3858453283442664, + "learning_rate": 9.883207908324126e-06, + "loss": 0.4674, + "step": 996 + }, + { + "epoch": 0.0970316301703163, + "grad_norm": 1.2633519423598012, + "learning_rate": 9.882868989509507e-06, + "loss": 0.3053, + "step": 997 + }, + { + "epoch": 0.09712895377128954, + "grad_norm": 1.5725755469000553, + "learning_rate": 9.882529585481194e-06, + "loss": 0.5382, + "step": 998 + }, + { + "epoch": 0.09722627737226278, + "grad_norm": 1.594807816051373, + "learning_rate": 9.882189696272916e-06, + "loss": 0.5027, + "step": 999 + }, + { + "epoch": 0.09732360097323602, + "grad_norm": 1.7855937930735857, + "learning_rate": 9.881849321918446e-06, + "loss": 0.6336, + "step": 1000 + }, + { + "epoch": 0.09742092457420924, + "grad_norm": 1.8161736452208326, + "learning_rate": 9.88150846245161e-06, + "loss": 0.5432, + "step": 1001 + }, + { + "epoch": 0.09751824817518248, + "grad_norm": 1.2323791206307224, + "learning_rate": 9.881167117906276e-06, + "loss": 0.3361, + "step": 1002 + }, + { + "epoch": 0.09761557177615572, + "grad_norm": 1.6720448345305876, + "learning_rate": 9.880825288316367e-06, + "loss": 0.3583, + "step": 1003 + }, + { + "epoch": 0.09771289537712895, + "grad_norm": 1.408364549926656, + "learning_rate": 9.880482973715846e-06, + "loss": 0.3847, + "step": 1004 + }, + { + "epoch": 0.09781021897810219, + "grad_norm": 1.493256031544701, + "learning_rate": 9.880140174138735e-06, + "loss": 0.3611, + "step": 1005 + }, + { + "epoch": 0.09790754257907543, + "grad_norm": 1.3658283125944337, + "learning_rate": 9.879796889619093e-06, + "loss": 0.3555, + "step": 1006 + }, + { + "epoch": 0.09800486618004867, + "grad_norm": 1.7346143127846696, + "learning_rate": 9.879453120191037e-06, + "loss": 0.5028, + "step": 1007 + }, + { + "epoch": 0.09810218978102189, + "grad_norm": 1.9094090784905724, + "learning_rate": 9.879108865888724e-06, + "loss": 0.4799, + "step": 1008 + }, + { + "epoch": 0.09819951338199513, + "grad_norm": 1.1235415223499565, + "learning_rate": 9.878764126746364e-06, + "loss": 0.2181, + "step": 1009 + }, + { + "epoch": 0.09829683698296837, + "grad_norm": 1.494557121918356, + "learning_rate": 9.878418902798215e-06, + "loss": 0.4548, + "step": 1010 + }, + { + "epoch": 0.09839416058394161, + "grad_norm": 1.5340021274706077, + "learning_rate": 9.87807319407858e-06, + "loss": 0.4952, + "step": 1011 + }, + { + "epoch": 0.09849148418491484, + "grad_norm": 1.2523545024978981, + "learning_rate": 9.877727000621815e-06, + "loss": 0.2887, + "step": 1012 + }, + { + "epoch": 0.09858880778588808, + "grad_norm": 1.424446798325285, + "learning_rate": 9.877380322462317e-06, + "loss": 0.3628, + "step": 1013 + }, + { + "epoch": 0.09868613138686132, + "grad_norm": 1.6382574528105933, + "learning_rate": 9.877033159634542e-06, + "loss": 0.5396, + "step": 1014 + }, + { + "epoch": 0.09878345498783454, + "grad_norm": 1.544256440771578, + "learning_rate": 9.876685512172982e-06, + "loss": 0.4031, + "step": 1015 + }, + { + "epoch": 0.09888077858880778, + "grad_norm": 1.620162733287423, + "learning_rate": 9.876337380112185e-06, + "loss": 0.4925, + "step": 1016 + }, + { + "epoch": 0.09897810218978102, + "grad_norm": 1.6140460771461889, + "learning_rate": 9.875988763486746e-06, + "loss": 0.5549, + "step": 1017 + }, + { + "epoch": 0.09907542579075426, + "grad_norm": 1.6187864498320685, + "learning_rate": 9.875639662331307e-06, + "loss": 0.5034, + "step": 1018 + }, + { + "epoch": 0.09917274939172749, + "grad_norm": 1.249422512171971, + "learning_rate": 9.875290076680557e-06, + "loss": 0.236, + "step": 1019 + }, + { + "epoch": 0.09927007299270073, + "grad_norm": 1.5835572971087337, + "learning_rate": 9.874940006569236e-06, + "loss": 0.5309, + "step": 1020 + }, + { + "epoch": 0.09936739659367397, + "grad_norm": 0.8658795502351594, + "learning_rate": 9.874589452032131e-06, + "loss": 0.1911, + "step": 1021 + }, + { + "epoch": 0.09946472019464721, + "grad_norm": 1.3171385587421753, + "learning_rate": 9.874238413104076e-06, + "loss": 0.3486, + "step": 1022 + }, + { + "epoch": 0.09956204379562043, + "grad_norm": 1.4498439375980756, + "learning_rate": 9.873886889819953e-06, + "loss": 0.1986, + "step": 1023 + }, + { + "epoch": 0.09965936739659367, + "grad_norm": 1.5991307847988792, + "learning_rate": 9.873534882214692e-06, + "loss": 0.6397, + "step": 1024 + }, + { + "epoch": 0.09975669099756691, + "grad_norm": 1.6135151765084201, + "learning_rate": 9.873182390323277e-06, + "loss": 0.4338, + "step": 1025 + }, + { + "epoch": 0.09985401459854015, + "grad_norm": 1.465261170994732, + "learning_rate": 9.872829414180733e-06, + "loss": 0.4692, + "step": 1026 + }, + { + "epoch": 0.09995133819951338, + "grad_norm": 1.6964068418559575, + "learning_rate": 9.872475953822134e-06, + "loss": 0.4763, + "step": 1027 + }, + { + "epoch": 0.10004866180048662, + "grad_norm": 1.5209137969308788, + "learning_rate": 9.872122009282604e-06, + "loss": 0.4266, + "step": 1028 + }, + { + "epoch": 0.10014598540145986, + "grad_norm": 1.4495568716439686, + "learning_rate": 9.871767580597316e-06, + "loss": 0.4087, + "step": 1029 + }, + { + "epoch": 0.10024330900243308, + "grad_norm": 1.344434785457905, + "learning_rate": 9.871412667801488e-06, + "loss": 0.3797, + "step": 1030 + }, + { + "epoch": 0.10034063260340632, + "grad_norm": 1.5794908259633444, + "learning_rate": 9.871057270930392e-06, + "loss": 0.3939, + "step": 1031 + }, + { + "epoch": 0.10043795620437956, + "grad_norm": 1.5876979734473795, + "learning_rate": 9.870701390019337e-06, + "loss": 0.484, + "step": 1032 + }, + { + "epoch": 0.1005352798053528, + "grad_norm": 1.8773231101994967, + "learning_rate": 9.870345025103694e-06, + "loss": 0.5893, + "step": 1033 + }, + { + "epoch": 0.10063260340632603, + "grad_norm": 1.4927383125242464, + "learning_rate": 9.869988176218871e-06, + "loss": 0.4138, + "step": 1034 + }, + { + "epoch": 0.10072992700729927, + "grad_norm": 1.4766306382054422, + "learning_rate": 9.869630843400331e-06, + "loss": 0.4125, + "step": 1035 + }, + { + "epoch": 0.10082725060827251, + "grad_norm": 2.1872385141217388, + "learning_rate": 9.86927302668358e-06, + "loss": 0.4581, + "step": 1036 + }, + { + "epoch": 0.10092457420924575, + "grad_norm": 1.4275090865666056, + "learning_rate": 9.868914726104174e-06, + "loss": 0.2393, + "step": 1037 + }, + { + "epoch": 0.10102189781021897, + "grad_norm": 1.6989614006808447, + "learning_rate": 9.868555941697721e-06, + "loss": 0.4941, + "step": 1038 + }, + { + "epoch": 0.10111922141119221, + "grad_norm": 1.4357333730365565, + "learning_rate": 9.86819667349987e-06, + "loss": 0.4907, + "step": 1039 + }, + { + "epoch": 0.10121654501216545, + "grad_norm": 2.0026376735495055, + "learning_rate": 9.867836921546326e-06, + "loss": 0.8695, + "step": 1040 + }, + { + "epoch": 0.10131386861313868, + "grad_norm": 1.6951372609783342, + "learning_rate": 9.867476685872833e-06, + "loss": 0.6236, + "step": 1041 + }, + { + "epoch": 0.10141119221411192, + "grad_norm": 1.6963236381946833, + "learning_rate": 9.86711596651519e-06, + "loss": 0.6358, + "step": 1042 + }, + { + "epoch": 0.10150851581508516, + "grad_norm": 1.5189733584329748, + "learning_rate": 9.866754763509242e-06, + "loss": 0.4374, + "step": 1043 + }, + { + "epoch": 0.1016058394160584, + "grad_norm": 1.2748045341406278, + "learning_rate": 9.866393076890881e-06, + "loss": 0.4213, + "step": 1044 + }, + { + "epoch": 0.10170316301703163, + "grad_norm": 1.7405552081322075, + "learning_rate": 9.866030906696051e-06, + "loss": 0.6708, + "step": 1045 + }, + { + "epoch": 0.10180048661800487, + "grad_norm": 1.3495682131815454, + "learning_rate": 9.865668252960737e-06, + "loss": 0.3531, + "step": 1046 + }, + { + "epoch": 0.1018978102189781, + "grad_norm": 1.5653185028552046, + "learning_rate": 9.86530511572098e-06, + "loss": 0.4331, + "step": 1047 + }, + { + "epoch": 0.10199513381995134, + "grad_norm": 1.3992858529840162, + "learning_rate": 9.864941495012861e-06, + "loss": 0.3388, + "step": 1048 + }, + { + "epoch": 0.10209245742092457, + "grad_norm": 1.6270586325333123, + "learning_rate": 9.864577390872516e-06, + "loss": 0.4234, + "step": 1049 + }, + { + "epoch": 0.10218978102189781, + "grad_norm": 1.8656971621974168, + "learning_rate": 9.864212803336126e-06, + "loss": 0.718, + "step": 1050 + }, + { + "epoch": 0.10228710462287105, + "grad_norm": 1.4029758909387644, + "learning_rate": 9.86384773243992e-06, + "loss": 0.3892, + "step": 1051 + }, + { + "epoch": 0.10238442822384428, + "grad_norm": 1.1023559958942302, + "learning_rate": 9.863482178220176e-06, + "loss": 0.2453, + "step": 1052 + }, + { + "epoch": 0.10248175182481752, + "grad_norm": 1.5775869982106272, + "learning_rate": 9.863116140713219e-06, + "loss": 0.5324, + "step": 1053 + }, + { + "epoch": 0.10257907542579076, + "grad_norm": 1.603675899324949, + "learning_rate": 9.86274961995542e-06, + "loss": 0.4521, + "step": 1054 + }, + { + "epoch": 0.102676399026764, + "grad_norm": 1.6020699046167006, + "learning_rate": 9.862382615983203e-06, + "loss": 0.4545, + "step": 1055 + }, + { + "epoch": 0.10277372262773722, + "grad_norm": 1.474718021659803, + "learning_rate": 9.862015128833036e-06, + "loss": 0.4822, + "step": 1056 + }, + { + "epoch": 0.10287104622871046, + "grad_norm": 1.6033514684549, + "learning_rate": 9.861647158541438e-06, + "loss": 0.5069, + "step": 1057 + }, + { + "epoch": 0.1029683698296837, + "grad_norm": 1.4841655382640788, + "learning_rate": 9.861278705144974e-06, + "loss": 0.3865, + "step": 1058 + }, + { + "epoch": 0.10306569343065694, + "grad_norm": 1.1425556408878823, + "learning_rate": 9.860909768680259e-06, + "loss": 0.2443, + "step": 1059 + }, + { + "epoch": 0.10316301703163017, + "grad_norm": 1.5288676978753954, + "learning_rate": 9.86054034918395e-06, + "loss": 0.3652, + "step": 1060 + }, + { + "epoch": 0.1032603406326034, + "grad_norm": 1.5264484093473076, + "learning_rate": 9.860170446692758e-06, + "loss": 0.3318, + "step": 1061 + }, + { + "epoch": 0.10335766423357665, + "grad_norm": 1.4476258605632986, + "learning_rate": 9.859800061243443e-06, + "loss": 0.4518, + "step": 1062 + }, + { + "epoch": 0.10345498783454987, + "grad_norm": 1.336933590040686, + "learning_rate": 9.859429192872809e-06, + "loss": 0.2652, + "step": 1063 + }, + { + "epoch": 0.10355231143552311, + "grad_norm": 1.6050187197155075, + "learning_rate": 9.859057841617709e-06, + "loss": 0.5383, + "step": 1064 + }, + { + "epoch": 0.10364963503649635, + "grad_norm": 1.3472405276196469, + "learning_rate": 9.858686007515045e-06, + "loss": 0.4483, + "step": 1065 + }, + { + "epoch": 0.10374695863746959, + "grad_norm": 1.4838970820374793, + "learning_rate": 9.858313690601767e-06, + "loss": 0.3506, + "step": 1066 + }, + { + "epoch": 0.10384428223844282, + "grad_norm": 1.5911831099601979, + "learning_rate": 9.857940890914868e-06, + "loss": 0.3995, + "step": 1067 + }, + { + "epoch": 0.10394160583941606, + "grad_norm": 1.415577451063168, + "learning_rate": 9.8575676084914e-06, + "loss": 0.4773, + "step": 1068 + }, + { + "epoch": 0.1040389294403893, + "grad_norm": 1.7250253730787564, + "learning_rate": 9.857193843368451e-06, + "loss": 0.4456, + "step": 1069 + }, + { + "epoch": 0.10413625304136254, + "grad_norm": 1.5066269873708278, + "learning_rate": 9.856819595583166e-06, + "loss": 0.5481, + "step": 1070 + }, + { + "epoch": 0.10423357664233576, + "grad_norm": 1.5626665408071483, + "learning_rate": 9.856444865172732e-06, + "loss": 0.5382, + "step": 1071 + }, + { + "epoch": 0.104330900243309, + "grad_norm": 1.9089561390061884, + "learning_rate": 9.856069652174385e-06, + "loss": 0.5533, + "step": 1072 + }, + { + "epoch": 0.10442822384428224, + "grad_norm": 1.2757688373398666, + "learning_rate": 9.855693956625414e-06, + "loss": 0.3065, + "step": 1073 + }, + { + "epoch": 0.10452554744525547, + "grad_norm": 1.7230598513214688, + "learning_rate": 9.85531777856315e-06, + "loss": 0.5367, + "step": 1074 + }, + { + "epoch": 0.10462287104622871, + "grad_norm": 1.8368494244508635, + "learning_rate": 9.854941118024973e-06, + "loss": 0.4587, + "step": 1075 + }, + { + "epoch": 0.10472019464720195, + "grad_norm": 1.418583003899538, + "learning_rate": 9.854563975048314e-06, + "loss": 0.405, + "step": 1076 + }, + { + "epoch": 0.10481751824817519, + "grad_norm": 1.555078045275604, + "learning_rate": 9.854186349670648e-06, + "loss": 0.5572, + "step": 1077 + }, + { + "epoch": 0.10491484184914841, + "grad_norm": 1.5414220083120458, + "learning_rate": 9.853808241929502e-06, + "loss": 0.3382, + "step": 1078 + }, + { + "epoch": 0.10501216545012165, + "grad_norm": 1.2895897451723073, + "learning_rate": 9.853429651862445e-06, + "loss": 0.4342, + "step": 1079 + }, + { + "epoch": 0.10510948905109489, + "grad_norm": 1.3117010773132232, + "learning_rate": 9.853050579507104e-06, + "loss": 0.3751, + "step": 1080 + }, + { + "epoch": 0.10520681265206813, + "grad_norm": 1.5440994948167002, + "learning_rate": 9.852671024901141e-06, + "loss": 0.4971, + "step": 1081 + }, + { + "epoch": 0.10530413625304136, + "grad_norm": 1.2028388141262132, + "learning_rate": 9.852290988082278e-06, + "loss": 0.3933, + "step": 1082 + }, + { + "epoch": 0.1054014598540146, + "grad_norm": 1.6199890049219825, + "learning_rate": 9.851910469088275e-06, + "loss": 0.5394, + "step": 1083 + }, + { + "epoch": 0.10549878345498784, + "grad_norm": 1.4805170620003079, + "learning_rate": 9.851529467956946e-06, + "loss": 0.2421, + "step": 1084 + }, + { + "epoch": 0.10559610705596106, + "grad_norm": 1.432802486072686, + "learning_rate": 9.851147984726154e-06, + "loss": 0.479, + "step": 1085 + }, + { + "epoch": 0.1056934306569343, + "grad_norm": 1.7662999036343905, + "learning_rate": 9.850766019433803e-06, + "loss": 0.706, + "step": 1086 + }, + { + "epoch": 0.10579075425790754, + "grad_norm": 1.9136497208168854, + "learning_rate": 9.850383572117853e-06, + "loss": 0.7672, + "step": 1087 + }, + { + "epoch": 0.10588807785888078, + "grad_norm": 1.1667281997438979, + "learning_rate": 9.850000642816306e-06, + "loss": 0.2263, + "step": 1088 + }, + { + "epoch": 0.10598540145985401, + "grad_norm": 1.3133144576431575, + "learning_rate": 9.849617231567213e-06, + "loss": 0.2211, + "step": 1089 + }, + { + "epoch": 0.10608272506082725, + "grad_norm": 1.411642205718121, + "learning_rate": 9.849233338408674e-06, + "loss": 0.4379, + "step": 1090 + }, + { + "epoch": 0.10618004866180049, + "grad_norm": 1.7114110143353651, + "learning_rate": 9.84884896337884e-06, + "loss": 0.462, + "step": 1091 + }, + { + "epoch": 0.10627737226277373, + "grad_norm": 1.4035875335457177, + "learning_rate": 9.848464106515903e-06, + "loss": 0.317, + "step": 1092 + }, + { + "epoch": 0.10637469586374695, + "grad_norm": 1.5988244446936477, + "learning_rate": 9.848078767858107e-06, + "loss": 0.5254, + "step": 1093 + }, + { + "epoch": 0.1064720194647202, + "grad_norm": 1.6336010940510732, + "learning_rate": 9.847692947443745e-06, + "loss": 0.4979, + "step": 1094 + }, + { + "epoch": 0.10656934306569343, + "grad_norm": 1.68747146017171, + "learning_rate": 9.847306645311154e-06, + "loss": 0.5515, + "step": 1095 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.497709273552353, + "learning_rate": 9.846919861498724e-06, + "loss": 0.4221, + "step": 1096 + }, + { + "epoch": 0.1067639902676399, + "grad_norm": 1.4761873606313476, + "learning_rate": 9.846532596044887e-06, + "loss": 0.4296, + "step": 1097 + }, + { + "epoch": 0.10686131386861314, + "grad_norm": 1.1441862868877024, + "learning_rate": 9.846144848988127e-06, + "loss": 0.2816, + "step": 1098 + }, + { + "epoch": 0.10695863746958638, + "grad_norm": 1.7272604657837642, + "learning_rate": 9.845756620366976e-06, + "loss": 0.5916, + "step": 1099 + }, + { + "epoch": 0.1070559610705596, + "grad_norm": 1.3799505412872324, + "learning_rate": 9.84536791022001e-06, + "loss": 0.3947, + "step": 1100 + }, + { + "epoch": 0.10715328467153284, + "grad_norm": 1.6943818099878132, + "learning_rate": 9.844978718585855e-06, + "loss": 0.4737, + "step": 1101 + }, + { + "epoch": 0.10725060827250608, + "grad_norm": 1.5405614688920448, + "learning_rate": 9.84458904550319e-06, + "loss": 0.4152, + "step": 1102 + }, + { + "epoch": 0.10734793187347932, + "grad_norm": 1.6335292867295117, + "learning_rate": 9.844198891010733e-06, + "loss": 0.5677, + "step": 1103 + }, + { + "epoch": 0.10744525547445255, + "grad_norm": 1.302603147379972, + "learning_rate": 9.843808255147253e-06, + "loss": 0.4283, + "step": 1104 + }, + { + "epoch": 0.10754257907542579, + "grad_norm": 1.7967506033919078, + "learning_rate": 9.84341713795157e-06, + "loss": 0.6995, + "step": 1105 + }, + { + "epoch": 0.10763990267639903, + "grad_norm": 1.7320527346822367, + "learning_rate": 9.84302553946255e-06, + "loss": 0.5369, + "step": 1106 + }, + { + "epoch": 0.10773722627737227, + "grad_norm": 1.2124746103676287, + "learning_rate": 9.842633459719104e-06, + "loss": 0.296, + "step": 1107 + }, + { + "epoch": 0.1078345498783455, + "grad_norm": 1.6638227119864655, + "learning_rate": 9.842240898760195e-06, + "loss": 0.5632, + "step": 1108 + }, + { + "epoch": 0.10793187347931874, + "grad_norm": 1.5728826792836543, + "learning_rate": 9.841847856624833e-06, + "loss": 0.3407, + "step": 1109 + }, + { + "epoch": 0.10802919708029197, + "grad_norm": 1.4855225795030034, + "learning_rate": 9.841454333352073e-06, + "loss": 0.534, + "step": 1110 + }, + { + "epoch": 0.1081265206812652, + "grad_norm": 1.741747608159628, + "learning_rate": 9.841060328981019e-06, + "loss": 0.5739, + "step": 1111 + }, + { + "epoch": 0.10822384428223844, + "grad_norm": 1.2765533148109443, + "learning_rate": 9.840665843550825e-06, + "loss": 0.335, + "step": 1112 + }, + { + "epoch": 0.10832116788321168, + "grad_norm": 1.9391527817309226, + "learning_rate": 9.840270877100692e-06, + "loss": 0.5604, + "step": 1113 + }, + { + "epoch": 0.10841849148418492, + "grad_norm": 1.2570937099076989, + "learning_rate": 9.839875429669865e-06, + "loss": 0.3098, + "step": 1114 + }, + { + "epoch": 0.10851581508515815, + "grad_norm": 1.6345857910998665, + "learning_rate": 9.839479501297643e-06, + "loss": 0.4665, + "step": 1115 + }, + { + "epoch": 0.10861313868613139, + "grad_norm": 2.1039943309751075, + "learning_rate": 9.839083092023368e-06, + "loss": 0.8597, + "step": 1116 + }, + { + "epoch": 0.10871046228710463, + "grad_norm": 1.634678554608885, + "learning_rate": 9.838686201886432e-06, + "loss": 0.4907, + "step": 1117 + }, + { + "epoch": 0.10880778588807787, + "grad_norm": 1.328229383966676, + "learning_rate": 9.838288830926274e-06, + "loss": 0.3255, + "step": 1118 + }, + { + "epoch": 0.10890510948905109, + "grad_norm": 1.3587359099021656, + "learning_rate": 9.837890979182381e-06, + "loss": 0.4224, + "step": 1119 + }, + { + "epoch": 0.10900243309002433, + "grad_norm": 1.6242900911620413, + "learning_rate": 9.837492646694287e-06, + "loss": 0.4338, + "step": 1120 + }, + { + "epoch": 0.10909975669099757, + "grad_norm": 1.5901048900387273, + "learning_rate": 9.837093833501576e-06, + "loss": 0.5168, + "step": 1121 + }, + { + "epoch": 0.1091970802919708, + "grad_norm": 1.34172908606168, + "learning_rate": 9.836694539643878e-06, + "loss": 0.3233, + "step": 1122 + }, + { + "epoch": 0.10929440389294404, + "grad_norm": 1.4724714330159256, + "learning_rate": 9.83629476516087e-06, + "loss": 0.3652, + "step": 1123 + }, + { + "epoch": 0.10939172749391728, + "grad_norm": 1.4884050773310515, + "learning_rate": 9.835894510092279e-06, + "loss": 0.4622, + "step": 1124 + }, + { + "epoch": 0.10948905109489052, + "grad_norm": 1.3181328020728609, + "learning_rate": 9.835493774477877e-06, + "loss": 0.4531, + "step": 1125 + }, + { + "epoch": 0.10958637469586374, + "grad_norm": 1.5414298966880746, + "learning_rate": 9.835092558357488e-06, + "loss": 0.3659, + "step": 1126 + }, + { + "epoch": 0.10968369829683698, + "grad_norm": 1.3248299507567909, + "learning_rate": 9.834690861770979e-06, + "loss": 0.3207, + "step": 1127 + }, + { + "epoch": 0.10978102189781022, + "grad_norm": 1.5527535683267375, + "learning_rate": 9.834288684758269e-06, + "loss": 0.4938, + "step": 1128 + }, + { + "epoch": 0.10987834549878346, + "grad_norm": 1.3342131255187983, + "learning_rate": 9.83388602735932e-06, + "loss": 0.4451, + "step": 1129 + }, + { + "epoch": 0.10997566909975669, + "grad_norm": 1.0500905202266426, + "learning_rate": 9.833482889614143e-06, + "loss": 0.2408, + "step": 1130 + }, + { + "epoch": 0.11007299270072993, + "grad_norm": 1.377353907486564, + "learning_rate": 9.833079271562802e-06, + "loss": 0.3945, + "step": 1131 + }, + { + "epoch": 0.11017031630170317, + "grad_norm": 1.5823324787969848, + "learning_rate": 9.832675173245404e-06, + "loss": 0.6066, + "step": 1132 + }, + { + "epoch": 0.11026763990267639, + "grad_norm": 1.7266167679625446, + "learning_rate": 9.832270594702102e-06, + "loss": 0.6417, + "step": 1133 + }, + { + "epoch": 0.11036496350364963, + "grad_norm": 1.4091165783577269, + "learning_rate": 9.831865535973103e-06, + "loss": 0.2661, + "step": 1134 + }, + { + "epoch": 0.11046228710462287, + "grad_norm": 0.9959339686876645, + "learning_rate": 9.831459997098654e-06, + "loss": 0.1744, + "step": 1135 + }, + { + "epoch": 0.11055961070559611, + "grad_norm": 1.4748243970921762, + "learning_rate": 9.831053978119056e-06, + "loss": 0.4011, + "step": 1136 + }, + { + "epoch": 0.11065693430656934, + "grad_norm": 1.5879686249629044, + "learning_rate": 9.830647479074656e-06, + "loss": 0.3021, + "step": 1137 + }, + { + "epoch": 0.11075425790754258, + "grad_norm": 1.5057704716227702, + "learning_rate": 9.830240500005845e-06, + "loss": 0.2962, + "step": 1138 + }, + { + "epoch": 0.11085158150851582, + "grad_norm": 1.7497051535586357, + "learning_rate": 9.829833040953068e-06, + "loss": 0.4717, + "step": 1139 + }, + { + "epoch": 0.11094890510948906, + "grad_norm": 1.7819946472609902, + "learning_rate": 9.829425101956812e-06, + "loss": 0.6113, + "step": 1140 + }, + { + "epoch": 0.11104622871046228, + "grad_norm": 1.7680522472506797, + "learning_rate": 9.829016683057615e-06, + "loss": 0.4672, + "step": 1141 + }, + { + "epoch": 0.11114355231143552, + "grad_norm": 1.8291787265156998, + "learning_rate": 9.828607784296063e-06, + "loss": 0.5148, + "step": 1142 + }, + { + "epoch": 0.11124087591240876, + "grad_norm": 1.4119536127948566, + "learning_rate": 9.828198405712788e-06, + "loss": 0.2698, + "step": 1143 + }, + { + "epoch": 0.11133819951338199, + "grad_norm": 1.67600232780131, + "learning_rate": 9.827788547348469e-06, + "loss": 0.4912, + "step": 1144 + }, + { + "epoch": 0.11143552311435523, + "grad_norm": 1.9367616538665617, + "learning_rate": 9.827378209243835e-06, + "loss": 0.3781, + "step": 1145 + }, + { + "epoch": 0.11153284671532847, + "grad_norm": 1.7032208896905794, + "learning_rate": 9.826967391439662e-06, + "loss": 0.5816, + "step": 1146 + }, + { + "epoch": 0.11163017031630171, + "grad_norm": 1.60872896431165, + "learning_rate": 9.826556093976769e-06, + "loss": 0.4654, + "step": 1147 + }, + { + "epoch": 0.11172749391727493, + "grad_norm": 1.5752275514466696, + "learning_rate": 9.826144316896033e-06, + "loss": 0.3177, + "step": 1148 + }, + { + "epoch": 0.11182481751824817, + "grad_norm": 1.8207599924827627, + "learning_rate": 9.82573206023837e-06, + "loss": 0.5701, + "step": 1149 + }, + { + "epoch": 0.11192214111922141, + "grad_norm": 1.5850279506541385, + "learning_rate": 9.825319324044745e-06, + "loss": 0.5616, + "step": 1150 + }, + { + "epoch": 0.11201946472019465, + "grad_norm": 1.360496233978723, + "learning_rate": 9.824906108356174e-06, + "loss": 0.3407, + "step": 1151 + }, + { + "epoch": 0.11211678832116788, + "grad_norm": 1.6595565610362801, + "learning_rate": 9.824492413213717e-06, + "loss": 0.6641, + "step": 1152 + }, + { + "epoch": 0.11221411192214112, + "grad_norm": 1.6031792644515102, + "learning_rate": 9.824078238658483e-06, + "loss": 0.4779, + "step": 1153 + }, + { + "epoch": 0.11231143552311436, + "grad_norm": 1.0762751645680708, + "learning_rate": 9.82366358473163e-06, + "loss": 0.2739, + "step": 1154 + }, + { + "epoch": 0.11240875912408758, + "grad_norm": 1.3660129842713564, + "learning_rate": 9.82324845147436e-06, + "loss": 0.5043, + "step": 1155 + }, + { + "epoch": 0.11250608272506082, + "grad_norm": 1.6273408315616833, + "learning_rate": 9.822832838927929e-06, + "loss": 0.6159, + "step": 1156 + }, + { + "epoch": 0.11260340632603406, + "grad_norm": 1.4216921342906768, + "learning_rate": 9.822416747133634e-06, + "loss": 0.4093, + "step": 1157 + }, + { + "epoch": 0.1127007299270073, + "grad_norm": 1.8899721642114575, + "learning_rate": 9.822000176132822e-06, + "loss": 0.5586, + "step": 1158 + }, + { + "epoch": 0.11279805352798053, + "grad_norm": 1.5144459966059345, + "learning_rate": 9.821583125966889e-06, + "loss": 0.3806, + "step": 1159 + }, + { + "epoch": 0.11289537712895377, + "grad_norm": 1.61041803725934, + "learning_rate": 9.821165596677278e-06, + "loss": 0.4064, + "step": 1160 + }, + { + "epoch": 0.11299270072992701, + "grad_norm": 1.5410637406837986, + "learning_rate": 9.820747588305477e-06, + "loss": 0.3526, + "step": 1161 + }, + { + "epoch": 0.11309002433090025, + "grad_norm": 1.5545393523360629, + "learning_rate": 9.820329100893026e-06, + "loss": 0.3834, + "step": 1162 + }, + { + "epoch": 0.11318734793187347, + "grad_norm": 1.6391567381322345, + "learning_rate": 9.819910134481508e-06, + "loss": 0.3849, + "step": 1163 + }, + { + "epoch": 0.11328467153284671, + "grad_norm": 1.5204183543600032, + "learning_rate": 9.819490689112559e-06, + "loss": 0.4712, + "step": 1164 + }, + { + "epoch": 0.11338199513381995, + "grad_norm": 1.5168954302933022, + "learning_rate": 9.819070764827858e-06, + "loss": 0.4662, + "step": 1165 + }, + { + "epoch": 0.1134793187347932, + "grad_norm": 1.4412304117107342, + "learning_rate": 9.818650361669133e-06, + "loss": 0.3515, + "step": 1166 + }, + { + "epoch": 0.11357664233576642, + "grad_norm": 1.5419710047923603, + "learning_rate": 9.81822947967816e-06, + "loss": 0.383, + "step": 1167 + }, + { + "epoch": 0.11367396593673966, + "grad_norm": 1.59211707141906, + "learning_rate": 9.817808118896759e-06, + "loss": 0.5101, + "step": 1168 + }, + { + "epoch": 0.1137712895377129, + "grad_norm": 1.9315831066859817, + "learning_rate": 9.817386279366808e-06, + "loss": 0.6179, + "step": 1169 + }, + { + "epoch": 0.11386861313868613, + "grad_norm": 1.3153157684002792, + "learning_rate": 9.816963961130218e-06, + "loss": 0.2382, + "step": 1170 + }, + { + "epoch": 0.11396593673965937, + "grad_norm": 1.3579619945410324, + "learning_rate": 9.81654116422896e-06, + "loss": 0.4424, + "step": 1171 + }, + { + "epoch": 0.1140632603406326, + "grad_norm": 1.479330223962703, + "learning_rate": 9.816117888705046e-06, + "loss": 0.3647, + "step": 1172 + }, + { + "epoch": 0.11416058394160584, + "grad_norm": 1.5031676224913018, + "learning_rate": 9.815694134600537e-06, + "loss": 0.3686, + "step": 1173 + }, + { + "epoch": 0.11425790754257907, + "grad_norm": 1.6106095254885215, + "learning_rate": 9.815269901957543e-06, + "loss": 0.5309, + "step": 1174 + }, + { + "epoch": 0.11435523114355231, + "grad_norm": 1.4367590943688036, + "learning_rate": 9.814845190818218e-06, + "loss": 0.3786, + "step": 1175 + }, + { + "epoch": 0.11445255474452555, + "grad_norm": 2.0513510648109636, + "learning_rate": 9.814420001224767e-06, + "loss": 0.8885, + "step": 1176 + }, + { + "epoch": 0.11454987834549879, + "grad_norm": 1.3799990465326748, + "learning_rate": 9.813994333219443e-06, + "loss": 0.3511, + "step": 1177 + }, + { + "epoch": 0.11464720194647202, + "grad_norm": 1.2354207015762353, + "learning_rate": 9.813568186844541e-06, + "loss": 0.3571, + "step": 1178 + }, + { + "epoch": 0.11474452554744526, + "grad_norm": 2.0501383618438678, + "learning_rate": 9.813141562142409e-06, + "loss": 0.4485, + "step": 1179 + }, + { + "epoch": 0.1148418491484185, + "grad_norm": 1.351584991091541, + "learning_rate": 9.812714459155444e-06, + "loss": 0.2894, + "step": 1180 + }, + { + "epoch": 0.11493917274939172, + "grad_norm": 1.3568994189032655, + "learning_rate": 9.812286877926085e-06, + "loss": 0.4016, + "step": 1181 + }, + { + "epoch": 0.11503649635036496, + "grad_norm": 1.4949546840268106, + "learning_rate": 9.81185881849682e-06, + "loss": 0.527, + "step": 1182 + }, + { + "epoch": 0.1151338199513382, + "grad_norm": 1.5053242129518953, + "learning_rate": 9.811430280910186e-06, + "loss": 0.4324, + "step": 1183 + }, + { + "epoch": 0.11523114355231144, + "grad_norm": 1.2995408017430223, + "learning_rate": 9.811001265208768e-06, + "loss": 0.4592, + "step": 1184 + }, + { + "epoch": 0.11532846715328467, + "grad_norm": 1.4103061247668216, + "learning_rate": 9.810571771435197e-06, + "loss": 0.4615, + "step": 1185 + }, + { + "epoch": 0.1154257907542579, + "grad_norm": 1.3694132099540144, + "learning_rate": 9.810141799632153e-06, + "loss": 0.4224, + "step": 1186 + }, + { + "epoch": 0.11552311435523115, + "grad_norm": 1.4494836775882813, + "learning_rate": 9.809711349842363e-06, + "loss": 0.4189, + "step": 1187 + }, + { + "epoch": 0.11562043795620439, + "grad_norm": 1.5100099037805617, + "learning_rate": 9.809280422108598e-06, + "loss": 0.495, + "step": 1188 + }, + { + "epoch": 0.11571776155717761, + "grad_norm": 1.449093301695385, + "learning_rate": 9.808849016473682e-06, + "loss": 0.345, + "step": 1189 + }, + { + "epoch": 0.11581508515815085, + "grad_norm": 1.501093862959825, + "learning_rate": 9.808417132980484e-06, + "loss": 0.4624, + "step": 1190 + }, + { + "epoch": 0.11591240875912409, + "grad_norm": 1.4567657310588336, + "learning_rate": 9.807984771671919e-06, + "loss": 0.2836, + "step": 1191 + }, + { + "epoch": 0.11600973236009732, + "grad_norm": 1.6666134190000732, + "learning_rate": 9.807551932590952e-06, + "loss": 0.3341, + "step": 1192 + }, + { + "epoch": 0.11610705596107056, + "grad_norm": 1.7534770482902293, + "learning_rate": 9.807118615780595e-06, + "loss": 0.6021, + "step": 1193 + }, + { + "epoch": 0.1162043795620438, + "grad_norm": 1.744738707996039, + "learning_rate": 9.806684821283908e-06, + "loss": 0.4593, + "step": 1194 + }, + { + "epoch": 0.11630170316301704, + "grad_norm": 1.7519974888996959, + "learning_rate": 9.806250549143994e-06, + "loss": 0.5433, + "step": 1195 + }, + { + "epoch": 0.11639902676399026, + "grad_norm": 1.6094009249182397, + "learning_rate": 9.805815799404008e-06, + "loss": 0.6053, + "step": 1196 + }, + { + "epoch": 0.1164963503649635, + "grad_norm": 1.4291146386614342, + "learning_rate": 9.805380572107153e-06, + "loss": 0.4377, + "step": 1197 + }, + { + "epoch": 0.11659367396593674, + "grad_norm": 1.6092739629047335, + "learning_rate": 9.804944867296678e-06, + "loss": 0.5708, + "step": 1198 + }, + { + "epoch": 0.11669099756690998, + "grad_norm": 1.3856208861087336, + "learning_rate": 9.804508685015876e-06, + "loss": 0.3677, + "step": 1199 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 1.52110832375871, + "learning_rate": 9.804072025308096e-06, + "loss": 0.3076, + "step": 1200 + }, + { + "epoch": 0.11688564476885645, + "grad_norm": 1.3072729020716074, + "learning_rate": 9.803634888216724e-06, + "loss": 0.2673, + "step": 1201 + }, + { + "epoch": 0.11698296836982969, + "grad_norm": 1.9045471339964295, + "learning_rate": 9.8031972737852e-06, + "loss": 0.7326, + "step": 1202 + }, + { + "epoch": 0.11708029197080291, + "grad_norm": 1.3351659498760804, + "learning_rate": 9.802759182057013e-06, + "loss": 0.4193, + "step": 1203 + }, + { + "epoch": 0.11717761557177615, + "grad_norm": 1.4664570380446003, + "learning_rate": 9.80232061307569e-06, + "loss": 0.358, + "step": 1204 + }, + { + "epoch": 0.11727493917274939, + "grad_norm": 1.1764722042212887, + "learning_rate": 9.80188156688482e-06, + "loss": 0.3093, + "step": 1205 + }, + { + "epoch": 0.11737226277372263, + "grad_norm": 1.5415184448059258, + "learning_rate": 9.801442043528026e-06, + "loss": 0.4667, + "step": 1206 + }, + { + "epoch": 0.11746958637469586, + "grad_norm": 1.4827166479100118, + "learning_rate": 9.801002043048984e-06, + "loss": 0.4876, + "step": 1207 + }, + { + "epoch": 0.1175669099756691, + "grad_norm": 1.6786553338713377, + "learning_rate": 9.80056156549142e-06, + "loss": 0.5076, + "step": 1208 + }, + { + "epoch": 0.11766423357664234, + "grad_norm": 1.2161366736688597, + "learning_rate": 9.8001206108991e-06, + "loss": 0.2247, + "step": 1209 + }, + { + "epoch": 0.11776155717761558, + "grad_norm": 1.4015628266094937, + "learning_rate": 9.799679179315846e-06, + "loss": 0.4327, + "step": 1210 + }, + { + "epoch": 0.1178588807785888, + "grad_norm": 1.5420255844625947, + "learning_rate": 9.799237270785522e-06, + "loss": 0.438, + "step": 1211 + }, + { + "epoch": 0.11795620437956204, + "grad_norm": 1.5978057716745744, + "learning_rate": 9.79879488535204e-06, + "loss": 0.4203, + "step": 1212 + }, + { + "epoch": 0.11805352798053528, + "grad_norm": 1.8973070083198396, + "learning_rate": 9.79835202305936e-06, + "loss": 0.7404, + "step": 1213 + }, + { + "epoch": 0.11815085158150851, + "grad_norm": 1.5331088091760856, + "learning_rate": 9.797908683951492e-06, + "loss": 0.5378, + "step": 1214 + }, + { + "epoch": 0.11824817518248175, + "grad_norm": 1.9627839775910105, + "learning_rate": 9.797464868072489e-06, + "loss": 0.6298, + "step": 1215 + }, + { + "epoch": 0.11834549878345499, + "grad_norm": 1.5059209630421948, + "learning_rate": 9.797020575466452e-06, + "loss": 0.4233, + "step": 1216 + }, + { + "epoch": 0.11844282238442823, + "grad_norm": 1.4714593450262028, + "learning_rate": 9.796575806177531e-06, + "loss": 0.4078, + "step": 1217 + }, + { + "epoch": 0.11854014598540145, + "grad_norm": 1.812199008547911, + "learning_rate": 9.796130560249926e-06, + "loss": 0.6636, + "step": 1218 + }, + { + "epoch": 0.1186374695863747, + "grad_norm": 1.330364448549248, + "learning_rate": 9.795684837727878e-06, + "loss": 0.2597, + "step": 1219 + }, + { + "epoch": 0.11873479318734793, + "grad_norm": 1.1642089342014024, + "learning_rate": 9.795238638655681e-06, + "loss": 0.2669, + "step": 1220 + }, + { + "epoch": 0.11883211678832117, + "grad_norm": 1.0578785975666756, + "learning_rate": 9.794791963077672e-06, + "loss": 0.2138, + "step": 1221 + }, + { + "epoch": 0.1189294403892944, + "grad_norm": 1.2810119779981208, + "learning_rate": 9.794344811038239e-06, + "loss": 0.3426, + "step": 1222 + }, + { + "epoch": 0.11902676399026764, + "grad_norm": 1.6109574325023976, + "learning_rate": 9.793897182581816e-06, + "loss": 0.4931, + "step": 1223 + }, + { + "epoch": 0.11912408759124088, + "grad_norm": 1.8314564663365431, + "learning_rate": 9.793449077752882e-06, + "loss": 0.5424, + "step": 1224 + }, + { + "epoch": 0.1192214111922141, + "grad_norm": 1.3266514401224994, + "learning_rate": 9.793000496595968e-06, + "loss": 0.3123, + "step": 1225 + }, + { + "epoch": 0.11931873479318734, + "grad_norm": 1.624792232435884, + "learning_rate": 9.792551439155649e-06, + "loss": 0.3635, + "step": 1226 + }, + { + "epoch": 0.11941605839416058, + "grad_norm": 1.306535519875853, + "learning_rate": 9.792101905476547e-06, + "loss": 0.3252, + "step": 1227 + }, + { + "epoch": 0.11951338199513382, + "grad_norm": 1.591218471169796, + "learning_rate": 9.791651895603333e-06, + "loss": 0.5493, + "step": 1228 + }, + { + "epoch": 0.11961070559610705, + "grad_norm": 1.8218114354346657, + "learning_rate": 9.791201409580725e-06, + "loss": 0.6988, + "step": 1229 + }, + { + "epoch": 0.11970802919708029, + "grad_norm": 1.7366783724272585, + "learning_rate": 9.790750447453487e-06, + "loss": 0.4285, + "step": 1230 + }, + { + "epoch": 0.11980535279805353, + "grad_norm": 1.9439764988659998, + "learning_rate": 9.790299009266434e-06, + "loss": 0.2787, + "step": 1231 + }, + { + "epoch": 0.11990267639902677, + "grad_norm": 1.4894849660267724, + "learning_rate": 9.789847095064425e-06, + "loss": 0.2531, + "step": 1232 + }, + { + "epoch": 0.12, + "grad_norm": 1.6270936536604101, + "learning_rate": 9.789394704892364e-06, + "loss": 0.5309, + "step": 1233 + }, + { + "epoch": 0.12009732360097324, + "grad_norm": 1.4144832764840753, + "learning_rate": 9.788941838795209e-06, + "loss": 0.298, + "step": 1234 + }, + { + "epoch": 0.12019464720194648, + "grad_norm": 1.546926786538444, + "learning_rate": 9.788488496817958e-06, + "loss": 0.4751, + "step": 1235 + }, + { + "epoch": 0.12029197080291971, + "grad_norm": 1.5827216255031866, + "learning_rate": 9.788034679005664e-06, + "loss": 0.4576, + "step": 1236 + }, + { + "epoch": 0.12038929440389294, + "grad_norm": 1.6103699210596951, + "learning_rate": 9.78758038540342e-06, + "loss": 0.4637, + "step": 1237 + }, + { + "epoch": 0.12048661800486618, + "grad_norm": 1.4918367462943103, + "learning_rate": 9.78712561605637e-06, + "loss": 0.4998, + "step": 1238 + }, + { + "epoch": 0.12058394160583942, + "grad_norm": 1.5775409788682337, + "learning_rate": 9.786670371009706e-06, + "loss": 0.4415, + "step": 1239 + }, + { + "epoch": 0.12068126520681265, + "grad_norm": 1.5427286854632911, + "learning_rate": 9.786214650308666e-06, + "loss": 0.4606, + "step": 1240 + }, + { + "epoch": 0.12077858880778589, + "grad_norm": 1.523821034203494, + "learning_rate": 9.78575845399853e-06, + "loss": 0.3918, + "step": 1241 + }, + { + "epoch": 0.12087591240875913, + "grad_norm": 1.950297391662121, + "learning_rate": 9.785301782124638e-06, + "loss": 0.5579, + "step": 1242 + }, + { + "epoch": 0.12097323600973237, + "grad_norm": 1.5957141815138678, + "learning_rate": 9.784844634732367e-06, + "loss": 0.3814, + "step": 1243 + }, + { + "epoch": 0.12107055961070559, + "grad_norm": 1.3924341327971197, + "learning_rate": 9.784387011867145e-06, + "loss": 0.3576, + "step": 1244 + }, + { + "epoch": 0.12116788321167883, + "grad_norm": 1.670661057733516, + "learning_rate": 9.783928913574442e-06, + "loss": 0.5307, + "step": 1245 + }, + { + "epoch": 0.12126520681265207, + "grad_norm": 1.9162789104592521, + "learning_rate": 9.783470339899783e-06, + "loss": 0.2309, + "step": 1246 + }, + { + "epoch": 0.12136253041362531, + "grad_norm": 1.4323883393925967, + "learning_rate": 9.783011290888737e-06, + "loss": 0.4816, + "step": 1247 + }, + { + "epoch": 0.12145985401459854, + "grad_norm": 1.133557304990043, + "learning_rate": 9.78255176658692e-06, + "loss": 0.259, + "step": 1248 + }, + { + "epoch": 0.12155717761557178, + "grad_norm": 1.6381613262272003, + "learning_rate": 9.782091767039992e-06, + "loss": 0.535, + "step": 1249 + }, + { + "epoch": 0.12165450121654502, + "grad_norm": 1.521879132713644, + "learning_rate": 9.781631292293668e-06, + "loss": 0.5299, + "step": 1250 + }, + { + "epoch": 0.12175182481751824, + "grad_norm": 1.2965362290198492, + "learning_rate": 9.781170342393702e-06, + "loss": 0.4161, + "step": 1251 + }, + { + "epoch": 0.12184914841849148, + "grad_norm": 1.4753461399295356, + "learning_rate": 9.780708917385901e-06, + "loss": 0.5379, + "step": 1252 + }, + { + "epoch": 0.12194647201946472, + "grad_norm": 0.9509628974965367, + "learning_rate": 9.780247017316115e-06, + "loss": 0.2681, + "step": 1253 + }, + { + "epoch": 0.12204379562043796, + "grad_norm": 1.3308735848114122, + "learning_rate": 9.779784642230246e-06, + "loss": 0.4247, + "step": 1254 + }, + { + "epoch": 0.12214111922141119, + "grad_norm": 1.1206835484781008, + "learning_rate": 9.779321792174239e-06, + "loss": 0.2301, + "step": 1255 + }, + { + "epoch": 0.12223844282238443, + "grad_norm": 1.2598096263209464, + "learning_rate": 9.778858467194087e-06, + "loss": 0.3163, + "step": 1256 + }, + { + "epoch": 0.12233576642335767, + "grad_norm": 1.4871998460052394, + "learning_rate": 9.778394667335834e-06, + "loss": 0.3433, + "step": 1257 + }, + { + "epoch": 0.1224330900243309, + "grad_norm": 1.384245738588718, + "learning_rate": 9.777930392645565e-06, + "loss": 0.2111, + "step": 1258 + }, + { + "epoch": 0.12253041362530413, + "grad_norm": 1.4369061113982475, + "learning_rate": 9.777465643169417e-06, + "loss": 0.3895, + "step": 1259 + }, + { + "epoch": 0.12262773722627737, + "grad_norm": 1.8558638944994366, + "learning_rate": 9.777000418953568e-06, + "loss": 0.3388, + "step": 1260 + }, + { + "epoch": 0.12272506082725061, + "grad_norm": 1.512984108492842, + "learning_rate": 9.776534720044255e-06, + "loss": 0.4726, + "step": 1261 + }, + { + "epoch": 0.12282238442822384, + "grad_norm": 1.367540412040702, + "learning_rate": 9.77606854648775e-06, + "loss": 0.2684, + "step": 1262 + }, + { + "epoch": 0.12291970802919708, + "grad_norm": 1.2042550068870583, + "learning_rate": 9.775601898330377e-06, + "loss": 0.2173, + "step": 1263 + }, + { + "epoch": 0.12301703163017032, + "grad_norm": 1.5842484372844456, + "learning_rate": 9.775134775618509e-06, + "loss": 0.5608, + "step": 1264 + }, + { + "epoch": 0.12311435523114356, + "grad_norm": 1.397447971201202, + "learning_rate": 9.774667178398562e-06, + "loss": 0.4632, + "step": 1265 + }, + { + "epoch": 0.12321167883211678, + "grad_norm": 1.3468996882112099, + "learning_rate": 9.774199106717004e-06, + "loss": 0.3697, + "step": 1266 + }, + { + "epoch": 0.12330900243309002, + "grad_norm": 1.252677053550249, + "learning_rate": 9.773730560620345e-06, + "loss": 0.2377, + "step": 1267 + }, + { + "epoch": 0.12340632603406326, + "grad_norm": 1.4179546260918483, + "learning_rate": 9.773261540155148e-06, + "loss": 0.4857, + "step": 1268 + }, + { + "epoch": 0.1235036496350365, + "grad_norm": 1.3092572252570605, + "learning_rate": 9.772792045368015e-06, + "loss": 0.2969, + "step": 1269 + }, + { + "epoch": 0.12360097323600973, + "grad_norm": 1.7901486760202572, + "learning_rate": 9.772322076305607e-06, + "loss": 0.6935, + "step": 1270 + }, + { + "epoch": 0.12369829683698297, + "grad_norm": 1.5982523135009328, + "learning_rate": 9.771851633014618e-06, + "loss": 0.4368, + "step": 1271 + }, + { + "epoch": 0.12379562043795621, + "grad_norm": 1.195950207110724, + "learning_rate": 9.7713807155418e-06, + "loss": 0.3202, + "step": 1272 + }, + { + "epoch": 0.12389294403892943, + "grad_norm": 1.352519407714817, + "learning_rate": 9.770909323933947e-06, + "loss": 0.4284, + "step": 1273 + }, + { + "epoch": 0.12399026763990267, + "grad_norm": 1.4231425912579843, + "learning_rate": 9.770437458237903e-06, + "loss": 0.434, + "step": 1274 + }, + { + "epoch": 0.12408759124087591, + "grad_norm": 1.2825234760121222, + "learning_rate": 9.769965118500555e-06, + "loss": 0.3817, + "step": 1275 + }, + { + "epoch": 0.12418491484184915, + "grad_norm": 1.8250797045299043, + "learning_rate": 9.769492304768843e-06, + "loss": 0.7366, + "step": 1276 + }, + { + "epoch": 0.12428223844282238, + "grad_norm": 1.3974167065714918, + "learning_rate": 9.769019017089748e-06, + "loss": 0.2804, + "step": 1277 + }, + { + "epoch": 0.12437956204379562, + "grad_norm": 1.2933017267383033, + "learning_rate": 9.768545255510302e-06, + "loss": 0.3495, + "step": 1278 + }, + { + "epoch": 0.12447688564476886, + "grad_norm": 1.2423501538003798, + "learning_rate": 9.768071020077584e-06, + "loss": 0.2908, + "step": 1279 + }, + { + "epoch": 0.1245742092457421, + "grad_norm": 1.8228975858143868, + "learning_rate": 9.767596310838718e-06, + "loss": 0.4222, + "step": 1280 + }, + { + "epoch": 0.12467153284671532, + "grad_norm": 1.5510872411682606, + "learning_rate": 9.767121127840874e-06, + "loss": 0.5058, + "step": 1281 + }, + { + "epoch": 0.12476885644768856, + "grad_norm": 1.6665778692750302, + "learning_rate": 9.766645471131278e-06, + "loss": 0.3592, + "step": 1282 + }, + { + "epoch": 0.1248661800486618, + "grad_norm": 1.5396481092124317, + "learning_rate": 9.766169340757187e-06, + "loss": 0.2737, + "step": 1283 + }, + { + "epoch": 0.12496350364963503, + "grad_norm": 1.5555229817491858, + "learning_rate": 9.765692736765922e-06, + "loss": 0.5466, + "step": 1284 + }, + { + "epoch": 0.12506082725060827, + "grad_norm": 1.5351601326386175, + "learning_rate": 9.765215659204838e-06, + "loss": 0.4733, + "step": 1285 + }, + { + "epoch": 0.1251581508515815, + "grad_norm": 1.2793773363741519, + "learning_rate": 9.764738108121347e-06, + "loss": 0.3056, + "step": 1286 + }, + { + "epoch": 0.12525547445255475, + "grad_norm": 1.6331577939793205, + "learning_rate": 9.764260083562902e-06, + "loss": 0.5883, + "step": 1287 + }, + { + "epoch": 0.12535279805352798, + "grad_norm": 1.3363728845544067, + "learning_rate": 9.763781585577003e-06, + "loss": 0.2904, + "step": 1288 + }, + { + "epoch": 0.12545012165450123, + "grad_norm": 1.360818732035961, + "learning_rate": 9.763302614211199e-06, + "loss": 0.4202, + "step": 1289 + }, + { + "epoch": 0.12554744525547445, + "grad_norm": 1.3103877737057137, + "learning_rate": 9.762823169513089e-06, + "loss": 0.4694, + "step": 1290 + }, + { + "epoch": 0.12564476885644768, + "grad_norm": 1.1848446118808063, + "learning_rate": 9.76234325153031e-06, + "loss": 0.2265, + "step": 1291 + }, + { + "epoch": 0.12574209245742093, + "grad_norm": 1.3494947194310234, + "learning_rate": 9.761862860310558e-06, + "loss": 0.2382, + "step": 1292 + }, + { + "epoch": 0.12583941605839416, + "grad_norm": 1.7062717031139596, + "learning_rate": 9.761381995901564e-06, + "loss": 0.7254, + "step": 1293 + }, + { + "epoch": 0.12593673965936739, + "grad_norm": 1.208337515242783, + "learning_rate": 9.760900658351117e-06, + "loss": 0.326, + "step": 1294 + }, + { + "epoch": 0.12603406326034064, + "grad_norm": 1.3159841432369768, + "learning_rate": 9.760418847707043e-06, + "loss": 0.3438, + "step": 1295 + }, + { + "epoch": 0.12613138686131387, + "grad_norm": 1.3809255020300633, + "learning_rate": 9.759936564017223e-06, + "loss": 0.2716, + "step": 1296 + }, + { + "epoch": 0.1262287104622871, + "grad_norm": 1.3382917039666673, + "learning_rate": 9.759453807329582e-06, + "loss": 0.2882, + "step": 1297 + }, + { + "epoch": 0.12632603406326035, + "grad_norm": 1.3572918507167704, + "learning_rate": 9.75897057769209e-06, + "loss": 0.4181, + "step": 1298 + }, + { + "epoch": 0.12642335766423357, + "grad_norm": 1.4433440128897468, + "learning_rate": 9.758486875152766e-06, + "loss": 0.4883, + "step": 1299 + }, + { + "epoch": 0.12652068126520682, + "grad_norm": 1.1934091211117765, + "learning_rate": 9.758002699759677e-06, + "loss": 0.3828, + "step": 1300 + }, + { + "epoch": 0.12661800486618005, + "grad_norm": 1.4647925609545562, + "learning_rate": 9.757518051560935e-06, + "loss": 0.402, + "step": 1301 + }, + { + "epoch": 0.12671532846715328, + "grad_norm": 1.658517832372951, + "learning_rate": 9.7570329306047e-06, + "loss": 0.6752, + "step": 1302 + }, + { + "epoch": 0.12681265206812653, + "grad_norm": 1.2682494280043264, + "learning_rate": 9.75654733693918e-06, + "loss": 0.2786, + "step": 1303 + }, + { + "epoch": 0.12690997566909976, + "grad_norm": 1.3919267883395627, + "learning_rate": 9.756061270612625e-06, + "loss": 0.4806, + "step": 1304 + }, + { + "epoch": 0.12700729927007298, + "grad_norm": 1.160118847382142, + "learning_rate": 9.75557473167334e-06, + "loss": 0.2458, + "step": 1305 + }, + { + "epoch": 0.12710462287104624, + "grad_norm": 1.482640427472728, + "learning_rate": 9.755087720169672e-06, + "loss": 0.527, + "step": 1306 + }, + { + "epoch": 0.12720194647201946, + "grad_norm": 1.5068875178509769, + "learning_rate": 9.75460023615001e-06, + "loss": 0.4985, + "step": 1307 + }, + { + "epoch": 0.1272992700729927, + "grad_norm": 1.2878541774064265, + "learning_rate": 9.754112279662805e-06, + "loss": 0.3478, + "step": 1308 + }, + { + "epoch": 0.12739659367396594, + "grad_norm": 1.1398490461157162, + "learning_rate": 9.75362385075654e-06, + "loss": 0.3084, + "step": 1309 + }, + { + "epoch": 0.12749391727493917, + "grad_norm": 1.2924420070365765, + "learning_rate": 9.75313494947975e-06, + "loss": 0.3919, + "step": 1310 + }, + { + "epoch": 0.12759124087591242, + "grad_norm": 1.4558696462945964, + "learning_rate": 9.752645575881018e-06, + "loss": 0.225, + "step": 1311 + }, + { + "epoch": 0.12768856447688565, + "grad_norm": 1.677251779693783, + "learning_rate": 9.752155730008974e-06, + "loss": 0.4831, + "step": 1312 + }, + { + "epoch": 0.12778588807785887, + "grad_norm": 1.3350720195417478, + "learning_rate": 9.751665411912294e-06, + "loss": 0.4371, + "step": 1313 + }, + { + "epoch": 0.12788321167883213, + "grad_norm": 1.3653039655289896, + "learning_rate": 9.751174621639702e-06, + "loss": 0.4051, + "step": 1314 + }, + { + "epoch": 0.12798053527980535, + "grad_norm": 2.0214110135389927, + "learning_rate": 9.75068335923997e-06, + "loss": 0.4971, + "step": 1315 + }, + { + "epoch": 0.12807785888077858, + "grad_norm": 1.7144522600221743, + "learning_rate": 9.750191624761909e-06, + "loss": 0.6353, + "step": 1316 + }, + { + "epoch": 0.12817518248175183, + "grad_norm": 1.61491787633751, + "learning_rate": 9.749699418254388e-06, + "loss": 0.5408, + "step": 1317 + }, + { + "epoch": 0.12827250608272506, + "grad_norm": 1.3029361322695596, + "learning_rate": 9.749206739766317e-06, + "loss": 0.407, + "step": 1318 + }, + { + "epoch": 0.12836982968369828, + "grad_norm": 1.2453189940624274, + "learning_rate": 9.748713589346652e-06, + "loss": 0.3254, + "step": 1319 + }, + { + "epoch": 0.12846715328467154, + "grad_norm": 1.4117795102544664, + "learning_rate": 9.748219967044398e-06, + "loss": 0.3941, + "step": 1320 + }, + { + "epoch": 0.12856447688564476, + "grad_norm": 1.4197813276706028, + "learning_rate": 9.74772587290861e-06, + "loss": 0.3454, + "step": 1321 + }, + { + "epoch": 0.12866180048661802, + "grad_norm": 1.3133599325252279, + "learning_rate": 9.747231306988381e-06, + "loss": 0.3389, + "step": 1322 + }, + { + "epoch": 0.12875912408759124, + "grad_norm": 1.3432229805022793, + "learning_rate": 9.746736269332861e-06, + "loss": 0.469, + "step": 1323 + }, + { + "epoch": 0.12885644768856447, + "grad_norm": 1.1244292400820686, + "learning_rate": 9.746240759991241e-06, + "loss": 0.3674, + "step": 1324 + }, + { + "epoch": 0.12895377128953772, + "grad_norm": 1.4966792860681473, + "learning_rate": 9.745744779012758e-06, + "loss": 0.4308, + "step": 1325 + }, + { + "epoch": 0.12905109489051095, + "grad_norm": 1.5238028846181695, + "learning_rate": 9.745248326446699e-06, + "loss": 0.4213, + "step": 1326 + }, + { + "epoch": 0.12914841849148417, + "grad_norm": 1.3633303920337936, + "learning_rate": 9.744751402342398e-06, + "loss": 0.438, + "step": 1327 + }, + { + "epoch": 0.12924574209245743, + "grad_norm": 1.3260493495785517, + "learning_rate": 9.744254006749235e-06, + "loss": 0.4762, + "step": 1328 + }, + { + "epoch": 0.12934306569343065, + "grad_norm": 1.705738477220435, + "learning_rate": 9.743756139716634e-06, + "loss": 0.5861, + "step": 1329 + }, + { + "epoch": 0.12944038929440388, + "grad_norm": 1.5829201544013396, + "learning_rate": 9.743257801294069e-06, + "loss": 0.469, + "step": 1330 + }, + { + "epoch": 0.12953771289537713, + "grad_norm": 1.1445128143179795, + "learning_rate": 9.74275899153106e-06, + "loss": 0.4018, + "step": 1331 + }, + { + "epoch": 0.12963503649635036, + "grad_norm": 1.2900129109113572, + "learning_rate": 9.742259710477178e-06, + "loss": 0.3802, + "step": 1332 + }, + { + "epoch": 0.1297323600973236, + "grad_norm": 1.3212461161488713, + "learning_rate": 9.74175995818203e-06, + "loss": 0.3725, + "step": 1333 + }, + { + "epoch": 0.12982968369829684, + "grad_norm": 1.3979706650986563, + "learning_rate": 9.741259734695283e-06, + "loss": 0.3961, + "step": 1334 + }, + { + "epoch": 0.12992700729927006, + "grad_norm": 1.2642819849441118, + "learning_rate": 9.740759040066642e-06, + "loss": 0.3528, + "step": 1335 + }, + { + "epoch": 0.13002433090024332, + "grad_norm": 1.7776493019463793, + "learning_rate": 9.74025787434586e-06, + "loss": 0.6424, + "step": 1336 + }, + { + "epoch": 0.13012165450121654, + "grad_norm": 1.1885806737857232, + "learning_rate": 9.73975623758274e-06, + "loss": 0.3163, + "step": 1337 + }, + { + "epoch": 0.13021897810218977, + "grad_norm": 1.7443954093720497, + "learning_rate": 9.739254129827131e-06, + "loss": 0.7263, + "step": 1338 + }, + { + "epoch": 0.13031630170316302, + "grad_norm": 1.7005058938305366, + "learning_rate": 9.738751551128924e-06, + "loss": 0.5204, + "step": 1339 + }, + { + "epoch": 0.13041362530413625, + "grad_norm": 6.598521165184121, + "learning_rate": 9.738248501538063e-06, + "loss": 0.5113, + "step": 1340 + }, + { + "epoch": 0.1305109489051095, + "grad_norm": 1.6203066466178853, + "learning_rate": 9.737744981104536e-06, + "loss": 0.625, + "step": 1341 + }, + { + "epoch": 0.13060827250608273, + "grad_norm": 1.548111392574701, + "learning_rate": 9.73724098987838e-06, + "loss": 0.3952, + "step": 1342 + }, + { + "epoch": 0.13070559610705595, + "grad_norm": 1.4871418112966692, + "learning_rate": 9.736736527909674e-06, + "loss": 0.5084, + "step": 1343 + }, + { + "epoch": 0.1308029197080292, + "grad_norm": 1.0723677900938815, + "learning_rate": 9.736231595248546e-06, + "loss": 0.255, + "step": 1344 + }, + { + "epoch": 0.13090024330900243, + "grad_norm": 1.5695490713137843, + "learning_rate": 9.735726191945176e-06, + "loss": 0.3438, + "step": 1345 + }, + { + "epoch": 0.13099756690997566, + "grad_norm": 2.059617079542521, + "learning_rate": 9.73522031804978e-06, + "loss": 0.5249, + "step": 1346 + }, + { + "epoch": 0.1310948905109489, + "grad_norm": 1.5301765260275246, + "learning_rate": 9.734713973612633e-06, + "loss": 0.3667, + "step": 1347 + }, + { + "epoch": 0.13119221411192214, + "grad_norm": 1.7431028553023509, + "learning_rate": 9.734207158684048e-06, + "loss": 0.5551, + "step": 1348 + }, + { + "epoch": 0.13128953771289537, + "grad_norm": 1.2916959738739295, + "learning_rate": 9.733699873314388e-06, + "loss": 0.278, + "step": 1349 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 1.5891072584842363, + "learning_rate": 9.733192117554062e-06, + "loss": 0.4139, + "step": 1350 + }, + { + "epoch": 0.13148418491484185, + "grad_norm": 1.6366778166029219, + "learning_rate": 9.732683891453528e-06, + "loss": 0.4888, + "step": 1351 + }, + { + "epoch": 0.1315815085158151, + "grad_norm": 1.6763551525158185, + "learning_rate": 9.732175195063283e-06, + "loss": 0.5432, + "step": 1352 + }, + { + "epoch": 0.13167883211678832, + "grad_norm": 1.551593865483807, + "learning_rate": 9.731666028433882e-06, + "loss": 0.5634, + "step": 1353 + }, + { + "epoch": 0.13177615571776155, + "grad_norm": 1.693219206573502, + "learning_rate": 9.731156391615919e-06, + "loss": 0.4554, + "step": 1354 + }, + { + "epoch": 0.1318734793187348, + "grad_norm": 1.4894832853139421, + "learning_rate": 9.730646284660037e-06, + "loss": 0.4286, + "step": 1355 + }, + { + "epoch": 0.13197080291970803, + "grad_norm": 1.20058966692396, + "learning_rate": 9.730135707616927e-06, + "loss": 0.2519, + "step": 1356 + }, + { + "epoch": 0.13206812652068126, + "grad_norm": 1.395115321325138, + "learning_rate": 9.729624660537324e-06, + "loss": 0.3718, + "step": 1357 + }, + { + "epoch": 0.1321654501216545, + "grad_norm": 1.3441869335850034, + "learning_rate": 9.729113143472011e-06, + "loss": 0.43, + "step": 1358 + }, + { + "epoch": 0.13226277372262774, + "grad_norm": 1.31865416445236, + "learning_rate": 9.72860115647182e-06, + "loss": 0.296, + "step": 1359 + }, + { + "epoch": 0.13236009732360096, + "grad_norm": 1.3998148863889133, + "learning_rate": 9.728088699587623e-06, + "loss": 0.2642, + "step": 1360 + }, + { + "epoch": 0.13245742092457422, + "grad_norm": 1.5917388343760925, + "learning_rate": 9.727575772870347e-06, + "loss": 0.5999, + "step": 1361 + }, + { + "epoch": 0.13255474452554744, + "grad_norm": 1.6062441992747731, + "learning_rate": 9.727062376370962e-06, + "loss": 0.6017, + "step": 1362 + }, + { + "epoch": 0.1326520681265207, + "grad_norm": 1.756325054261889, + "learning_rate": 9.72654851014048e-06, + "loss": 0.5855, + "step": 1363 + }, + { + "epoch": 0.13274939172749392, + "grad_norm": 1.5782112626775713, + "learning_rate": 9.72603417422997e-06, + "loss": 0.5643, + "step": 1364 + }, + { + "epoch": 0.13284671532846715, + "grad_norm": 1.6280008631148617, + "learning_rate": 9.725519368690539e-06, + "loss": 0.3918, + "step": 1365 + }, + { + "epoch": 0.1329440389294404, + "grad_norm": 1.731476294535625, + "learning_rate": 9.725004093573343e-06, + "loss": 0.6909, + "step": 1366 + }, + { + "epoch": 0.13304136253041363, + "grad_norm": 1.7012591859680217, + "learning_rate": 9.724488348929587e-06, + "loss": 0.3206, + "step": 1367 + }, + { + "epoch": 0.13313868613138685, + "grad_norm": 1.5539166250213363, + "learning_rate": 9.723972134810519e-06, + "loss": 0.3735, + "step": 1368 + }, + { + "epoch": 0.1332360097323601, + "grad_norm": 1.2431527472113675, + "learning_rate": 9.723455451267436e-06, + "loss": 0.4023, + "step": 1369 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.4147928785913308, + "learning_rate": 9.722938298351682e-06, + "loss": 0.4501, + "step": 1370 + }, + { + "epoch": 0.13343065693430656, + "grad_norm": 1.5209109752466956, + "learning_rate": 9.722420676114646e-06, + "loss": 0.4504, + "step": 1371 + }, + { + "epoch": 0.1335279805352798, + "grad_norm": 1.6031977794999224, + "learning_rate": 9.721902584607766e-06, + "loss": 0.4036, + "step": 1372 + }, + { + "epoch": 0.13362530413625304, + "grad_norm": 1.3752266957066934, + "learning_rate": 9.721384023882524e-06, + "loss": 0.4008, + "step": 1373 + }, + { + "epoch": 0.1337226277372263, + "grad_norm": 1.2289957585024915, + "learning_rate": 9.720864993990448e-06, + "loss": 0.3214, + "step": 1374 + }, + { + "epoch": 0.13381995133819952, + "grad_norm": 1.5334770200964671, + "learning_rate": 9.720345494983117e-06, + "loss": 0.4103, + "step": 1375 + }, + { + "epoch": 0.13391727493917274, + "grad_norm": 1.4428318489533865, + "learning_rate": 9.719825526912152e-06, + "loss": 0.4314, + "step": 1376 + }, + { + "epoch": 0.134014598540146, + "grad_norm": 1.6794168653476527, + "learning_rate": 9.719305089829224e-06, + "loss": 0.6027, + "step": 1377 + }, + { + "epoch": 0.13411192214111922, + "grad_norm": 1.4695816931820398, + "learning_rate": 9.718784183786048e-06, + "loss": 0.5337, + "step": 1378 + }, + { + "epoch": 0.13420924574209245, + "grad_norm": 1.428180254445363, + "learning_rate": 9.718262808834386e-06, + "loss": 0.3636, + "step": 1379 + }, + { + "epoch": 0.1343065693430657, + "grad_norm": 1.4446624118640763, + "learning_rate": 9.717740965026051e-06, + "loss": 0.4213, + "step": 1380 + }, + { + "epoch": 0.13440389294403893, + "grad_norm": 1.0145899854020284, + "learning_rate": 9.717218652412896e-06, + "loss": 0.292, + "step": 1381 + }, + { + "epoch": 0.13450121654501215, + "grad_norm": 1.4589445831305994, + "learning_rate": 9.716695871046824e-06, + "loss": 0.4787, + "step": 1382 + }, + { + "epoch": 0.1345985401459854, + "grad_norm": 1.5462880417778382, + "learning_rate": 9.716172620979783e-06, + "loss": 0.4716, + "step": 1383 + }, + { + "epoch": 0.13469586374695863, + "grad_norm": 1.5411526965931406, + "learning_rate": 9.71564890226377e-06, + "loss": 0.5311, + "step": 1384 + }, + { + "epoch": 0.1347931873479319, + "grad_norm": 1.1928924795974905, + "learning_rate": 9.71512471495083e-06, + "loss": 0.2724, + "step": 1385 + }, + { + "epoch": 0.1348905109489051, + "grad_norm": 1.3201585939530793, + "learning_rate": 9.714600059093045e-06, + "loss": 0.2987, + "step": 1386 + }, + { + "epoch": 0.13498783454987834, + "grad_norm": 1.5687746327202647, + "learning_rate": 9.714074934742556e-06, + "loss": 0.363, + "step": 1387 + }, + { + "epoch": 0.1350851581508516, + "grad_norm": 1.5338932500845779, + "learning_rate": 9.713549341951543e-06, + "loss": 0.5661, + "step": 1388 + }, + { + "epoch": 0.13518248175182482, + "grad_norm": 1.1601153536694444, + "learning_rate": 9.713023280772236e-06, + "loss": 0.3079, + "step": 1389 + }, + { + "epoch": 0.13527980535279804, + "grad_norm": 1.3983637614495477, + "learning_rate": 9.712496751256907e-06, + "loss": 0.4741, + "step": 1390 + }, + { + "epoch": 0.1353771289537713, + "grad_norm": 1.2378967843544995, + "learning_rate": 9.71196975345788e-06, + "loss": 0.3467, + "step": 1391 + }, + { + "epoch": 0.13547445255474452, + "grad_norm": 1.3128641622430697, + "learning_rate": 9.711442287427523e-06, + "loss": 0.413, + "step": 1392 + }, + { + "epoch": 0.13557177615571775, + "grad_norm": 1.6638151172989781, + "learning_rate": 9.71091435321825e-06, + "loss": 0.4844, + "step": 1393 + }, + { + "epoch": 0.135669099756691, + "grad_norm": 1.5023430961651105, + "learning_rate": 9.710385950882522e-06, + "loss": 0.3639, + "step": 1394 + }, + { + "epoch": 0.13576642335766423, + "grad_norm": 1.3286069107884302, + "learning_rate": 9.709857080472847e-06, + "loss": 0.4055, + "step": 1395 + }, + { + "epoch": 0.13586374695863748, + "grad_norm": 1.2934746343236392, + "learning_rate": 9.709327742041776e-06, + "loss": 0.2837, + "step": 1396 + }, + { + "epoch": 0.1359610705596107, + "grad_norm": 1.698077360010743, + "learning_rate": 9.708797935641915e-06, + "loss": 0.3687, + "step": 1397 + }, + { + "epoch": 0.13605839416058393, + "grad_norm": 1.412271661785088, + "learning_rate": 9.70826766132591e-06, + "loss": 0.3577, + "step": 1398 + }, + { + "epoch": 0.1361557177615572, + "grad_norm": 1.5421055950766074, + "learning_rate": 9.707736919146453e-06, + "loss": 0.5394, + "step": 1399 + }, + { + "epoch": 0.1362530413625304, + "grad_norm": 2.554386599490806, + "learning_rate": 9.707205709156285e-06, + "loss": 0.212, + "step": 1400 + }, + { + "epoch": 0.13635036496350364, + "grad_norm": 1.7436805109650844, + "learning_rate": 9.70667403140819e-06, + "loss": 0.6893, + "step": 1401 + }, + { + "epoch": 0.1364476885644769, + "grad_norm": 1.613527884115612, + "learning_rate": 9.706141885955006e-06, + "loss": 0.42, + "step": 1402 + }, + { + "epoch": 0.13654501216545012, + "grad_norm": 1.711619341430359, + "learning_rate": 9.70560927284961e-06, + "loss": 0.7025, + "step": 1403 + }, + { + "epoch": 0.13664233576642335, + "grad_norm": 1.5376532439489434, + "learning_rate": 9.705076192144927e-06, + "loss": 0.5201, + "step": 1404 + }, + { + "epoch": 0.1367396593673966, + "grad_norm": 1.492510855426001, + "learning_rate": 9.704542643893931e-06, + "loss": 0.4281, + "step": 1405 + }, + { + "epoch": 0.13683698296836982, + "grad_norm": 1.5678573317920237, + "learning_rate": 9.704008628149641e-06, + "loss": 0.506, + "step": 1406 + }, + { + "epoch": 0.13693430656934308, + "grad_norm": 1.3237691920747017, + "learning_rate": 9.703474144965123e-06, + "loss": 0.4114, + "step": 1407 + }, + { + "epoch": 0.1370316301703163, + "grad_norm": 1.4134135574988251, + "learning_rate": 9.702939194393489e-06, + "loss": 0.3806, + "step": 1408 + }, + { + "epoch": 0.13712895377128953, + "grad_norm": 1.5544258549266206, + "learning_rate": 9.702403776487895e-06, + "loss": 0.4863, + "step": 1409 + }, + { + "epoch": 0.13722627737226278, + "grad_norm": 1.3619063912879554, + "learning_rate": 9.701867891301548e-06, + "loss": 0.3692, + "step": 1410 + }, + { + "epoch": 0.137323600973236, + "grad_norm": 1.5146665393724075, + "learning_rate": 9.701331538887699e-06, + "loss": 0.3311, + "step": 1411 + }, + { + "epoch": 0.13742092457420924, + "grad_norm": 1.5674647990142176, + "learning_rate": 9.700794719299644e-06, + "loss": 0.5292, + "step": 1412 + }, + { + "epoch": 0.1375182481751825, + "grad_norm": 1.4711236643775818, + "learning_rate": 9.700257432590729e-06, + "loss": 0.466, + "step": 1413 + }, + { + "epoch": 0.13761557177615572, + "grad_norm": 1.4410106250389758, + "learning_rate": 9.699719678814345e-06, + "loss": 0.3276, + "step": 1414 + }, + { + "epoch": 0.13771289537712894, + "grad_norm": 1.652937978394441, + "learning_rate": 9.699181458023927e-06, + "loss": 0.5391, + "step": 1415 + }, + { + "epoch": 0.1378102189781022, + "grad_norm": 1.7285587973510355, + "learning_rate": 9.698642770272959e-06, + "loss": 0.5707, + "step": 1416 + }, + { + "epoch": 0.13790754257907542, + "grad_norm": 1.325058257423692, + "learning_rate": 9.698103615614972e-06, + "loss": 0.3429, + "step": 1417 + }, + { + "epoch": 0.13800486618004867, + "grad_norm": 1.5653351048996198, + "learning_rate": 9.69756399410354e-06, + "loss": 0.4132, + "step": 1418 + }, + { + "epoch": 0.1381021897810219, + "grad_norm": 1.603805088396393, + "learning_rate": 9.697023905792287e-06, + "loss": 0.4983, + "step": 1419 + }, + { + "epoch": 0.13819951338199513, + "grad_norm": 1.5052443063346659, + "learning_rate": 9.69648335073488e-06, + "loss": 0.2713, + "step": 1420 + }, + { + "epoch": 0.13829683698296838, + "grad_norm": 1.30196768692164, + "learning_rate": 9.695942328985037e-06, + "loss": 0.27, + "step": 1421 + }, + { + "epoch": 0.1383941605839416, + "grad_norm": 1.1542739478608208, + "learning_rate": 9.695400840596519e-06, + "loss": 0.3309, + "step": 1422 + }, + { + "epoch": 0.13849148418491483, + "grad_norm": 1.1029138054910885, + "learning_rate": 9.694858885623132e-06, + "loss": 0.3262, + "step": 1423 + }, + { + "epoch": 0.13858880778588809, + "grad_norm": 1.581389120261872, + "learning_rate": 9.694316464118732e-06, + "loss": 0.4663, + "step": 1424 + }, + { + "epoch": 0.1386861313868613, + "grad_norm": 1.2966198038166061, + "learning_rate": 9.69377357613722e-06, + "loss": 0.336, + "step": 1425 + }, + { + "epoch": 0.13878345498783454, + "grad_norm": 1.505634533514273, + "learning_rate": 9.693230221732544e-06, + "loss": 0.4269, + "step": 1426 + }, + { + "epoch": 0.1388807785888078, + "grad_norm": 1.274453115047599, + "learning_rate": 9.692686400958695e-06, + "loss": 0.3978, + "step": 1427 + }, + { + "epoch": 0.13897810218978102, + "grad_norm": 1.2126154933077449, + "learning_rate": 9.692142113869714e-06, + "loss": 0.2754, + "step": 1428 + }, + { + "epoch": 0.13907542579075427, + "grad_norm": 1.4884313472642259, + "learning_rate": 9.691597360519686e-06, + "loss": 0.4661, + "step": 1429 + }, + { + "epoch": 0.1391727493917275, + "grad_norm": 1.5680101511782372, + "learning_rate": 9.691052140962747e-06, + "loss": 0.4237, + "step": 1430 + }, + { + "epoch": 0.13927007299270072, + "grad_norm": 1.325640699841282, + "learning_rate": 9.690506455253073e-06, + "loss": 0.3988, + "step": 1431 + }, + { + "epoch": 0.13936739659367398, + "grad_norm": 1.3107002270910884, + "learning_rate": 9.689960303444887e-06, + "loss": 0.4268, + "step": 1432 + }, + { + "epoch": 0.1394647201946472, + "grad_norm": 1.9246823036308274, + "learning_rate": 9.689413685592465e-06, + "loss": 0.3733, + "step": 1433 + }, + { + "epoch": 0.13956204379562043, + "grad_norm": 1.3731854343094059, + "learning_rate": 9.688866601750122e-06, + "loss": 0.4215, + "step": 1434 + }, + { + "epoch": 0.13965936739659368, + "grad_norm": 1.368964734934982, + "learning_rate": 9.688319051972224e-06, + "loss": 0.4697, + "step": 1435 + }, + { + "epoch": 0.1397566909975669, + "grad_norm": 1.3451140821212522, + "learning_rate": 9.687771036313178e-06, + "loss": 0.3741, + "step": 1436 + }, + { + "epoch": 0.13985401459854013, + "grad_norm": 1.5372748667563303, + "learning_rate": 9.687222554827444e-06, + "loss": 0.4199, + "step": 1437 + }, + { + "epoch": 0.1399513381995134, + "grad_norm": 1.1780522614950486, + "learning_rate": 9.686673607569526e-06, + "loss": 0.3602, + "step": 1438 + }, + { + "epoch": 0.1400486618004866, + "grad_norm": 1.20778383169165, + "learning_rate": 9.686124194593967e-06, + "loss": 0.23, + "step": 1439 + }, + { + "epoch": 0.14014598540145987, + "grad_norm": 1.6760972087501165, + "learning_rate": 9.685574315955368e-06, + "loss": 0.5089, + "step": 1440 + }, + { + "epoch": 0.1402433090024331, + "grad_norm": 1.7963497555189056, + "learning_rate": 9.68502397170837e-06, + "loss": 0.3932, + "step": 1441 + }, + { + "epoch": 0.14034063260340632, + "grad_norm": 1.401968265514402, + "learning_rate": 9.68447316190766e-06, + "loss": 0.4272, + "step": 1442 + }, + { + "epoch": 0.14043795620437957, + "grad_norm": 1.1461895591250986, + "learning_rate": 9.683921886607973e-06, + "loss": 0.3003, + "step": 1443 + }, + { + "epoch": 0.1405352798053528, + "grad_norm": 1.8257595963636586, + "learning_rate": 9.683370145864089e-06, + "loss": 0.4454, + "step": 1444 + }, + { + "epoch": 0.14063260340632602, + "grad_norm": 1.3483599166387192, + "learning_rate": 9.682817939730833e-06, + "loss": 0.3708, + "step": 1445 + }, + { + "epoch": 0.14072992700729928, + "grad_norm": 1.4560700792487955, + "learning_rate": 9.682265268263083e-06, + "loss": 0.4321, + "step": 1446 + }, + { + "epoch": 0.1408272506082725, + "grad_norm": 1.4364952224667933, + "learning_rate": 9.681712131515753e-06, + "loss": 0.3812, + "step": 1447 + }, + { + "epoch": 0.14092457420924573, + "grad_norm": 1.6808986821455574, + "learning_rate": 9.681158529543812e-06, + "loss": 0.3939, + "step": 1448 + }, + { + "epoch": 0.14102189781021898, + "grad_norm": 1.5327313322922438, + "learning_rate": 9.68060446240227e-06, + "loss": 0.3617, + "step": 1449 + }, + { + "epoch": 0.1411192214111922, + "grad_norm": 1.9055650449775412, + "learning_rate": 9.680049930146186e-06, + "loss": 0.4984, + "step": 1450 + }, + { + "epoch": 0.14121654501216546, + "grad_norm": 1.8971706606162055, + "learning_rate": 9.679494932830664e-06, + "loss": 0.4196, + "step": 1451 + }, + { + "epoch": 0.1413138686131387, + "grad_norm": 1.7337796675846617, + "learning_rate": 9.678939470510856e-06, + "loss": 0.4282, + "step": 1452 + }, + { + "epoch": 0.1414111922141119, + "grad_norm": 1.6436762455975924, + "learning_rate": 9.678383543241954e-06, + "loss": 0.425, + "step": 1453 + }, + { + "epoch": 0.14150851581508517, + "grad_norm": 1.3304471527694197, + "learning_rate": 9.677827151079205e-06, + "loss": 0.346, + "step": 1454 + }, + { + "epoch": 0.1416058394160584, + "grad_norm": 1.3162532004293022, + "learning_rate": 9.677270294077895e-06, + "loss": 0.4492, + "step": 1455 + }, + { + "epoch": 0.14170316301703162, + "grad_norm": 1.2299075830057253, + "learning_rate": 9.676712972293363e-06, + "loss": 0.3525, + "step": 1456 + }, + { + "epoch": 0.14180048661800487, + "grad_norm": 1.7174455721253266, + "learning_rate": 9.676155185780989e-06, + "loss": 0.763, + "step": 1457 + }, + { + "epoch": 0.1418978102189781, + "grad_norm": 0.9624475539149472, + "learning_rate": 9.675596934596198e-06, + "loss": 0.2234, + "step": 1458 + }, + { + "epoch": 0.14199513381995132, + "grad_norm": 1.380722751360302, + "learning_rate": 9.675038218794469e-06, + "loss": 0.3539, + "step": 1459 + }, + { + "epoch": 0.14209245742092458, + "grad_norm": 1.3595616004290971, + "learning_rate": 9.674479038431314e-06, + "loss": 0.4356, + "step": 1460 + }, + { + "epoch": 0.1421897810218978, + "grad_norm": 1.2777542247997187, + "learning_rate": 9.673919393562308e-06, + "loss": 0.3233, + "step": 1461 + }, + { + "epoch": 0.14228710462287106, + "grad_norm": 1.23752096524445, + "learning_rate": 9.673359284243055e-06, + "loss": 0.405, + "step": 1462 + }, + { + "epoch": 0.14238442822384428, + "grad_norm": 1.4547729172095425, + "learning_rate": 9.672798710529222e-06, + "loss": 0.5356, + "step": 1463 + }, + { + "epoch": 0.1424817518248175, + "grad_norm": 1.5976011084855026, + "learning_rate": 9.672237672476506e-06, + "loss": 0.571, + "step": 1464 + }, + { + "epoch": 0.14257907542579076, + "grad_norm": 1.4454139467669962, + "learning_rate": 9.67167617014066e-06, + "loss": 0.4556, + "step": 1465 + }, + { + "epoch": 0.142676399026764, + "grad_norm": 1.5296734849172828, + "learning_rate": 9.671114203577485e-06, + "loss": 0.5791, + "step": 1466 + }, + { + "epoch": 0.14277372262773722, + "grad_norm": 1.0140913901893902, + "learning_rate": 9.670551772842818e-06, + "loss": 0.2732, + "step": 1467 + }, + { + "epoch": 0.14287104622871047, + "grad_norm": 1.5600773149062541, + "learning_rate": 9.669988877992551e-06, + "loss": 0.3902, + "step": 1468 + }, + { + "epoch": 0.1429683698296837, + "grad_norm": 1.4767158872669255, + "learning_rate": 9.66942551908262e-06, + "loss": 0.5531, + "step": 1469 + }, + { + "epoch": 0.14306569343065692, + "grad_norm": 1.1134570066684917, + "learning_rate": 9.668861696169003e-06, + "loss": 0.278, + "step": 1470 + }, + { + "epoch": 0.14316301703163017, + "grad_norm": 0.9776488708344422, + "learning_rate": 9.66829740930773e-06, + "loss": 0.231, + "step": 1471 + }, + { + "epoch": 0.1432603406326034, + "grad_norm": 1.4647496714581032, + "learning_rate": 9.667732658554875e-06, + "loss": 0.485, + "step": 1472 + }, + { + "epoch": 0.14335766423357665, + "grad_norm": 1.2234301570511203, + "learning_rate": 9.667167443966557e-06, + "loss": 0.3944, + "step": 1473 + }, + { + "epoch": 0.14345498783454988, + "grad_norm": 1.3655487702696618, + "learning_rate": 9.66660176559894e-06, + "loss": 0.3989, + "step": 1474 + }, + { + "epoch": 0.1435523114355231, + "grad_norm": 1.4690108372007447, + "learning_rate": 9.666035623508238e-06, + "loss": 0.4311, + "step": 1475 + }, + { + "epoch": 0.14364963503649636, + "grad_norm": 1.1910374305057687, + "learning_rate": 9.665469017750708e-06, + "loss": 0.3002, + "step": 1476 + }, + { + "epoch": 0.14374695863746959, + "grad_norm": 1.678176413091249, + "learning_rate": 9.664901948382654e-06, + "loss": 0.6143, + "step": 1477 + }, + { + "epoch": 0.1438442822384428, + "grad_norm": 1.817046546881487, + "learning_rate": 9.664334415460426e-06, + "loss": 0.7811, + "step": 1478 + }, + { + "epoch": 0.14394160583941606, + "grad_norm": 1.4955026439922687, + "learning_rate": 9.663766419040422e-06, + "loss": 0.411, + "step": 1479 + }, + { + "epoch": 0.1440389294403893, + "grad_norm": 1.4198677231066263, + "learning_rate": 9.66319795917908e-06, + "loss": 0.4245, + "step": 1480 + }, + { + "epoch": 0.14413625304136254, + "grad_norm": 1.5199876898969789, + "learning_rate": 9.662629035932892e-06, + "loss": 0.438, + "step": 1481 + }, + { + "epoch": 0.14423357664233577, + "grad_norm": 1.4859771113526168, + "learning_rate": 9.662059649358388e-06, + "loss": 0.3949, + "step": 1482 + }, + { + "epoch": 0.144330900243309, + "grad_norm": 1.5386966328542977, + "learning_rate": 9.661489799512155e-06, + "loss": 0.4679, + "step": 1483 + }, + { + "epoch": 0.14442822384428225, + "grad_norm": 1.2872766782537612, + "learning_rate": 9.660919486450813e-06, + "loss": 0.2624, + "step": 1484 + }, + { + "epoch": 0.14452554744525548, + "grad_norm": 1.3276179523832277, + "learning_rate": 9.660348710231037e-06, + "loss": 0.5476, + "step": 1485 + }, + { + "epoch": 0.1446228710462287, + "grad_norm": 0.9490583621811937, + "learning_rate": 9.659777470909547e-06, + "loss": 0.2354, + "step": 1486 + }, + { + "epoch": 0.14472019464720196, + "grad_norm": 1.3763558898436123, + "learning_rate": 9.659205768543104e-06, + "loss": 0.4327, + "step": 1487 + }, + { + "epoch": 0.14481751824817518, + "grad_norm": 1.178366926128956, + "learning_rate": 9.658633603188521e-06, + "loss": 0.3839, + "step": 1488 + }, + { + "epoch": 0.1449148418491484, + "grad_norm": 1.3255542333725456, + "learning_rate": 9.658060974902653e-06, + "loss": 0.3068, + "step": 1489 + }, + { + "epoch": 0.14501216545012166, + "grad_norm": 1.5998345706772108, + "learning_rate": 9.657487883742403e-06, + "loss": 0.5432, + "step": 1490 + }, + { + "epoch": 0.1451094890510949, + "grad_norm": 1.8804975658787435, + "learning_rate": 9.656914329764718e-06, + "loss": 0.5268, + "step": 1491 + }, + { + "epoch": 0.14520681265206814, + "grad_norm": 1.5841269093835124, + "learning_rate": 9.656340313026595e-06, + "loss": 0.6304, + "step": 1492 + }, + { + "epoch": 0.14530413625304137, + "grad_norm": 1.5832299483159056, + "learning_rate": 9.655765833585072e-06, + "loss": 0.4417, + "step": 1493 + }, + { + "epoch": 0.1454014598540146, + "grad_norm": 1.2541361090062475, + "learning_rate": 9.655190891497237e-06, + "loss": 0.2956, + "step": 1494 + }, + { + "epoch": 0.14549878345498785, + "grad_norm": 1.4549578520972333, + "learning_rate": 9.654615486820223e-06, + "loss": 0.5352, + "step": 1495 + }, + { + "epoch": 0.14559610705596107, + "grad_norm": 1.4797996277102474, + "learning_rate": 9.654039619611205e-06, + "loss": 0.4915, + "step": 1496 + }, + { + "epoch": 0.1456934306569343, + "grad_norm": 1.2281886698207842, + "learning_rate": 9.65346328992741e-06, + "loss": 0.1901, + "step": 1497 + }, + { + "epoch": 0.14579075425790755, + "grad_norm": 1.4478972545758728, + "learning_rate": 9.652886497826109e-06, + "loss": 0.4142, + "step": 1498 + }, + { + "epoch": 0.14588807785888078, + "grad_norm": 1.5883286963945868, + "learning_rate": 9.652309243364614e-06, + "loss": 0.3576, + "step": 1499 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 1.5369489845441549, + "learning_rate": 9.651731526600293e-06, + "loss": 0.5479, + "step": 1500 + }, + { + "epoch": 0.14608272506082726, + "grad_norm": 1.5655077404950533, + "learning_rate": 9.651153347590549e-06, + "loss": 0.3464, + "step": 1501 + }, + { + "epoch": 0.14618004866180048, + "grad_norm": 1.6426065013852054, + "learning_rate": 9.65057470639284e-06, + "loss": 0.5038, + "step": 1502 + }, + { + "epoch": 0.14627737226277374, + "grad_norm": 1.8088684532537898, + "learning_rate": 9.649995603064664e-06, + "loss": 0.5731, + "step": 1503 + }, + { + "epoch": 0.14637469586374696, + "grad_norm": 1.2493389766016731, + "learning_rate": 9.649416037663564e-06, + "loss": 0.3306, + "step": 1504 + }, + { + "epoch": 0.1464720194647202, + "grad_norm": 1.5964615139072293, + "learning_rate": 9.648836010247137e-06, + "loss": 0.4169, + "step": 1505 + }, + { + "epoch": 0.14656934306569344, + "grad_norm": 1.3925830899828215, + "learning_rate": 9.648255520873018e-06, + "loss": 0.3092, + "step": 1506 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 1.4256860988690832, + "learning_rate": 9.647674569598889e-06, + "loss": 0.3201, + "step": 1507 + }, + { + "epoch": 0.1467639902676399, + "grad_norm": 1.6614703553660697, + "learning_rate": 9.647093156482483e-06, + "loss": 0.6078, + "step": 1508 + }, + { + "epoch": 0.14686131386861315, + "grad_norm": 1.4357097092225446, + "learning_rate": 9.646511281581575e-06, + "loss": 0.4004, + "step": 1509 + }, + { + "epoch": 0.14695863746958637, + "grad_norm": 1.4562846462074024, + "learning_rate": 9.645928944953981e-06, + "loss": 0.4601, + "step": 1510 + }, + { + "epoch": 0.1470559610705596, + "grad_norm": 1.3277232740610976, + "learning_rate": 9.645346146657575e-06, + "loss": 0.4015, + "step": 1511 + }, + { + "epoch": 0.14715328467153285, + "grad_norm": 1.5964514332978577, + "learning_rate": 9.644762886750267e-06, + "loss": 0.4556, + "step": 1512 + }, + { + "epoch": 0.14725060827250608, + "grad_norm": 1.4663379423625913, + "learning_rate": 9.644179165290015e-06, + "loss": 0.4353, + "step": 1513 + }, + { + "epoch": 0.14734793187347933, + "grad_norm": 1.0949765548634744, + "learning_rate": 9.643594982334826e-06, + "loss": 0.2276, + "step": 1514 + }, + { + "epoch": 0.14744525547445256, + "grad_norm": 1.563845779693575, + "learning_rate": 9.643010337942749e-06, + "loss": 0.6694, + "step": 1515 + }, + { + "epoch": 0.14754257907542578, + "grad_norm": 1.024413538842663, + "learning_rate": 9.642425232171881e-06, + "loss": 0.3047, + "step": 1516 + }, + { + "epoch": 0.14763990267639904, + "grad_norm": 1.712866405633365, + "learning_rate": 9.641839665080363e-06, + "loss": 0.6729, + "step": 1517 + }, + { + "epoch": 0.14773722627737226, + "grad_norm": 1.4526258041869373, + "learning_rate": 9.641253636726386e-06, + "loss": 0.5037, + "step": 1518 + }, + { + "epoch": 0.1478345498783455, + "grad_norm": 1.7375410582816389, + "learning_rate": 9.640667147168182e-06, + "loss": 0.6717, + "step": 1519 + }, + { + "epoch": 0.14793187347931874, + "grad_norm": 1.736227335112512, + "learning_rate": 9.640080196464032e-06, + "loss": 0.6677, + "step": 1520 + }, + { + "epoch": 0.14802919708029197, + "grad_norm": 1.5194007013329096, + "learning_rate": 9.63949278467226e-06, + "loss": 0.4288, + "step": 1521 + }, + { + "epoch": 0.1481265206812652, + "grad_norm": 1.5063763039212688, + "learning_rate": 9.638904911851237e-06, + "loss": 0.4529, + "step": 1522 + }, + { + "epoch": 0.14822384428223845, + "grad_norm": 1.6414876214129155, + "learning_rate": 9.638316578059384e-06, + "loss": 0.5482, + "step": 1523 + }, + { + "epoch": 0.14832116788321167, + "grad_norm": 1.3122113228270877, + "learning_rate": 9.63772778335516e-06, + "loss": 0.3903, + "step": 1524 + }, + { + "epoch": 0.14841849148418493, + "grad_norm": 1.6417051120393822, + "learning_rate": 9.637138527797075e-06, + "loss": 0.654, + "step": 1525 + }, + { + "epoch": 0.14851581508515815, + "grad_norm": 1.2700043251684836, + "learning_rate": 9.636548811443685e-06, + "loss": 0.3338, + "step": 1526 + }, + { + "epoch": 0.14861313868613138, + "grad_norm": 1.4124836827913858, + "learning_rate": 9.63595863435359e-06, + "loss": 0.3551, + "step": 1527 + }, + { + "epoch": 0.14871046228710463, + "grad_norm": 1.3732601776051463, + "learning_rate": 9.635367996585436e-06, + "loss": 0.4212, + "step": 1528 + }, + { + "epoch": 0.14880778588807786, + "grad_norm": 1.4785898006079692, + "learning_rate": 9.634776898197916e-06, + "loss": 0.416, + "step": 1529 + }, + { + "epoch": 0.14890510948905109, + "grad_norm": 1.5889313350171215, + "learning_rate": 9.634185339249766e-06, + "loss": 0.5277, + "step": 1530 + }, + { + "epoch": 0.14900243309002434, + "grad_norm": 1.7475817866143981, + "learning_rate": 9.63359331979977e-06, + "loss": 0.5202, + "step": 1531 + }, + { + "epoch": 0.14909975669099756, + "grad_norm": 1.5329427899001755, + "learning_rate": 9.633000839906758e-06, + "loss": 0.4283, + "step": 1532 + }, + { + "epoch": 0.1491970802919708, + "grad_norm": 1.3789605408265815, + "learning_rate": 9.632407899629606e-06, + "loss": 0.41, + "step": 1533 + }, + { + "epoch": 0.14929440389294404, + "grad_norm": 1.725959361785896, + "learning_rate": 9.631814499027233e-06, + "loss": 0.6289, + "step": 1534 + }, + { + "epoch": 0.14939172749391727, + "grad_norm": 1.5432692609357797, + "learning_rate": 9.631220638158605e-06, + "loss": 0.5, + "step": 1535 + }, + { + "epoch": 0.14948905109489052, + "grad_norm": 1.6556108789068573, + "learning_rate": 9.630626317082737e-06, + "loss": 0.3819, + "step": 1536 + }, + { + "epoch": 0.14958637469586375, + "grad_norm": 1.4498977098887442, + "learning_rate": 9.630031535858686e-06, + "loss": 0.4317, + "step": 1537 + }, + { + "epoch": 0.14968369829683698, + "grad_norm": 1.1232180788321369, + "learning_rate": 9.629436294545555e-06, + "loss": 0.4004, + "step": 1538 + }, + { + "epoch": 0.14978102189781023, + "grad_norm": 0.9949950497949807, + "learning_rate": 9.628840593202494e-06, + "loss": 0.2008, + "step": 1539 + }, + { + "epoch": 0.14987834549878346, + "grad_norm": 1.431426278132333, + "learning_rate": 9.628244431888699e-06, + "loss": 0.3689, + "step": 1540 + }, + { + "epoch": 0.14997566909975668, + "grad_norm": 1.575987397356523, + "learning_rate": 9.627647810663407e-06, + "loss": 0.5513, + "step": 1541 + }, + { + "epoch": 0.15007299270072993, + "grad_norm": 1.5419042077794642, + "learning_rate": 9.627050729585911e-06, + "loss": 0.4614, + "step": 1542 + }, + { + "epoch": 0.15017031630170316, + "grad_norm": 1.6695059275012083, + "learning_rate": 9.626453188715539e-06, + "loss": 0.5111, + "step": 1543 + }, + { + "epoch": 0.1502676399026764, + "grad_norm": 1.5402255238707527, + "learning_rate": 9.625855188111668e-06, + "loss": 0.4209, + "step": 1544 + }, + { + "epoch": 0.15036496350364964, + "grad_norm": 1.4645797288107798, + "learning_rate": 9.625256727833726e-06, + "loss": 0.4852, + "step": 1545 + }, + { + "epoch": 0.15046228710462287, + "grad_norm": 2.0138530187225845, + "learning_rate": 9.62465780794118e-06, + "loss": 0.4272, + "step": 1546 + }, + { + "epoch": 0.15055961070559612, + "grad_norm": 1.7939871096323345, + "learning_rate": 9.624058428493543e-06, + "loss": 0.3864, + "step": 1547 + }, + { + "epoch": 0.15065693430656935, + "grad_norm": 1.5936734798237622, + "learning_rate": 9.62345858955038e-06, + "loss": 0.5951, + "step": 1548 + }, + { + "epoch": 0.15075425790754257, + "grad_norm": 1.381736638575513, + "learning_rate": 9.622858291171295e-06, + "loss": 0.5078, + "step": 1549 + }, + { + "epoch": 0.15085158150851583, + "grad_norm": 1.2680468052820635, + "learning_rate": 9.622257533415939e-06, + "loss": 0.3314, + "step": 1550 + }, + { + "epoch": 0.15094890510948905, + "grad_norm": 1.5886359348363517, + "learning_rate": 9.621656316344011e-06, + "loss": 0.5985, + "step": 1551 + }, + { + "epoch": 0.15104622871046228, + "grad_norm": 1.631001321245941, + "learning_rate": 9.621054640015255e-06, + "loss": 0.6297, + "step": 1552 + }, + { + "epoch": 0.15114355231143553, + "grad_norm": 1.7004985330783402, + "learning_rate": 9.62045250448946e-06, + "loss": 0.5153, + "step": 1553 + }, + { + "epoch": 0.15124087591240876, + "grad_norm": 1.414050528965644, + "learning_rate": 9.619849909826457e-06, + "loss": 0.2651, + "step": 1554 + }, + { + "epoch": 0.15133819951338198, + "grad_norm": 1.3361950007111751, + "learning_rate": 9.61924685608613e-06, + "loss": 0.4179, + "step": 1555 + }, + { + "epoch": 0.15143552311435524, + "grad_norm": 1.2305020766816175, + "learning_rate": 9.618643343328404e-06, + "loss": 0.3342, + "step": 1556 + }, + { + "epoch": 0.15153284671532846, + "grad_norm": 1.3364057110807985, + "learning_rate": 9.618039371613251e-06, + "loss": 0.357, + "step": 1557 + }, + { + "epoch": 0.15163017031630172, + "grad_norm": 0.9846564904659728, + "learning_rate": 9.617434941000685e-06, + "loss": 0.2278, + "step": 1558 + }, + { + "epoch": 0.15172749391727494, + "grad_norm": 1.4874184978820846, + "learning_rate": 9.616830051550772e-06, + "loss": 0.4467, + "step": 1559 + }, + { + "epoch": 0.15182481751824817, + "grad_norm": 1.79907754997464, + "learning_rate": 9.61622470332362e-06, + "loss": 0.5501, + "step": 1560 + }, + { + "epoch": 0.15192214111922142, + "grad_norm": 1.2290536645357835, + "learning_rate": 9.61561889637938e-06, + "loss": 0.3149, + "step": 1561 + }, + { + "epoch": 0.15201946472019465, + "grad_norm": 1.5048179340178087, + "learning_rate": 9.615012630778254e-06, + "loss": 0.5367, + "step": 1562 + }, + { + "epoch": 0.15211678832116787, + "grad_norm": 1.387431259858161, + "learning_rate": 9.614405906580486e-06, + "loss": 0.4953, + "step": 1563 + }, + { + "epoch": 0.15221411192214113, + "grad_norm": 1.4159610711473967, + "learning_rate": 9.613798723846368e-06, + "loss": 0.454, + "step": 1564 + }, + { + "epoch": 0.15231143552311435, + "grad_norm": 1.2005509919566202, + "learning_rate": 9.613191082636235e-06, + "loss": 0.3945, + "step": 1565 + }, + { + "epoch": 0.15240875912408758, + "grad_norm": 1.518451218591156, + "learning_rate": 9.612582983010468e-06, + "loss": 0.42, + "step": 1566 + }, + { + "epoch": 0.15250608272506083, + "grad_norm": 1.2817177267697137, + "learning_rate": 9.611974425029494e-06, + "loss": 0.4119, + "step": 1567 + }, + { + "epoch": 0.15260340632603406, + "grad_norm": 1.3182769071429, + "learning_rate": 9.611365408753787e-06, + "loss": 0.4301, + "step": 1568 + }, + { + "epoch": 0.1527007299270073, + "grad_norm": 1.2668371165350867, + "learning_rate": 9.610755934243864e-06, + "loss": 0.3415, + "step": 1569 + }, + { + "epoch": 0.15279805352798054, + "grad_norm": 1.334265705787435, + "learning_rate": 9.610146001560293e-06, + "loss": 0.325, + "step": 1570 + }, + { + "epoch": 0.15289537712895376, + "grad_norm": 1.405628575667756, + "learning_rate": 9.609535610763678e-06, + "loss": 0.4, + "step": 1571 + }, + { + "epoch": 0.15299270072992702, + "grad_norm": 1.5931859233666277, + "learning_rate": 9.608924761914677e-06, + "loss": 0.643, + "step": 1572 + }, + { + "epoch": 0.15309002433090024, + "grad_norm": 1.323715339329346, + "learning_rate": 9.608313455073989e-06, + "loss": 0.4832, + "step": 1573 + }, + { + "epoch": 0.15318734793187347, + "grad_norm": 1.1603088792271297, + "learning_rate": 9.60770169030236e-06, + "loss": 0.2684, + "step": 1574 + }, + { + "epoch": 0.15328467153284672, + "grad_norm": 1.4578030666688024, + "learning_rate": 9.607089467660581e-06, + "loss": 0.4418, + "step": 1575 + }, + { + "epoch": 0.15338199513381995, + "grad_norm": 1.2739086566679132, + "learning_rate": 9.606476787209493e-06, + "loss": 0.3847, + "step": 1576 + }, + { + "epoch": 0.15347931873479317, + "grad_norm": 1.4031538044918876, + "learning_rate": 9.605863649009973e-06, + "loss": 0.3672, + "step": 1577 + }, + { + "epoch": 0.15357664233576643, + "grad_norm": 1.473592849907526, + "learning_rate": 9.605250053122951e-06, + "loss": 0.3955, + "step": 1578 + }, + { + "epoch": 0.15367396593673965, + "grad_norm": 1.6950520258208177, + "learning_rate": 9.604635999609402e-06, + "loss": 0.6923, + "step": 1579 + }, + { + "epoch": 0.1537712895377129, + "grad_norm": 1.6074239515288835, + "learning_rate": 9.604021488530342e-06, + "loss": 0.4771, + "step": 1580 + }, + { + "epoch": 0.15386861313868613, + "grad_norm": 1.5289432511411145, + "learning_rate": 9.603406519946838e-06, + "loss": 0.5881, + "step": 1581 + }, + { + "epoch": 0.15396593673965936, + "grad_norm": 1.3225323677068181, + "learning_rate": 9.602791093919998e-06, + "loss": 0.3128, + "step": 1582 + }, + { + "epoch": 0.1540632603406326, + "grad_norm": 1.467417498061456, + "learning_rate": 9.60217521051098e-06, + "loss": 0.545, + "step": 1583 + }, + { + "epoch": 0.15416058394160584, + "grad_norm": 1.7568491012309082, + "learning_rate": 9.60155886978098e-06, + "loss": 0.7054, + "step": 1584 + }, + { + "epoch": 0.15425790754257906, + "grad_norm": 1.5606257069028109, + "learning_rate": 9.600942071791248e-06, + "loss": 0.4329, + "step": 1585 + }, + { + "epoch": 0.15435523114355232, + "grad_norm": 1.5727160833264413, + "learning_rate": 9.600324816603074e-06, + "loss": 0.6128, + "step": 1586 + }, + { + "epoch": 0.15445255474452554, + "grad_norm": 1.3864503412663605, + "learning_rate": 9.599707104277796e-06, + "loss": 0.573, + "step": 1587 + }, + { + "epoch": 0.15454987834549877, + "grad_norm": 1.4232761061254342, + "learning_rate": 9.599088934876794e-06, + "loss": 0.4136, + "step": 1588 + }, + { + "epoch": 0.15464720194647202, + "grad_norm": 1.3399427727677786, + "learning_rate": 9.598470308461499e-06, + "loss": 0.257, + "step": 1589 + }, + { + "epoch": 0.15474452554744525, + "grad_norm": 1.61635763649276, + "learning_rate": 9.597851225093382e-06, + "loss": 0.566, + "step": 1590 + }, + { + "epoch": 0.1548418491484185, + "grad_norm": 1.6304164262097627, + "learning_rate": 9.597231684833964e-06, + "loss": 0.3673, + "step": 1591 + }, + { + "epoch": 0.15493917274939173, + "grad_norm": 1.4592987498064005, + "learning_rate": 9.596611687744807e-06, + "loss": 0.5193, + "step": 1592 + }, + { + "epoch": 0.15503649635036496, + "grad_norm": 1.4397292060019447, + "learning_rate": 9.595991233887523e-06, + "loss": 0.3236, + "step": 1593 + }, + { + "epoch": 0.1551338199513382, + "grad_norm": 1.2246835494507005, + "learning_rate": 9.595370323323763e-06, + "loss": 0.2397, + "step": 1594 + }, + { + "epoch": 0.15523114355231143, + "grad_norm": 1.530797619071646, + "learning_rate": 9.59474895611523e-06, + "loss": 0.4537, + "step": 1595 + }, + { + "epoch": 0.15532846715328466, + "grad_norm": 0.9400393110536889, + "learning_rate": 9.594127132323669e-06, + "loss": 0.1899, + "step": 1596 + }, + { + "epoch": 0.15542579075425791, + "grad_norm": 1.167634539806263, + "learning_rate": 9.593504852010872e-06, + "loss": 0.353, + "step": 1597 + }, + { + "epoch": 0.15552311435523114, + "grad_norm": 1.6772160290018319, + "learning_rate": 9.592882115238675e-06, + "loss": 0.4194, + "step": 1598 + }, + { + "epoch": 0.15562043795620437, + "grad_norm": 1.4391641520861267, + "learning_rate": 9.592258922068958e-06, + "loss": 0.4767, + "step": 1599 + }, + { + "epoch": 0.15571776155717762, + "grad_norm": 1.544673007447179, + "learning_rate": 9.591635272563648e-06, + "loss": 0.3175, + "step": 1600 + }, + { + "epoch": 0.15581508515815085, + "grad_norm": 1.4189512773822923, + "learning_rate": 9.591011166784721e-06, + "loss": 0.4834, + "step": 1601 + }, + { + "epoch": 0.1559124087591241, + "grad_norm": 1.2414753149853184, + "learning_rate": 9.590386604794191e-06, + "loss": 0.3657, + "step": 1602 + }, + { + "epoch": 0.15600973236009733, + "grad_norm": 1.0236785255419305, + "learning_rate": 9.589761586654122e-06, + "loss": 0.2011, + "step": 1603 + }, + { + "epoch": 0.15610705596107055, + "grad_norm": 2.3461369884265357, + "learning_rate": 9.589136112426625e-06, + "loss": 0.4024, + "step": 1604 + }, + { + "epoch": 0.1562043795620438, + "grad_norm": 1.2849479900774115, + "learning_rate": 9.588510182173851e-06, + "loss": 0.3527, + "step": 1605 + }, + { + "epoch": 0.15630170316301703, + "grad_norm": 1.4153286655317308, + "learning_rate": 9.587883795958001e-06, + "loss": 0.4149, + "step": 1606 + }, + { + "epoch": 0.15639902676399026, + "grad_norm": 1.6599870662874754, + "learning_rate": 9.587256953841317e-06, + "loss": 0.6479, + "step": 1607 + }, + { + "epoch": 0.1564963503649635, + "grad_norm": 1.6670860080877101, + "learning_rate": 9.58662965588609e-06, + "loss": 0.5825, + "step": 1608 + }, + { + "epoch": 0.15659367396593674, + "grad_norm": 1.7776280437765584, + "learning_rate": 9.586001902154655e-06, + "loss": 0.5798, + "step": 1609 + }, + { + "epoch": 0.15669099756690996, + "grad_norm": 1.5456297515043347, + "learning_rate": 9.585373692709391e-06, + "loss": 0.4583, + "step": 1610 + }, + { + "epoch": 0.15678832116788322, + "grad_norm": 1.8806083091738082, + "learning_rate": 9.584745027612728e-06, + "loss": 0.4736, + "step": 1611 + }, + { + "epoch": 0.15688564476885644, + "grad_norm": 1.4790926453601037, + "learning_rate": 9.584115906927131e-06, + "loss": 0.4172, + "step": 1612 + }, + { + "epoch": 0.1569829683698297, + "grad_norm": 3.3021500316987633, + "learning_rate": 9.58348633071512e-06, + "loss": 0.472, + "step": 1613 + }, + { + "epoch": 0.15708029197080292, + "grad_norm": 1.860435632122749, + "learning_rate": 9.582856299039253e-06, + "loss": 0.4743, + "step": 1614 + }, + { + "epoch": 0.15717761557177615, + "grad_norm": 1.7557070181222967, + "learning_rate": 9.58222581196214e-06, + "loss": 0.2907, + "step": 1615 + }, + { + "epoch": 0.1572749391727494, + "grad_norm": 1.5588238003780286, + "learning_rate": 9.581594869546433e-06, + "loss": 0.3803, + "step": 1616 + }, + { + "epoch": 0.15737226277372263, + "grad_norm": 1.5265824940366777, + "learning_rate": 9.580963471854825e-06, + "loss": 0.3163, + "step": 1617 + }, + { + "epoch": 0.15746958637469585, + "grad_norm": 1.5425233608560427, + "learning_rate": 9.580331618950063e-06, + "loss": 0.3884, + "step": 1618 + }, + { + "epoch": 0.1575669099756691, + "grad_norm": 1.4123635386488018, + "learning_rate": 9.579699310894932e-06, + "loss": 0.382, + "step": 1619 + }, + { + "epoch": 0.15766423357664233, + "grad_norm": 1.578019469103596, + "learning_rate": 9.579066547752266e-06, + "loss": 0.4293, + "step": 1620 + }, + { + "epoch": 0.15776155717761559, + "grad_norm": 1.6566990657429592, + "learning_rate": 9.578433329584943e-06, + "loss": 0.2878, + "step": 1621 + }, + { + "epoch": 0.1578588807785888, + "grad_norm": 1.5290043771605026, + "learning_rate": 9.577799656455886e-06, + "loss": 0.4483, + "step": 1622 + }, + { + "epoch": 0.15795620437956204, + "grad_norm": 1.7268752423292135, + "learning_rate": 9.577165528428063e-06, + "loss": 0.4805, + "step": 1623 + }, + { + "epoch": 0.1580535279805353, + "grad_norm": 1.3495189675110832, + "learning_rate": 9.576530945564488e-06, + "loss": 0.3161, + "step": 1624 + }, + { + "epoch": 0.15815085158150852, + "grad_norm": 1.4763829359235794, + "learning_rate": 9.575895907928218e-06, + "loss": 0.4825, + "step": 1625 + }, + { + "epoch": 0.15824817518248174, + "grad_norm": 1.686991367686583, + "learning_rate": 9.575260415582362e-06, + "loss": 0.3016, + "step": 1626 + }, + { + "epoch": 0.158345498783455, + "grad_norm": 1.3390220591470878, + "learning_rate": 9.574624468590065e-06, + "loss": 0.4523, + "step": 1627 + }, + { + "epoch": 0.15844282238442822, + "grad_norm": 1.8698808087393168, + "learning_rate": 9.573988067014523e-06, + "loss": 0.5203, + "step": 1628 + }, + { + "epoch": 0.15854014598540145, + "grad_norm": 1.4032165021732874, + "learning_rate": 9.573351210918976e-06, + "loss": 0.3678, + "step": 1629 + }, + { + "epoch": 0.1586374695863747, + "grad_norm": 1.4017015011859046, + "learning_rate": 9.572713900366707e-06, + "loss": 0.2798, + "step": 1630 + }, + { + "epoch": 0.15873479318734793, + "grad_norm": 1.4441030854971395, + "learning_rate": 9.572076135421048e-06, + "loss": 0.3514, + "step": 1631 + }, + { + "epoch": 0.15883211678832118, + "grad_norm": 1.3629792761623065, + "learning_rate": 9.571437916145373e-06, + "loss": 0.4604, + "step": 1632 + }, + { + "epoch": 0.1589294403892944, + "grad_norm": 1.376972344446985, + "learning_rate": 9.570799242603101e-06, + "loss": 0.4603, + "step": 1633 + }, + { + "epoch": 0.15902676399026763, + "grad_norm": 1.5637421057827365, + "learning_rate": 9.5701601148577e-06, + "loss": 0.5575, + "step": 1634 + }, + { + "epoch": 0.1591240875912409, + "grad_norm": 1.4338457681188446, + "learning_rate": 9.56952053297268e-06, + "loss": 0.532, + "step": 1635 + }, + { + "epoch": 0.1592214111922141, + "grad_norm": 1.4858651962900338, + "learning_rate": 9.568880497011597e-06, + "loss": 0.4951, + "step": 1636 + }, + { + "epoch": 0.15931873479318734, + "grad_norm": 1.543423201839799, + "learning_rate": 9.568240007038048e-06, + "loss": 0.5278, + "step": 1637 + }, + { + "epoch": 0.1594160583941606, + "grad_norm": 1.408319688012345, + "learning_rate": 9.567599063115683e-06, + "loss": 0.4474, + "step": 1638 + }, + { + "epoch": 0.15951338199513382, + "grad_norm": 1.2680346779127702, + "learning_rate": 9.566957665308192e-06, + "loss": 0.3351, + "step": 1639 + }, + { + "epoch": 0.15961070559610704, + "grad_norm": 1.6277797838197976, + "learning_rate": 9.56631581367931e-06, + "loss": 0.3966, + "step": 1640 + }, + { + "epoch": 0.1597080291970803, + "grad_norm": 1.5248977314161354, + "learning_rate": 9.565673508292818e-06, + "loss": 0.5211, + "step": 1641 + }, + { + "epoch": 0.15980535279805352, + "grad_norm": 1.7164012466100764, + "learning_rate": 9.565030749212546e-06, + "loss": 0.5428, + "step": 1642 + }, + { + "epoch": 0.15990267639902678, + "grad_norm": 1.6687081549609284, + "learning_rate": 9.56438753650236e-06, + "loss": 0.2936, + "step": 1643 + }, + { + "epoch": 0.16, + "grad_norm": 1.5678110268585723, + "learning_rate": 9.56374387022618e-06, + "loss": 0.5166, + "step": 1644 + }, + { + "epoch": 0.16009732360097323, + "grad_norm": 1.6983019931785335, + "learning_rate": 9.563099750447966e-06, + "loss": 0.4822, + "step": 1645 + }, + { + "epoch": 0.16019464720194648, + "grad_norm": 1.4431824530543444, + "learning_rate": 9.562455177231726e-06, + "loss": 0.3212, + "step": 1646 + }, + { + "epoch": 0.1602919708029197, + "grad_norm": 3.712828208723791, + "learning_rate": 9.56181015064151e-06, + "loss": 0.4286, + "step": 1647 + }, + { + "epoch": 0.16038929440389293, + "grad_norm": 1.4388083433357408, + "learning_rate": 9.561164670741416e-06, + "loss": 0.3757, + "step": 1648 + }, + { + "epoch": 0.1604866180048662, + "grad_norm": 2.4878081586110117, + "learning_rate": 9.560518737595586e-06, + "loss": 0.3494, + "step": 1649 + }, + { + "epoch": 0.16058394160583941, + "grad_norm": 2.3091262745384706, + "learning_rate": 9.559872351268205e-06, + "loss": 0.4607, + "step": 1650 + }, + { + "epoch": 0.16068126520681264, + "grad_norm": 1.6632563827899045, + "learning_rate": 9.559225511823504e-06, + "loss": 0.5718, + "step": 1651 + }, + { + "epoch": 0.1607785888077859, + "grad_norm": 1.6138862417611177, + "learning_rate": 9.558578219325763e-06, + "loss": 0.325, + "step": 1652 + }, + { + "epoch": 0.16087591240875912, + "grad_norm": 1.1933317040764397, + "learning_rate": 9.557930473839303e-06, + "loss": 0.339, + "step": 1653 + }, + { + "epoch": 0.16097323600973237, + "grad_norm": 0.9728312200944081, + "learning_rate": 9.55728227542849e-06, + "loss": 0.2395, + "step": 1654 + }, + { + "epoch": 0.1610705596107056, + "grad_norm": 1.5521742092214053, + "learning_rate": 9.556633624157735e-06, + "loss": 0.4613, + "step": 1655 + }, + { + "epoch": 0.16116788321167883, + "grad_norm": 1.639740187603822, + "learning_rate": 9.555984520091497e-06, + "loss": 0.5146, + "step": 1656 + }, + { + "epoch": 0.16126520681265208, + "grad_norm": 1.5387772039120604, + "learning_rate": 9.555334963294277e-06, + "loss": 0.4879, + "step": 1657 + }, + { + "epoch": 0.1613625304136253, + "grad_norm": 1.2788374913210725, + "learning_rate": 9.554684953830622e-06, + "loss": 0.2115, + "step": 1658 + }, + { + "epoch": 0.16145985401459853, + "grad_norm": 1.2466060338770748, + "learning_rate": 9.554034491765123e-06, + "loss": 0.4057, + "step": 1659 + }, + { + "epoch": 0.16155717761557178, + "grad_norm": 1.3626765355526065, + "learning_rate": 9.553383577162418e-06, + "loss": 0.3922, + "step": 1660 + }, + { + "epoch": 0.161654501216545, + "grad_norm": 1.4993759287568524, + "learning_rate": 9.552732210087188e-06, + "loss": 0.5101, + "step": 1661 + }, + { + "epoch": 0.16175182481751824, + "grad_norm": 1.4132678080310175, + "learning_rate": 9.55208039060416e-06, + "loss": 0.4098, + "step": 1662 + }, + { + "epoch": 0.1618491484184915, + "grad_norm": 1.3072203759845393, + "learning_rate": 9.551428118778105e-06, + "loss": 0.4437, + "step": 1663 + }, + { + "epoch": 0.16194647201946472, + "grad_norm": 1.4197615961970556, + "learning_rate": 9.550775394673841e-06, + "loss": 0.4855, + "step": 1664 + }, + { + "epoch": 0.16204379562043797, + "grad_norm": 1.1443578178578404, + "learning_rate": 9.550122218356228e-06, + "loss": 0.2651, + "step": 1665 + }, + { + "epoch": 0.1621411192214112, + "grad_norm": 1.6274953169982382, + "learning_rate": 9.549468589890173e-06, + "loss": 0.5702, + "step": 1666 + }, + { + "epoch": 0.16223844282238442, + "grad_norm": 1.5542252970145625, + "learning_rate": 9.548814509340631e-06, + "loss": 0.3618, + "step": 1667 + }, + { + "epoch": 0.16233576642335767, + "grad_norm": 1.5872588267319008, + "learning_rate": 9.548159976772593e-06, + "loss": 0.5261, + "step": 1668 + }, + { + "epoch": 0.1624330900243309, + "grad_norm": 1.1735078752446053, + "learning_rate": 9.547504992251102e-06, + "loss": 0.2709, + "step": 1669 + }, + { + "epoch": 0.16253041362530413, + "grad_norm": 1.8057871189139236, + "learning_rate": 9.546849555841247e-06, + "loss": 0.3383, + "step": 1670 + }, + { + "epoch": 0.16262773722627738, + "grad_norm": 1.4181568031561294, + "learning_rate": 9.546193667608155e-06, + "loss": 0.4654, + "step": 1671 + }, + { + "epoch": 0.1627250608272506, + "grad_norm": 1.3372190697374011, + "learning_rate": 9.545537327617004e-06, + "loss": 0.4098, + "step": 1672 + }, + { + "epoch": 0.16282238442822383, + "grad_norm": 1.4054977948345526, + "learning_rate": 9.544880535933015e-06, + "loss": 0.488, + "step": 1673 + }, + { + "epoch": 0.16291970802919709, + "grad_norm": 1.8103202340533562, + "learning_rate": 9.544223292621456e-06, + "loss": 0.2989, + "step": 1674 + }, + { + "epoch": 0.1630170316301703, + "grad_norm": 1.4424657055300307, + "learning_rate": 9.543565597747633e-06, + "loss": 0.3545, + "step": 1675 + }, + { + "epoch": 0.16311435523114357, + "grad_norm": 1.712897793310079, + "learning_rate": 9.542907451376904e-06, + "loss": 0.4372, + "step": 1676 + }, + { + "epoch": 0.1632116788321168, + "grad_norm": 1.5856342495538354, + "learning_rate": 9.542248853574669e-06, + "loss": 0.3552, + "step": 1677 + }, + { + "epoch": 0.16330900243309002, + "grad_norm": 1.6070757988154845, + "learning_rate": 9.541589804406373e-06, + "loss": 0.6297, + "step": 1678 + }, + { + "epoch": 0.16340632603406327, + "grad_norm": 1.4030835423791206, + "learning_rate": 9.540930303937508e-06, + "loss": 0.5304, + "step": 1679 + }, + { + "epoch": 0.1635036496350365, + "grad_norm": 1.1629420270697914, + "learning_rate": 9.540270352233607e-06, + "loss": 0.3196, + "step": 1680 + }, + { + "epoch": 0.16360097323600972, + "grad_norm": 1.6438421767465334, + "learning_rate": 9.53960994936025e-06, + "loss": 0.5718, + "step": 1681 + }, + { + "epoch": 0.16369829683698298, + "grad_norm": 1.4972655485667212, + "learning_rate": 9.538949095383064e-06, + "loss": 0.5411, + "step": 1682 + }, + { + "epoch": 0.1637956204379562, + "grad_norm": 1.6855463092047138, + "learning_rate": 9.538287790367715e-06, + "loss": 0.4072, + "step": 1683 + }, + { + "epoch": 0.16389294403892943, + "grad_norm": 1.3024464622228382, + "learning_rate": 9.537626034379918e-06, + "loss": 0.3779, + "step": 1684 + }, + { + "epoch": 0.16399026763990268, + "grad_norm": 1.295189693137423, + "learning_rate": 9.536963827485435e-06, + "loss": 0.3687, + "step": 1685 + }, + { + "epoch": 0.1640875912408759, + "grad_norm": 1.4535138830119652, + "learning_rate": 9.536301169750068e-06, + "loss": 0.4548, + "step": 1686 + }, + { + "epoch": 0.16418491484184916, + "grad_norm": 1.199213729997, + "learning_rate": 9.535638061239663e-06, + "loss": 0.2053, + "step": 1687 + }, + { + "epoch": 0.1642822384428224, + "grad_norm": 1.5567691993981325, + "learning_rate": 9.534974502020117e-06, + "loss": 0.4098, + "step": 1688 + }, + { + "epoch": 0.1643795620437956, + "grad_norm": 1.5701473016338705, + "learning_rate": 9.534310492157368e-06, + "loss": 0.4663, + "step": 1689 + }, + { + "epoch": 0.16447688564476887, + "grad_norm": 1.4652608455665965, + "learning_rate": 9.533646031717398e-06, + "loss": 0.423, + "step": 1690 + }, + { + "epoch": 0.1645742092457421, + "grad_norm": 1.556818972222242, + "learning_rate": 9.532981120766235e-06, + "loss": 0.5823, + "step": 1691 + }, + { + "epoch": 0.16467153284671532, + "grad_norm": 1.3176167070500389, + "learning_rate": 9.532315759369953e-06, + "loss": 0.3369, + "step": 1692 + }, + { + "epoch": 0.16476885644768857, + "grad_norm": 1.710131590392248, + "learning_rate": 9.531649947594668e-06, + "loss": 0.6235, + "step": 1693 + }, + { + "epoch": 0.1648661800486618, + "grad_norm": 1.316452070848038, + "learning_rate": 9.53098368550654e-06, + "loss": 0.2773, + "step": 1694 + }, + { + "epoch": 0.16496350364963502, + "grad_norm": 1.3144552108952152, + "learning_rate": 9.53031697317178e-06, + "loss": 0.4008, + "step": 1695 + }, + { + "epoch": 0.16506082725060828, + "grad_norm": 1.6242845867808264, + "learning_rate": 9.529649810656638e-06, + "loss": 0.4994, + "step": 1696 + }, + { + "epoch": 0.1651581508515815, + "grad_norm": 1.285181340955318, + "learning_rate": 9.52898219802741e-06, + "loss": 0.3565, + "step": 1697 + }, + { + "epoch": 0.16525547445255476, + "grad_norm": 1.5859120183692204, + "learning_rate": 9.528314135350439e-06, + "loss": 0.6057, + "step": 1698 + }, + { + "epoch": 0.16535279805352798, + "grad_norm": 1.2413369391689792, + "learning_rate": 9.527645622692105e-06, + "loss": 0.2912, + "step": 1699 + }, + { + "epoch": 0.1654501216545012, + "grad_norm": 1.5626898078072964, + "learning_rate": 9.526976660118846e-06, + "loss": 0.4912, + "step": 1700 + }, + { + "epoch": 0.16554744525547446, + "grad_norm": 1.355302168314411, + "learning_rate": 9.526307247697133e-06, + "loss": 0.4066, + "step": 1701 + }, + { + "epoch": 0.1656447688564477, + "grad_norm": 1.6754743388370108, + "learning_rate": 9.525637385493485e-06, + "loss": 0.4402, + "step": 1702 + }, + { + "epoch": 0.16574209245742091, + "grad_norm": 1.4378330010865907, + "learning_rate": 9.524967073574468e-06, + "loss": 0.3896, + "step": 1703 + }, + { + "epoch": 0.16583941605839417, + "grad_norm": 1.5562357645264613, + "learning_rate": 9.524296312006696e-06, + "loss": 0.7178, + "step": 1704 + }, + { + "epoch": 0.1659367396593674, + "grad_norm": 1.4997676033555023, + "learning_rate": 9.523625100856814e-06, + "loss": 0.5203, + "step": 1705 + }, + { + "epoch": 0.16603406326034062, + "grad_norm": 1.39039181243628, + "learning_rate": 9.522953440191528e-06, + "loss": 0.4804, + "step": 1706 + }, + { + "epoch": 0.16613138686131387, + "grad_norm": 1.2594698773182105, + "learning_rate": 9.522281330077579e-06, + "loss": 0.31, + "step": 1707 + }, + { + "epoch": 0.1662287104622871, + "grad_norm": 1.5394103920539104, + "learning_rate": 9.521608770581751e-06, + "loss": 0.4579, + "step": 1708 + }, + { + "epoch": 0.16632603406326035, + "grad_norm": 1.4703967014570463, + "learning_rate": 9.520935761770885e-06, + "loss": 0.4732, + "step": 1709 + }, + { + "epoch": 0.16642335766423358, + "grad_norm": 1.0444153315520046, + "learning_rate": 9.520262303711851e-06, + "loss": 0.2468, + "step": 1710 + }, + { + "epoch": 0.1665206812652068, + "grad_norm": 1.4440019594110525, + "learning_rate": 9.519588396471572e-06, + "loss": 0.4979, + "step": 1711 + }, + { + "epoch": 0.16661800486618006, + "grad_norm": 1.6467368949298022, + "learning_rate": 9.518914040117018e-06, + "loss": 0.603, + "step": 1712 + }, + { + "epoch": 0.16671532846715328, + "grad_norm": 1.656027868957794, + "learning_rate": 9.518239234715198e-06, + "loss": 0.3534, + "step": 1713 + }, + { + "epoch": 0.1668126520681265, + "grad_norm": 1.409360793352949, + "learning_rate": 9.517563980333169e-06, + "loss": 0.4442, + "step": 1714 + }, + { + "epoch": 0.16690997566909976, + "grad_norm": 1.4429795690770129, + "learning_rate": 9.51688827703803e-06, + "loss": 0.4347, + "step": 1715 + }, + { + "epoch": 0.167007299270073, + "grad_norm": 1.2256612199861667, + "learning_rate": 9.516212124896926e-06, + "loss": 0.3582, + "step": 1716 + }, + { + "epoch": 0.16710462287104622, + "grad_norm": 1.340106815948813, + "learning_rate": 9.515535523977047e-06, + "loss": 0.4494, + "step": 1717 + }, + { + "epoch": 0.16720194647201947, + "grad_norm": 1.8033632646616307, + "learning_rate": 9.514858474345628e-06, + "loss": 0.7254, + "step": 1718 + }, + { + "epoch": 0.1672992700729927, + "grad_norm": 1.461471704742246, + "learning_rate": 9.514180976069948e-06, + "loss": 0.4431, + "step": 1719 + }, + { + "epoch": 0.16739659367396595, + "grad_norm": 1.8149337871023152, + "learning_rate": 9.513503029217329e-06, + "loss": 0.6808, + "step": 1720 + }, + { + "epoch": 0.16749391727493917, + "grad_norm": 1.4317488687976054, + "learning_rate": 9.51282463385514e-06, + "loss": 0.3969, + "step": 1721 + }, + { + "epoch": 0.1675912408759124, + "grad_norm": 1.406660867644435, + "learning_rate": 9.512145790050793e-06, + "loss": 0.4466, + "step": 1722 + }, + { + "epoch": 0.16768856447688565, + "grad_norm": 1.5087949092220858, + "learning_rate": 9.511466497871747e-06, + "loss": 0.3588, + "step": 1723 + }, + { + "epoch": 0.16778588807785888, + "grad_norm": 1.3780878680496882, + "learning_rate": 9.5107867573855e-06, + "loss": 0.4136, + "step": 1724 + }, + { + "epoch": 0.1678832116788321, + "grad_norm": 1.1785521443758606, + "learning_rate": 9.510106568659601e-06, + "loss": 0.3319, + "step": 1725 + }, + { + "epoch": 0.16798053527980536, + "grad_norm": 1.4150065437408217, + "learning_rate": 9.50942593176164e-06, + "loss": 0.3619, + "step": 1726 + }, + { + "epoch": 0.16807785888077859, + "grad_norm": 1.5810685607791577, + "learning_rate": 9.508744846759254e-06, + "loss": 0.5204, + "step": 1727 + }, + { + "epoch": 0.1681751824817518, + "grad_norm": 1.5507123725258296, + "learning_rate": 9.50806331372012e-06, + "loss": 0.3017, + "step": 1728 + }, + { + "epoch": 0.16827250608272507, + "grad_norm": 1.7448176899198176, + "learning_rate": 9.507381332711963e-06, + "loss": 0.6488, + "step": 1729 + }, + { + "epoch": 0.1683698296836983, + "grad_norm": 2.0203041353812243, + "learning_rate": 9.506698903802553e-06, + "loss": 0.2868, + "step": 1730 + }, + { + "epoch": 0.16846715328467154, + "grad_norm": 1.425557408986151, + "learning_rate": 9.506016027059703e-06, + "loss": 0.4181, + "step": 1731 + }, + { + "epoch": 0.16856447688564477, + "grad_norm": 1.658389742609111, + "learning_rate": 9.505332702551272e-06, + "loss": 0.4834, + "step": 1732 + }, + { + "epoch": 0.168661800486618, + "grad_norm": 1.6313220070332846, + "learning_rate": 9.50464893034516e-06, + "loss": 0.6351, + "step": 1733 + }, + { + "epoch": 0.16875912408759125, + "grad_norm": 1.4860828412814417, + "learning_rate": 9.503964710509314e-06, + "loss": 0.384, + "step": 1734 + }, + { + "epoch": 0.16885644768856448, + "grad_norm": 1.5665989326823084, + "learning_rate": 9.503280043111729e-06, + "loss": 0.5031, + "step": 1735 + }, + { + "epoch": 0.1689537712895377, + "grad_norm": 1.2627591310970376, + "learning_rate": 9.502594928220437e-06, + "loss": 0.3557, + "step": 1736 + }, + { + "epoch": 0.16905109489051096, + "grad_norm": 1.6101827723851228, + "learning_rate": 9.50190936590352e-06, + "loss": 0.3886, + "step": 1737 + }, + { + "epoch": 0.16914841849148418, + "grad_norm": 1.190927027644026, + "learning_rate": 9.5012233562291e-06, + "loss": 0.3, + "step": 1738 + }, + { + "epoch": 0.1692457420924574, + "grad_norm": 1.6452233677093766, + "learning_rate": 9.50053689926535e-06, + "loss": 0.5808, + "step": 1739 + }, + { + "epoch": 0.16934306569343066, + "grad_norm": 1.607284224817037, + "learning_rate": 9.499849995080482e-06, + "loss": 0.5726, + "step": 1740 + }, + { + "epoch": 0.1694403892944039, + "grad_norm": 1.360873175063302, + "learning_rate": 9.499162643742754e-06, + "loss": 0.3294, + "step": 1741 + }, + { + "epoch": 0.16953771289537714, + "grad_norm": 1.6205396325650636, + "learning_rate": 9.49847484532047e-06, + "loss": 0.5496, + "step": 1742 + }, + { + "epoch": 0.16963503649635037, + "grad_norm": 1.6677491090337848, + "learning_rate": 9.497786599881973e-06, + "loss": 0.5745, + "step": 1743 + }, + { + "epoch": 0.1697323600973236, + "grad_norm": 1.4765151889225172, + "learning_rate": 9.497097907495658e-06, + "loss": 0.3552, + "step": 1744 + }, + { + "epoch": 0.16982968369829685, + "grad_norm": 1.4991516257283077, + "learning_rate": 9.496408768229962e-06, + "loss": 0.6004, + "step": 1745 + }, + { + "epoch": 0.16992700729927007, + "grad_norm": 1.394241003611109, + "learning_rate": 9.49571918215336e-06, + "loss": 0.4166, + "step": 1746 + }, + { + "epoch": 0.1700243309002433, + "grad_norm": 1.2418310265706307, + "learning_rate": 9.495029149334381e-06, + "loss": 0.3754, + "step": 1747 + }, + { + "epoch": 0.17012165450121655, + "grad_norm": 1.7344174079178016, + "learning_rate": 9.494338669841592e-06, + "loss": 0.6136, + "step": 1748 + }, + { + "epoch": 0.17021897810218978, + "grad_norm": 1.689754745813109, + "learning_rate": 9.493647743743605e-06, + "loss": 0.3066, + "step": 1749 + }, + { + "epoch": 0.170316301703163, + "grad_norm": 1.5986274434851808, + "learning_rate": 9.492956371109083e-06, + "loss": 0.6476, + "step": 1750 + }, + { + "epoch": 0.17041362530413626, + "grad_norm": 1.3892856963539753, + "learning_rate": 9.492264552006725e-06, + "loss": 0.2438, + "step": 1751 + }, + { + "epoch": 0.17051094890510948, + "grad_norm": 1.3744062095245357, + "learning_rate": 9.491572286505275e-06, + "loss": 0.4154, + "step": 1752 + }, + { + "epoch": 0.17060827250608274, + "grad_norm": 1.3041989445373636, + "learning_rate": 9.490879574673528e-06, + "loss": 0.3603, + "step": 1753 + }, + { + "epoch": 0.17070559610705596, + "grad_norm": 1.2198251236981021, + "learning_rate": 9.490186416580317e-06, + "loss": 0.3382, + "step": 1754 + }, + { + "epoch": 0.1708029197080292, + "grad_norm": 1.0699077871285796, + "learning_rate": 9.489492812294521e-06, + "loss": 0.2805, + "step": 1755 + }, + { + "epoch": 0.17090024330900244, + "grad_norm": 1.8289792925797566, + "learning_rate": 9.488798761885064e-06, + "loss": 0.2551, + "step": 1756 + }, + { + "epoch": 0.17099756690997567, + "grad_norm": 1.5156970449411904, + "learning_rate": 9.488104265420917e-06, + "loss": 0.5468, + "step": 1757 + }, + { + "epoch": 0.1710948905109489, + "grad_norm": 1.3669899498040559, + "learning_rate": 9.487409322971089e-06, + "loss": 0.4705, + "step": 1758 + }, + { + "epoch": 0.17119221411192215, + "grad_norm": 1.4212977316967985, + "learning_rate": 9.486713934604638e-06, + "loss": 0.5259, + "step": 1759 + }, + { + "epoch": 0.17128953771289537, + "grad_norm": 1.3256503218660822, + "learning_rate": 9.486018100390668e-06, + "loss": 0.3825, + "step": 1760 + }, + { + "epoch": 0.17138686131386863, + "grad_norm": 1.3448672418414023, + "learning_rate": 9.485321820398321e-06, + "loss": 0.4984, + "step": 1761 + }, + { + "epoch": 0.17148418491484185, + "grad_norm": 1.3293122762885854, + "learning_rate": 9.484625094696788e-06, + "loss": 0.4419, + "step": 1762 + }, + { + "epoch": 0.17158150851581508, + "grad_norm": 1.5749728003681251, + "learning_rate": 9.483927923355303e-06, + "loss": 0.4512, + "step": 1763 + }, + { + "epoch": 0.17167883211678833, + "grad_norm": 2.1875449039755, + "learning_rate": 9.483230306443144e-06, + "loss": 0.4606, + "step": 1764 + }, + { + "epoch": 0.17177615571776156, + "grad_norm": 1.4675466599593059, + "learning_rate": 9.482532244029632e-06, + "loss": 0.5098, + "step": 1765 + }, + { + "epoch": 0.17187347931873478, + "grad_norm": 1.4395657499189969, + "learning_rate": 9.481833736184137e-06, + "loss": 0.4196, + "step": 1766 + }, + { + "epoch": 0.17197080291970804, + "grad_norm": 1.6202346179751734, + "learning_rate": 9.48113478297607e-06, + "loss": 0.4083, + "step": 1767 + }, + { + "epoch": 0.17206812652068126, + "grad_norm": 1.943359375, + "learning_rate": 9.480435384474884e-06, + "loss": 0.3829, + "step": 1768 + }, + { + "epoch": 0.1721654501216545, + "grad_norm": 1.3957800309361543, + "learning_rate": 9.47973554075008e-06, + "loss": 0.4776, + "step": 1769 + }, + { + "epoch": 0.17226277372262774, + "grad_norm": 1.3277740014703983, + "learning_rate": 9.479035251871202e-06, + "loss": 0.2944, + "step": 1770 + }, + { + "epoch": 0.17236009732360097, + "grad_norm": 1.5955109684829234, + "learning_rate": 9.478334517907838e-06, + "loss": 0.4713, + "step": 1771 + }, + { + "epoch": 0.17245742092457422, + "grad_norm": 1.209763677864614, + "learning_rate": 9.477633338929621e-06, + "loss": 0.2925, + "step": 1772 + }, + { + "epoch": 0.17255474452554745, + "grad_norm": 1.6082316661319236, + "learning_rate": 9.476931715006225e-06, + "loss": 0.6037, + "step": 1773 + }, + { + "epoch": 0.17265206812652067, + "grad_norm": 1.5310145605828824, + "learning_rate": 9.476229646207375e-06, + "loss": 0.427, + "step": 1774 + }, + { + "epoch": 0.17274939172749393, + "grad_norm": 1.5161322305327478, + "learning_rate": 9.475527132602833e-06, + "loss": 0.5765, + "step": 1775 + }, + { + "epoch": 0.17284671532846715, + "grad_norm": 1.5515912532543141, + "learning_rate": 9.47482417426241e-06, + "loss": 0.4693, + "step": 1776 + }, + { + "epoch": 0.17294403892944038, + "grad_norm": 1.273583152257964, + "learning_rate": 9.474120771255956e-06, + "loss": 0.401, + "step": 1777 + }, + { + "epoch": 0.17304136253041363, + "grad_norm": 1.3058387108850102, + "learning_rate": 9.473416923653373e-06, + "loss": 0.4651, + "step": 1778 + }, + { + "epoch": 0.17313868613138686, + "grad_norm": 1.4876685295483647, + "learning_rate": 9.472712631524599e-06, + "loss": 0.5423, + "step": 1779 + }, + { + "epoch": 0.17323600973236009, + "grad_norm": 1.4134646674679987, + "learning_rate": 9.472007894939624e-06, + "loss": 0.448, + "step": 1780 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 1.3805564537318322, + "learning_rate": 9.471302713968473e-06, + "loss": 0.2429, + "step": 1781 + }, + { + "epoch": 0.17343065693430657, + "grad_norm": 1.4256414475552066, + "learning_rate": 9.470597088681225e-06, + "loss": 0.4821, + "step": 1782 + }, + { + "epoch": 0.17352798053527982, + "grad_norm": 1.2857804565204727, + "learning_rate": 9.469891019147996e-06, + "loss": 0.3177, + "step": 1783 + }, + { + "epoch": 0.17362530413625304, + "grad_norm": 1.7384422656290006, + "learning_rate": 9.46918450543895e-06, + "loss": 0.6144, + "step": 1784 + }, + { + "epoch": 0.17372262773722627, + "grad_norm": 1.3733947226466707, + "learning_rate": 9.46847754762429e-06, + "loss": 0.3777, + "step": 1785 + }, + { + "epoch": 0.17381995133819952, + "grad_norm": 1.090627736959876, + "learning_rate": 9.467770145774271e-06, + "loss": 0.307, + "step": 1786 + }, + { + "epoch": 0.17391727493917275, + "grad_norm": 1.5306242617959314, + "learning_rate": 9.467062299959187e-06, + "loss": 0.4652, + "step": 1787 + }, + { + "epoch": 0.17401459854014598, + "grad_norm": 1.6335244702718128, + "learning_rate": 9.466354010249375e-06, + "loss": 0.5127, + "step": 1788 + }, + { + "epoch": 0.17411192214111923, + "grad_norm": 1.3582351114688258, + "learning_rate": 9.465645276715221e-06, + "loss": 0.4213, + "step": 1789 + }, + { + "epoch": 0.17420924574209246, + "grad_norm": 1.4962342995542501, + "learning_rate": 9.464936099427151e-06, + "loss": 0.4327, + "step": 1790 + }, + { + "epoch": 0.17430656934306568, + "grad_norm": 1.513533102257641, + "learning_rate": 9.464226478455636e-06, + "loss": 0.4527, + "step": 1791 + }, + { + "epoch": 0.17440389294403894, + "grad_norm": 1.4174664240767785, + "learning_rate": 9.463516413871193e-06, + "loss": 0.4986, + "step": 1792 + }, + { + "epoch": 0.17450121654501216, + "grad_norm": 1.283758777636687, + "learning_rate": 9.46280590574438e-06, + "loss": 0.4648, + "step": 1793 + }, + { + "epoch": 0.17459854014598541, + "grad_norm": 1.3960565511895506, + "learning_rate": 9.4620949541458e-06, + "loss": 0.3587, + "step": 1794 + }, + { + "epoch": 0.17469586374695864, + "grad_norm": 1.6199676647020385, + "learning_rate": 9.461383559146104e-06, + "loss": 0.5292, + "step": 1795 + }, + { + "epoch": 0.17479318734793187, + "grad_norm": 1.5028051531717803, + "learning_rate": 9.46067172081598e-06, + "loss": 0.4903, + "step": 1796 + }, + { + "epoch": 0.17489051094890512, + "grad_norm": 1.683063467822515, + "learning_rate": 9.459959439226165e-06, + "loss": 0.3106, + "step": 1797 + }, + { + "epoch": 0.17498783454987835, + "grad_norm": 1.3296224342860092, + "learning_rate": 9.459246714447439e-06, + "loss": 0.409, + "step": 1798 + }, + { + "epoch": 0.17508515815085157, + "grad_norm": 1.5847151231323486, + "learning_rate": 9.458533546550628e-06, + "loss": 0.4169, + "step": 1799 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 1.495253204796384, + "learning_rate": 9.457819935606596e-06, + "loss": 0.3753, + "step": 1800 + }, + { + "epoch": 0.17527980535279805, + "grad_norm": 1.4723876384358174, + "learning_rate": 9.45710588168626e-06, + "loss": 0.2437, + "step": 1801 + }, + { + "epoch": 0.17537712895377128, + "grad_norm": 1.5610295815557715, + "learning_rate": 9.45639138486057e-06, + "loss": 0.5651, + "step": 1802 + }, + { + "epoch": 0.17547445255474453, + "grad_norm": 1.5876154549734276, + "learning_rate": 9.45567644520053e-06, + "loss": 0.4835, + "step": 1803 + }, + { + "epoch": 0.17557177615571776, + "grad_norm": 1.5619759252942187, + "learning_rate": 9.454961062777181e-06, + "loss": 0.3036, + "step": 1804 + }, + { + "epoch": 0.175669099756691, + "grad_norm": 1.0144029160546408, + "learning_rate": 9.454245237661617e-06, + "loss": 0.219, + "step": 1805 + }, + { + "epoch": 0.17576642335766424, + "grad_norm": 1.717922774563162, + "learning_rate": 9.453528969924963e-06, + "loss": 0.5388, + "step": 1806 + }, + { + "epoch": 0.17586374695863746, + "grad_norm": 1.288743598100688, + "learning_rate": 9.452812259638399e-06, + "loss": 0.4171, + "step": 1807 + }, + { + "epoch": 0.17596107055961072, + "grad_norm": 1.951279890184611, + "learning_rate": 9.452095106873142e-06, + "loss": 0.3823, + "step": 1808 + }, + { + "epoch": 0.17605839416058394, + "grad_norm": 1.352467757455935, + "learning_rate": 9.45137751170046e-06, + "loss": 0.3137, + "step": 1809 + }, + { + "epoch": 0.17615571776155717, + "grad_norm": 1.3883395327139227, + "learning_rate": 9.450659474191658e-06, + "loss": 0.4878, + "step": 1810 + }, + { + "epoch": 0.17625304136253042, + "grad_norm": 1.5658708451700805, + "learning_rate": 9.449940994418088e-06, + "loss": 0.523, + "step": 1811 + }, + { + "epoch": 0.17635036496350365, + "grad_norm": 1.215080164631292, + "learning_rate": 9.449222072451147e-06, + "loss": 0.3773, + "step": 1812 + }, + { + "epoch": 0.17644768856447687, + "grad_norm": 1.524331324578441, + "learning_rate": 9.448502708362273e-06, + "loss": 0.539, + "step": 1813 + }, + { + "epoch": 0.17654501216545013, + "grad_norm": 1.6985132616371517, + "learning_rate": 9.447782902222951e-06, + "loss": 0.6344, + "step": 1814 + }, + { + "epoch": 0.17664233576642335, + "grad_norm": 1.394156226586294, + "learning_rate": 9.447062654104708e-06, + "loss": 0.4136, + "step": 1815 + }, + { + "epoch": 0.1767396593673966, + "grad_norm": 1.0359913462457855, + "learning_rate": 9.446341964079116e-06, + "loss": 0.2471, + "step": 1816 + }, + { + "epoch": 0.17683698296836983, + "grad_norm": 1.6379291001324041, + "learning_rate": 9.44562083221779e-06, + "loss": 0.4648, + "step": 1817 + }, + { + "epoch": 0.17693430656934306, + "grad_norm": 1.0926982727654353, + "learning_rate": 9.44489925859239e-06, + "loss": 0.253, + "step": 1818 + }, + { + "epoch": 0.1770316301703163, + "grad_norm": 1.3396314447206463, + "learning_rate": 9.444177243274619e-06, + "loss": 0.4053, + "step": 1819 + }, + { + "epoch": 0.17712895377128954, + "grad_norm": 1.2170984864894128, + "learning_rate": 9.44345478633622e-06, + "loss": 0.3483, + "step": 1820 + }, + { + "epoch": 0.17722627737226276, + "grad_norm": 1.9241463489982464, + "learning_rate": 9.442731887848993e-06, + "loss": 0.7875, + "step": 1821 + }, + { + "epoch": 0.17732360097323602, + "grad_norm": 1.7367037011857493, + "learning_rate": 9.442008547884765e-06, + "loss": 0.5423, + "step": 1822 + }, + { + "epoch": 0.17742092457420924, + "grad_norm": 1.7768925691501514, + "learning_rate": 9.441284766515417e-06, + "loss": 0.5332, + "step": 1823 + }, + { + "epoch": 0.17751824817518247, + "grad_norm": 1.544872490519166, + "learning_rate": 9.440560543812872e-06, + "loss": 0.4797, + "step": 1824 + }, + { + "epoch": 0.17761557177615572, + "grad_norm": 1.3959412272112985, + "learning_rate": 9.439835879849097e-06, + "loss": 0.2813, + "step": 1825 + }, + { + "epoch": 0.17771289537712895, + "grad_norm": 1.4333698815114406, + "learning_rate": 9.439110774696101e-06, + "loss": 0.4623, + "step": 1826 + }, + { + "epoch": 0.1778102189781022, + "grad_norm": 1.4483549520432324, + "learning_rate": 9.43838522842594e-06, + "loss": 0.3718, + "step": 1827 + }, + { + "epoch": 0.17790754257907543, + "grad_norm": 1.1321375447475677, + "learning_rate": 9.43765924111071e-06, + "loss": 0.3035, + "step": 1828 + }, + { + "epoch": 0.17800486618004865, + "grad_norm": 1.362326738732822, + "learning_rate": 9.436932812822554e-06, + "loss": 0.316, + "step": 1829 + }, + { + "epoch": 0.1781021897810219, + "grad_norm": 1.460799021966237, + "learning_rate": 9.436205943633656e-06, + "loss": 0.3911, + "step": 1830 + }, + { + "epoch": 0.17819951338199513, + "grad_norm": 1.5389161016090995, + "learning_rate": 9.435478633616247e-06, + "loss": 0.5521, + "step": 1831 + }, + { + "epoch": 0.17829683698296836, + "grad_norm": 1.5219868331018827, + "learning_rate": 9.4347508828426e-06, + "loss": 0.5027, + "step": 1832 + }, + { + "epoch": 0.1783941605839416, + "grad_norm": 1.245087028586955, + "learning_rate": 9.434022691385034e-06, + "loss": 0.2981, + "step": 1833 + }, + { + "epoch": 0.17849148418491484, + "grad_norm": 1.4557548434245557, + "learning_rate": 9.433294059315905e-06, + "loss": 0.2293, + "step": 1834 + }, + { + "epoch": 0.17858880778588807, + "grad_norm": 1.3081558633618169, + "learning_rate": 9.432564986707621e-06, + "loss": 0.4217, + "step": 1835 + }, + { + "epoch": 0.17868613138686132, + "grad_norm": 1.3513560054673133, + "learning_rate": 9.43183547363263e-06, + "loss": 0.4318, + "step": 1836 + }, + { + "epoch": 0.17878345498783454, + "grad_norm": 1.3315264956466353, + "learning_rate": 9.431105520163426e-06, + "loss": 0.3781, + "step": 1837 + }, + { + "epoch": 0.1788807785888078, + "grad_norm": 1.0550787306059675, + "learning_rate": 9.430375126372542e-06, + "loss": 0.3104, + "step": 1838 + }, + { + "epoch": 0.17897810218978102, + "grad_norm": 1.3337629142786684, + "learning_rate": 9.429644292332557e-06, + "loss": 0.3455, + "step": 1839 + }, + { + "epoch": 0.17907542579075425, + "grad_norm": 1.6239197882024916, + "learning_rate": 9.428913018116098e-06, + "loss": 0.5855, + "step": 1840 + }, + { + "epoch": 0.1791727493917275, + "grad_norm": 1.3780162846249417, + "learning_rate": 9.428181303795828e-06, + "loss": 0.3643, + "step": 1841 + }, + { + "epoch": 0.17927007299270073, + "grad_norm": 1.3478310292007554, + "learning_rate": 9.42744914944446e-06, + "loss": 0.3962, + "step": 1842 + }, + { + "epoch": 0.17936739659367396, + "grad_norm": 1.5440243743593307, + "learning_rate": 9.426716555134751e-06, + "loss": 0.6193, + "step": 1843 + }, + { + "epoch": 0.1794647201946472, + "grad_norm": 1.4878960058265709, + "learning_rate": 9.425983520939495e-06, + "loss": 0.473, + "step": 1844 + }, + { + "epoch": 0.17956204379562044, + "grad_norm": 1.672460221871015, + "learning_rate": 9.425250046931539e-06, + "loss": 0.6429, + "step": 1845 + }, + { + "epoch": 0.17965936739659366, + "grad_norm": 1.6015212635221012, + "learning_rate": 9.424516133183762e-06, + "loss": 0.3195, + "step": 1846 + }, + { + "epoch": 0.17975669099756691, + "grad_norm": 1.385761715171386, + "learning_rate": 9.4237817797691e-06, + "loss": 0.4054, + "step": 1847 + }, + { + "epoch": 0.17985401459854014, + "grad_norm": 1.386847906411032, + "learning_rate": 9.423046986760522e-06, + "loss": 0.3825, + "step": 1848 + }, + { + "epoch": 0.1799513381995134, + "grad_norm": 1.087510047515406, + "learning_rate": 9.422311754231047e-06, + "loss": 0.3213, + "step": 1849 + }, + { + "epoch": 0.18004866180048662, + "grad_norm": 1.6065416301387576, + "learning_rate": 9.421576082253734e-06, + "loss": 0.5062, + "step": 1850 + }, + { + "epoch": 0.18014598540145985, + "grad_norm": 1.34096451308299, + "learning_rate": 9.42083997090169e-06, + "loss": 0.4036, + "step": 1851 + }, + { + "epoch": 0.1802433090024331, + "grad_norm": 1.2557739418598393, + "learning_rate": 9.42010342024806e-06, + "loss": 0.3595, + "step": 1852 + }, + { + "epoch": 0.18034063260340633, + "grad_norm": 1.5281441778996137, + "learning_rate": 9.419366430366035e-06, + "loss": 0.604, + "step": 1853 + }, + { + "epoch": 0.18043795620437955, + "grad_norm": 1.2665309724570952, + "learning_rate": 9.418629001328852e-06, + "loss": 0.4205, + "step": 1854 + }, + { + "epoch": 0.1805352798053528, + "grad_norm": 1.3442942382162348, + "learning_rate": 9.417891133209789e-06, + "loss": 0.3457, + "step": 1855 + }, + { + "epoch": 0.18063260340632603, + "grad_norm": 1.4106593198915445, + "learning_rate": 9.417152826082169e-06, + "loss": 0.4812, + "step": 1856 + }, + { + "epoch": 0.18072992700729926, + "grad_norm": 1.4377180846268287, + "learning_rate": 9.416414080019359e-06, + "loss": 0.4618, + "step": 1857 + }, + { + "epoch": 0.1808272506082725, + "grad_norm": 3.1493721230250182, + "learning_rate": 9.415674895094765e-06, + "loss": 0.4636, + "step": 1858 + }, + { + "epoch": 0.18092457420924574, + "grad_norm": 1.2019926414899231, + "learning_rate": 9.414935271381844e-06, + "loss": 0.3081, + "step": 1859 + }, + { + "epoch": 0.181021897810219, + "grad_norm": 2.6470194483303042, + "learning_rate": 9.41419520895409e-06, + "loss": 0.545, + "step": 1860 + }, + { + "epoch": 0.18111922141119222, + "grad_norm": 1.2980614715591199, + "learning_rate": 9.413454707885048e-06, + "loss": 0.2964, + "step": 1861 + }, + { + "epoch": 0.18121654501216544, + "grad_norm": 1.0776172492719038, + "learning_rate": 9.412713768248296e-06, + "loss": 0.3014, + "step": 1862 + }, + { + "epoch": 0.1813138686131387, + "grad_norm": 1.6105644497131084, + "learning_rate": 9.411972390117466e-06, + "loss": 0.2939, + "step": 1863 + }, + { + "epoch": 0.18141119221411192, + "grad_norm": 1.5656908641978677, + "learning_rate": 9.411230573566227e-06, + "loss": 0.5202, + "step": 1864 + }, + { + "epoch": 0.18150851581508515, + "grad_norm": 1.303806212869287, + "learning_rate": 9.410488318668294e-06, + "loss": 0.333, + "step": 1865 + }, + { + "epoch": 0.1816058394160584, + "grad_norm": 1.6655746538236336, + "learning_rate": 9.409745625497427e-06, + "loss": 0.432, + "step": 1866 + }, + { + "epoch": 0.18170316301703163, + "grad_norm": 1.3843667729738216, + "learning_rate": 9.409002494127427e-06, + "loss": 0.3721, + "step": 1867 + }, + { + "epoch": 0.18180048661800485, + "grad_norm": 1.119511993732411, + "learning_rate": 9.408258924632139e-06, + "loss": 0.3344, + "step": 1868 + }, + { + "epoch": 0.1818978102189781, + "grad_norm": 1.402581324947916, + "learning_rate": 9.407514917085451e-06, + "loss": 0.4016, + "step": 1869 + }, + { + "epoch": 0.18199513381995133, + "grad_norm": 1.424239738841203, + "learning_rate": 9.406770471561298e-06, + "loss": 0.4043, + "step": 1870 + }, + { + "epoch": 0.18209245742092459, + "grad_norm": 1.4825401610777273, + "learning_rate": 9.406025588133654e-06, + "loss": 0.5446, + "step": 1871 + }, + { + "epoch": 0.1821897810218978, + "grad_norm": 1.1812973154269832, + "learning_rate": 9.405280266876539e-06, + "loss": 0.3086, + "step": 1872 + }, + { + "epoch": 0.18228710462287104, + "grad_norm": 1.458454653825207, + "learning_rate": 9.404534507864015e-06, + "loss": 0.426, + "step": 1873 + }, + { + "epoch": 0.1823844282238443, + "grad_norm": 1.4345175445802738, + "learning_rate": 9.403788311170193e-06, + "loss": 0.4826, + "step": 1874 + }, + { + "epoch": 0.18248175182481752, + "grad_norm": 1.636664123351898, + "learning_rate": 9.403041676869217e-06, + "loss": 0.5861, + "step": 1875 + }, + { + "epoch": 0.18257907542579074, + "grad_norm": 1.4112207510715695, + "learning_rate": 9.402294605035285e-06, + "loss": 0.3575, + "step": 1876 + }, + { + "epoch": 0.182676399026764, + "grad_norm": 1.5632317164864975, + "learning_rate": 9.401547095742631e-06, + "loss": 0.5798, + "step": 1877 + }, + { + "epoch": 0.18277372262773722, + "grad_norm": 1.2700759423445944, + "learning_rate": 9.400799149065538e-06, + "loss": 0.3928, + "step": 1878 + }, + { + "epoch": 0.18287104622871045, + "grad_norm": 1.1318646905388465, + "learning_rate": 9.400050765078327e-06, + "loss": 0.2783, + "step": 1879 + }, + { + "epoch": 0.1829683698296837, + "grad_norm": 1.1697084872304198, + "learning_rate": 9.399301943855368e-06, + "loss": 0.2715, + "step": 1880 + }, + { + "epoch": 0.18306569343065693, + "grad_norm": 1.4137887426273796, + "learning_rate": 9.39855268547107e-06, + "loss": 0.3049, + "step": 1881 + }, + { + "epoch": 0.18316301703163018, + "grad_norm": 1.3869164554267486, + "learning_rate": 9.397802989999888e-06, + "loss": 0.3526, + "step": 1882 + }, + { + "epoch": 0.1832603406326034, + "grad_norm": 1.3336674996684654, + "learning_rate": 9.39705285751632e-06, + "loss": 0.3914, + "step": 1883 + }, + { + "epoch": 0.18335766423357663, + "grad_norm": 1.2095628873380657, + "learning_rate": 9.396302288094907e-06, + "loss": 0.3577, + "step": 1884 + }, + { + "epoch": 0.1834549878345499, + "grad_norm": 1.4741118747641506, + "learning_rate": 9.395551281810233e-06, + "loss": 0.4753, + "step": 1885 + }, + { + "epoch": 0.1835523114355231, + "grad_norm": 1.5440799623052803, + "learning_rate": 9.394799838736928e-06, + "loss": 0.5143, + "step": 1886 + }, + { + "epoch": 0.18364963503649634, + "grad_norm": 1.6461828641301555, + "learning_rate": 9.394047958949661e-06, + "loss": 0.5046, + "step": 1887 + }, + { + "epoch": 0.1837469586374696, + "grad_norm": 1.3077272649446732, + "learning_rate": 9.393295642523147e-06, + "loss": 0.4505, + "step": 1888 + }, + { + "epoch": 0.18384428223844282, + "grad_norm": 1.3954964938282017, + "learning_rate": 9.392542889532146e-06, + "loss": 0.3752, + "step": 1889 + }, + { + "epoch": 0.18394160583941604, + "grad_norm": 1.4332674159188397, + "learning_rate": 9.391789700051457e-06, + "loss": 0.4102, + "step": 1890 + }, + { + "epoch": 0.1840389294403893, + "grad_norm": 1.5291760471205262, + "learning_rate": 9.391036074155926e-06, + "loss": 0.3892, + "step": 1891 + }, + { + "epoch": 0.18413625304136252, + "grad_norm": 1.3194046059109847, + "learning_rate": 9.390282011920442e-06, + "loss": 0.3402, + "step": 1892 + }, + { + "epoch": 0.18423357664233578, + "grad_norm": 1.1218553674196712, + "learning_rate": 9.389527513419935e-06, + "loss": 0.2705, + "step": 1893 + }, + { + "epoch": 0.184330900243309, + "grad_norm": 1.4415924034763155, + "learning_rate": 9.388772578729382e-06, + "loss": 0.4153, + "step": 1894 + }, + { + "epoch": 0.18442822384428223, + "grad_norm": 1.1449469634853555, + "learning_rate": 9.3880172079238e-06, + "loss": 0.2464, + "step": 1895 + }, + { + "epoch": 0.18452554744525548, + "grad_norm": 1.3609647553229742, + "learning_rate": 9.38726140107825e-06, + "loss": 0.4167, + "step": 1896 + }, + { + "epoch": 0.1846228710462287, + "grad_norm": 1.5005607351629322, + "learning_rate": 9.38650515826784e-06, + "loss": 0.5496, + "step": 1897 + }, + { + "epoch": 0.18472019464720194, + "grad_norm": 1.2988771816540412, + "learning_rate": 9.385748479567715e-06, + "loss": 0.3746, + "step": 1898 + }, + { + "epoch": 0.1848175182481752, + "grad_norm": 1.6297457427665438, + "learning_rate": 9.384991365053066e-06, + "loss": 0.5329, + "step": 1899 + }, + { + "epoch": 0.18491484184914841, + "grad_norm": 1.4260746902123356, + "learning_rate": 9.384233814799133e-06, + "loss": 0.5495, + "step": 1900 + }, + { + "epoch": 0.18501216545012167, + "grad_norm": 1.6131616876000299, + "learning_rate": 9.38347582888119e-06, + "loss": 0.4956, + "step": 1901 + }, + { + "epoch": 0.1851094890510949, + "grad_norm": 1.2427047036633028, + "learning_rate": 9.382717407374559e-06, + "loss": 0.3527, + "step": 1902 + }, + { + "epoch": 0.18520681265206812, + "grad_norm": 1.1650358905093554, + "learning_rate": 9.381958550354607e-06, + "loss": 0.3282, + "step": 1903 + }, + { + "epoch": 0.18530413625304137, + "grad_norm": 1.2422827918011654, + "learning_rate": 9.381199257896738e-06, + "loss": 0.3954, + "step": 1904 + }, + { + "epoch": 0.1854014598540146, + "grad_norm": 1.3772059864511268, + "learning_rate": 9.38043953007641e-06, + "loss": 0.2519, + "step": 1905 + }, + { + "epoch": 0.18549878345498783, + "grad_norm": 1.2627132972091453, + "learning_rate": 9.379679366969108e-06, + "loss": 0.3748, + "step": 1906 + }, + { + "epoch": 0.18559610705596108, + "grad_norm": 1.7742544300786764, + "learning_rate": 9.378918768650379e-06, + "loss": 0.4627, + "step": 1907 + }, + { + "epoch": 0.1856934306569343, + "grad_norm": 1.3460661864821146, + "learning_rate": 9.3781577351958e-06, + "loss": 0.3769, + "step": 1908 + }, + { + "epoch": 0.18579075425790753, + "grad_norm": 1.2948363493096455, + "learning_rate": 9.377396266680993e-06, + "loss": 0.255, + "step": 1909 + }, + { + "epoch": 0.18588807785888078, + "grad_norm": 1.4260435934265066, + "learning_rate": 9.376634363181631e-06, + "loss": 0.4158, + "step": 1910 + }, + { + "epoch": 0.185985401459854, + "grad_norm": 1.4136193355548345, + "learning_rate": 9.375872024773423e-06, + "loss": 0.3764, + "step": 1911 + }, + { + "epoch": 0.18608272506082726, + "grad_norm": 1.2338333390059972, + "learning_rate": 9.375109251532121e-06, + "loss": 0.3785, + "step": 1912 + }, + { + "epoch": 0.1861800486618005, + "grad_norm": 1.535249430616727, + "learning_rate": 9.374346043533524e-06, + "loss": 0.5252, + "step": 1913 + }, + { + "epoch": 0.18627737226277372, + "grad_norm": 1.215284604855692, + "learning_rate": 9.373582400853472e-06, + "loss": 0.3295, + "step": 1914 + }, + { + "epoch": 0.18637469586374697, + "grad_norm": 1.331605367733698, + "learning_rate": 9.372818323567847e-06, + "loss": 0.2818, + "step": 1915 + }, + { + "epoch": 0.1864720194647202, + "grad_norm": 1.3700650260278666, + "learning_rate": 9.37205381175258e-06, + "loss": 0.5125, + "step": 1916 + }, + { + "epoch": 0.18656934306569342, + "grad_norm": 1.0730618437287824, + "learning_rate": 9.371288865483637e-06, + "loss": 0.3608, + "step": 1917 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 1.6775147335354874, + "learning_rate": 9.370523484837033e-06, + "loss": 0.4555, + "step": 1918 + }, + { + "epoch": 0.1867639902676399, + "grad_norm": 1.531630799569193, + "learning_rate": 9.369757669888822e-06, + "loss": 0.502, + "step": 1919 + }, + { + "epoch": 0.18686131386861313, + "grad_norm": 0.924734272033398, + "learning_rate": 9.368991420715109e-06, + "loss": 0.2117, + "step": 1920 + }, + { + "epoch": 0.18695863746958638, + "grad_norm": 1.3568146369682141, + "learning_rate": 9.36822473739203e-06, + "loss": 0.4311, + "step": 1921 + }, + { + "epoch": 0.1870559610705596, + "grad_norm": 1.2577909858711795, + "learning_rate": 9.367457619995776e-06, + "loss": 0.405, + "step": 1922 + }, + { + "epoch": 0.18715328467153286, + "grad_norm": 1.5933524739274278, + "learning_rate": 9.366690068602573e-06, + "loss": 0.627, + "step": 1923 + }, + { + "epoch": 0.18725060827250609, + "grad_norm": 1.279419778059805, + "learning_rate": 9.365922083288694e-06, + "loss": 0.2814, + "step": 1924 + }, + { + "epoch": 0.1873479318734793, + "grad_norm": 1.6336124778487715, + "learning_rate": 9.365153664130454e-06, + "loss": 0.6461, + "step": 1925 + }, + { + "epoch": 0.18744525547445257, + "grad_norm": 5.906434394339674, + "learning_rate": 9.364384811204212e-06, + "loss": 0.5628, + "step": 1926 + }, + { + "epoch": 0.1875425790754258, + "grad_norm": 1.2770793302804129, + "learning_rate": 9.363615524586368e-06, + "loss": 0.303, + "step": 1927 + }, + { + "epoch": 0.18763990267639902, + "grad_norm": 1.2695156624644028, + "learning_rate": 9.362845804353367e-06, + "loss": 0.3592, + "step": 1928 + }, + { + "epoch": 0.18773722627737227, + "grad_norm": 1.4443375056776053, + "learning_rate": 9.362075650581698e-06, + "loss": 0.4701, + "step": 1929 + }, + { + "epoch": 0.1878345498783455, + "grad_norm": 1.4330727776563095, + "learning_rate": 9.36130506334789e-06, + "loss": 0.5163, + "step": 1930 + }, + { + "epoch": 0.18793187347931872, + "grad_norm": 1.326934280688427, + "learning_rate": 9.360534042728517e-06, + "loss": 0.289, + "step": 1931 + }, + { + "epoch": 0.18802919708029198, + "grad_norm": 1.0531370847104877, + "learning_rate": 9.359762588800195e-06, + "loss": 0.1994, + "step": 1932 + }, + { + "epoch": 0.1881265206812652, + "grad_norm": 1.4998435892573359, + "learning_rate": 9.358990701639585e-06, + "loss": 0.4064, + "step": 1933 + }, + { + "epoch": 0.18822384428223846, + "grad_norm": 2.65155925581941, + "learning_rate": 9.358218381323391e-06, + "loss": 0.3513, + "step": 1934 + }, + { + "epoch": 0.18832116788321168, + "grad_norm": 1.280523326704506, + "learning_rate": 9.357445627928356e-06, + "loss": 0.3132, + "step": 1935 + }, + { + "epoch": 0.1884184914841849, + "grad_norm": 1.347047087613105, + "learning_rate": 9.356672441531273e-06, + "loss": 0.3334, + "step": 1936 + }, + { + "epoch": 0.18851581508515816, + "grad_norm": 1.2987558904079175, + "learning_rate": 9.35589882220897e-06, + "loss": 0.3224, + "step": 1937 + }, + { + "epoch": 0.1886131386861314, + "grad_norm": 0.9974048438134153, + "learning_rate": 9.355124770038323e-06, + "loss": 0.2764, + "step": 1938 + }, + { + "epoch": 0.1887104622871046, + "grad_norm": 2.544180913694316, + "learning_rate": 9.354350285096255e-06, + "loss": 0.495, + "step": 1939 + }, + { + "epoch": 0.18880778588807787, + "grad_norm": 1.613510595834776, + "learning_rate": 9.353575367459718e-06, + "loss": 0.5269, + "step": 1940 + }, + { + "epoch": 0.1889051094890511, + "grad_norm": 1.1663508101189002, + "learning_rate": 9.352800017205724e-06, + "loss": 0.3936, + "step": 1941 + }, + { + "epoch": 0.18900243309002432, + "grad_norm": 1.3673811421181858, + "learning_rate": 9.352024234411315e-06, + "loss": 0.4448, + "step": 1942 + }, + { + "epoch": 0.18909975669099757, + "grad_norm": 1.1481373712644614, + "learning_rate": 9.351248019153582e-06, + "loss": 0.3226, + "step": 1943 + }, + { + "epoch": 0.1891970802919708, + "grad_norm": 1.025014870233366, + "learning_rate": 9.350471371509659e-06, + "loss": 0.2095, + "step": 1944 + }, + { + "epoch": 0.18929440389294405, + "grad_norm": 1.6587902238420225, + "learning_rate": 9.349694291556723e-06, + "loss": 0.3805, + "step": 1945 + }, + { + "epoch": 0.18939172749391728, + "grad_norm": 1.568770301353131, + "learning_rate": 9.348916779371993e-06, + "loss": 0.3902, + "step": 1946 + }, + { + "epoch": 0.1894890510948905, + "grad_norm": 1.4274566422005779, + "learning_rate": 9.348138835032727e-06, + "loss": 0.3644, + "step": 1947 + }, + { + "epoch": 0.18958637469586376, + "grad_norm": 1.6590398647584288, + "learning_rate": 9.347360458616233e-06, + "loss": 0.3522, + "step": 1948 + }, + { + "epoch": 0.18968369829683698, + "grad_norm": 1.5905934658559544, + "learning_rate": 9.346581650199859e-06, + "loss": 0.3784, + "step": 1949 + }, + { + "epoch": 0.1897810218978102, + "grad_norm": 1.358850838464726, + "learning_rate": 9.345802409860995e-06, + "loss": 0.3407, + "step": 1950 + }, + { + "epoch": 0.18987834549878346, + "grad_norm": 1.5906740312195304, + "learning_rate": 9.345022737677073e-06, + "loss": 0.4735, + "step": 1951 + }, + { + "epoch": 0.1899756690997567, + "grad_norm": 1.419279223309371, + "learning_rate": 9.344242633725573e-06, + "loss": 0.4677, + "step": 1952 + }, + { + "epoch": 0.19007299270072991, + "grad_norm": 2.368125402390624, + "learning_rate": 9.34346209808401e-06, + "loss": 0.4341, + "step": 1953 + }, + { + "epoch": 0.19017031630170317, + "grad_norm": 1.6018933954570558, + "learning_rate": 9.342681130829949e-06, + "loss": 0.4348, + "step": 1954 + }, + { + "epoch": 0.1902676399026764, + "grad_norm": 1.4757982324740848, + "learning_rate": 9.341899732040996e-06, + "loss": 0.393, + "step": 1955 + }, + { + "epoch": 0.19036496350364965, + "grad_norm": 1.463093762624457, + "learning_rate": 9.341117901794797e-06, + "loss": 0.3787, + "step": 1956 + }, + { + "epoch": 0.19046228710462287, + "grad_norm": 1.5507561900230402, + "learning_rate": 9.340335640169045e-06, + "loss": 0.4715, + "step": 1957 + }, + { + "epoch": 0.1905596107055961, + "grad_norm": 1.4207468273121375, + "learning_rate": 9.339552947241471e-06, + "loss": 0.3938, + "step": 1958 + }, + { + "epoch": 0.19065693430656935, + "grad_norm": 1.407596113402629, + "learning_rate": 9.338769823089853e-06, + "loss": 0.4965, + "step": 1959 + }, + { + "epoch": 0.19075425790754258, + "grad_norm": 1.5505869092648736, + "learning_rate": 9.337986267792014e-06, + "loss": 0.3699, + "step": 1960 + }, + { + "epoch": 0.1908515815085158, + "grad_norm": 1.4558635051434323, + "learning_rate": 9.33720228142581e-06, + "loss": 0.3436, + "step": 1961 + }, + { + "epoch": 0.19094890510948906, + "grad_norm": 1.4210127007858437, + "learning_rate": 9.336417864069152e-06, + "loss": 0.3959, + "step": 1962 + }, + { + "epoch": 0.19104622871046228, + "grad_norm": 1.5797691467496429, + "learning_rate": 9.335633015799983e-06, + "loss": 0.5438, + "step": 1963 + }, + { + "epoch": 0.1911435523114355, + "grad_norm": 1.200940613853037, + "learning_rate": 9.334847736696297e-06, + "loss": 0.3037, + "step": 1964 + }, + { + "epoch": 0.19124087591240876, + "grad_norm": 1.6206966051553, + "learning_rate": 9.334062026836128e-06, + "loss": 0.6412, + "step": 1965 + }, + { + "epoch": 0.191338199513382, + "grad_norm": 1.3678147539203456, + "learning_rate": 9.33327588629755e-06, + "loss": 0.328, + "step": 1966 + }, + { + "epoch": 0.19143552311435524, + "grad_norm": 1.425436568728509, + "learning_rate": 9.332489315158685e-06, + "loss": 0.42, + "step": 1967 + }, + { + "epoch": 0.19153284671532847, + "grad_norm": 1.4740185495034979, + "learning_rate": 9.331702313497693e-06, + "loss": 0.3563, + "step": 1968 + }, + { + "epoch": 0.1916301703163017, + "grad_norm": 1.4865130636524604, + "learning_rate": 9.33091488139278e-06, + "loss": 0.3452, + "step": 1969 + }, + { + "epoch": 0.19172749391727495, + "grad_norm": 1.595704917953399, + "learning_rate": 9.330127018922195e-06, + "loss": 0.6593, + "step": 1970 + }, + { + "epoch": 0.19182481751824818, + "grad_norm": 1.4305855687191487, + "learning_rate": 9.329338726164225e-06, + "loss": 0.4935, + "step": 1971 + }, + { + "epoch": 0.1919221411192214, + "grad_norm": 1.4810316480182457, + "learning_rate": 9.328550003197203e-06, + "loss": 0.4303, + "step": 1972 + }, + { + "epoch": 0.19201946472019465, + "grad_norm": 1.1937939840472271, + "learning_rate": 9.32776085009951e-06, + "loss": 0.3178, + "step": 1973 + }, + { + "epoch": 0.19211678832116788, + "grad_norm": 1.3344201288029265, + "learning_rate": 9.326971266949558e-06, + "loss": 0.3469, + "step": 1974 + }, + { + "epoch": 0.1922141119221411, + "grad_norm": 1.5818137690503504, + "learning_rate": 9.326181253825813e-06, + "loss": 0.505, + "step": 1975 + }, + { + "epoch": 0.19231143552311436, + "grad_norm": 1.263126969220317, + "learning_rate": 9.325390810806778e-06, + "loss": 0.3967, + "step": 1976 + }, + { + "epoch": 0.19240875912408759, + "grad_norm": 1.6967730581105949, + "learning_rate": 9.324599937971e-06, + "loss": 0.7353, + "step": 1977 + }, + { + "epoch": 0.19250608272506084, + "grad_norm": 1.4550804189369502, + "learning_rate": 9.323808635397067e-06, + "loss": 0.3326, + "step": 1978 + }, + { + "epoch": 0.19260340632603407, + "grad_norm": 1.594493767215082, + "learning_rate": 9.323016903163612e-06, + "loss": 0.4547, + "step": 1979 + }, + { + "epoch": 0.1927007299270073, + "grad_norm": 1.4855552398261571, + "learning_rate": 9.322224741349313e-06, + "loss": 0.5095, + "step": 1980 + }, + { + "epoch": 0.19279805352798055, + "grad_norm": 1.3769945503658922, + "learning_rate": 9.321432150032884e-06, + "loss": 0.3853, + "step": 1981 + }, + { + "epoch": 0.19289537712895377, + "grad_norm": 1.3138128708042736, + "learning_rate": 9.320639129293083e-06, + "loss": 0.4129, + "step": 1982 + }, + { + "epoch": 0.192992700729927, + "grad_norm": 1.4617598559962484, + "learning_rate": 9.319845679208719e-06, + "loss": 0.449, + "step": 1983 + }, + { + "epoch": 0.19309002433090025, + "grad_norm": 1.6332060417216765, + "learning_rate": 9.319051799858633e-06, + "loss": 0.594, + "step": 1984 + }, + { + "epoch": 0.19318734793187348, + "grad_norm": 1.5432637765560855, + "learning_rate": 9.318257491321714e-06, + "loss": 0.3465, + "step": 1985 + }, + { + "epoch": 0.1932846715328467, + "grad_norm": 1.4536395238750577, + "learning_rate": 9.317462753676895e-06, + "loss": 0.4212, + "step": 1986 + }, + { + "epoch": 0.19338199513381996, + "grad_norm": 1.3985266204226148, + "learning_rate": 9.31666758700315e-06, + "loss": 0.5313, + "step": 1987 + }, + { + "epoch": 0.19347931873479318, + "grad_norm": 1.4329939166816383, + "learning_rate": 9.315871991379493e-06, + "loss": 0.3958, + "step": 1988 + }, + { + "epoch": 0.19357664233576644, + "grad_norm": 1.3666417803863316, + "learning_rate": 9.315075966884984e-06, + "loss": 0.462, + "step": 1989 + }, + { + "epoch": 0.19367396593673966, + "grad_norm": 1.6059064802064114, + "learning_rate": 9.314279513598721e-06, + "loss": 0.5734, + "step": 1990 + }, + { + "epoch": 0.1937712895377129, + "grad_norm": 1.521730062801285, + "learning_rate": 9.313482631599854e-06, + "loss": 0.3479, + "step": 1991 + }, + { + "epoch": 0.19386861313868614, + "grad_norm": 1.5212897395363751, + "learning_rate": 9.312685320967566e-06, + "loss": 0.4328, + "step": 1992 + }, + { + "epoch": 0.19396593673965937, + "grad_norm": 1.669365255826549, + "learning_rate": 9.311887581781086e-06, + "loss": 0.6153, + "step": 1993 + }, + { + "epoch": 0.1940632603406326, + "grad_norm": 1.1692329123053622, + "learning_rate": 9.311089414119688e-06, + "loss": 0.3149, + "step": 1994 + }, + { + "epoch": 0.19416058394160585, + "grad_norm": 1.4724909439197027, + "learning_rate": 9.310290818062683e-06, + "loss": 0.478, + "step": 1995 + }, + { + "epoch": 0.19425790754257907, + "grad_norm": 1.667688851021317, + "learning_rate": 9.309491793689431e-06, + "loss": 0.6192, + "step": 1996 + }, + { + "epoch": 0.1943552311435523, + "grad_norm": 1.2423474670669281, + "learning_rate": 9.30869234107933e-06, + "loss": 0.4242, + "step": 1997 + }, + { + "epoch": 0.19445255474452555, + "grad_norm": 1.4117486896728357, + "learning_rate": 9.307892460311825e-06, + "loss": 0.4417, + "step": 1998 + }, + { + "epoch": 0.19454987834549878, + "grad_norm": 1.6605518542896853, + "learning_rate": 9.307092151466397e-06, + "loss": 0.5289, + "step": 1999 + }, + { + "epoch": 0.19464720194647203, + "grad_norm": 1.661933360658536, + "learning_rate": 9.306291414622575e-06, + "loss": 0.3357, + "step": 2000 + }, + { + "epoch": 0.19474452554744526, + "grad_norm": 1.4409618985011814, + "learning_rate": 9.305490249859927e-06, + "loss": 0.4563, + "step": 2001 + }, + { + "epoch": 0.19484184914841848, + "grad_norm": 1.9082899591217046, + "learning_rate": 9.304688657258068e-06, + "loss": 0.3445, + "step": 2002 + }, + { + "epoch": 0.19493917274939174, + "grad_norm": 1.2157434891172034, + "learning_rate": 9.303886636896649e-06, + "loss": 0.3719, + "step": 2003 + }, + { + "epoch": 0.19503649635036496, + "grad_norm": 1.57236888854409, + "learning_rate": 9.303084188855371e-06, + "loss": 0.4399, + "step": 2004 + }, + { + "epoch": 0.1951338199513382, + "grad_norm": 1.4041570559360463, + "learning_rate": 9.302281313213973e-06, + "loss": 0.4442, + "step": 2005 + }, + { + "epoch": 0.19523114355231144, + "grad_norm": 1.595081147428658, + "learning_rate": 9.301478010052237e-06, + "loss": 0.4225, + "step": 2006 + }, + { + "epoch": 0.19532846715328467, + "grad_norm": 1.562924823229517, + "learning_rate": 9.300674279449986e-06, + "loss": 0.3739, + "step": 2007 + }, + { + "epoch": 0.1954257907542579, + "grad_norm": 1.6925679153497177, + "learning_rate": 9.299870121487088e-06, + "loss": 0.4465, + "step": 2008 + }, + { + "epoch": 0.19552311435523115, + "grad_norm": 1.4955175500348226, + "learning_rate": 9.299065536243453e-06, + "loss": 0.5055, + "step": 2009 + }, + { + "epoch": 0.19562043795620437, + "grad_norm": 1.5602814755448668, + "learning_rate": 9.298260523799035e-06, + "loss": 0.4214, + "step": 2010 + }, + { + "epoch": 0.19571776155717763, + "grad_norm": 1.4678189187481074, + "learning_rate": 9.297455084233826e-06, + "loss": 0.4221, + "step": 2011 + }, + { + "epoch": 0.19581508515815085, + "grad_norm": 1.1014848505883976, + "learning_rate": 9.296649217627863e-06, + "loss": 0.2531, + "step": 2012 + }, + { + "epoch": 0.19591240875912408, + "grad_norm": 1.553421501855423, + "learning_rate": 9.295842924061227e-06, + "loss": 0.5409, + "step": 2013 + }, + { + "epoch": 0.19600973236009733, + "grad_norm": 1.598118050761176, + "learning_rate": 9.295036203614039e-06, + "loss": 0.4084, + "step": 2014 + }, + { + "epoch": 0.19610705596107056, + "grad_norm": 1.6278848716274248, + "learning_rate": 9.294229056366464e-06, + "loss": 0.5842, + "step": 2015 + }, + { + "epoch": 0.19620437956204378, + "grad_norm": 1.243515264701947, + "learning_rate": 9.293421482398708e-06, + "loss": 0.3504, + "step": 2016 + }, + { + "epoch": 0.19630170316301704, + "grad_norm": 1.4687425329140307, + "learning_rate": 9.29261348179102e-06, + "loss": 0.2732, + "step": 2017 + }, + { + "epoch": 0.19639902676399026, + "grad_norm": 1.8000259635960119, + "learning_rate": 9.291805054623691e-06, + "loss": 0.7865, + "step": 2018 + }, + { + "epoch": 0.1964963503649635, + "grad_norm": 1.5721673591186547, + "learning_rate": 9.290996200977058e-06, + "loss": 0.5686, + "step": 2019 + }, + { + "epoch": 0.19659367396593674, + "grad_norm": 1.4634877349944297, + "learning_rate": 9.290186920931493e-06, + "loss": 0.4884, + "step": 2020 + }, + { + "epoch": 0.19669099756690997, + "grad_norm": 1.8795352763168436, + "learning_rate": 9.289377214567418e-06, + "loss": 0.279, + "step": 2021 + }, + { + "epoch": 0.19678832116788322, + "grad_norm": 1.2525962570268505, + "learning_rate": 9.288567081965292e-06, + "loss": 0.3003, + "step": 2022 + }, + { + "epoch": 0.19688564476885645, + "grad_norm": 1.4414518188882164, + "learning_rate": 9.28775652320562e-06, + "loss": 0.2883, + "step": 2023 + }, + { + "epoch": 0.19698296836982968, + "grad_norm": 1.1469869990322892, + "learning_rate": 9.286945538368946e-06, + "loss": 0.301, + "step": 2024 + }, + { + "epoch": 0.19708029197080293, + "grad_norm": 1.4386800814955665, + "learning_rate": 9.286134127535859e-06, + "loss": 0.417, + "step": 2025 + }, + { + "epoch": 0.19717761557177615, + "grad_norm": 1.4334168701816348, + "learning_rate": 9.28532229078699e-06, + "loss": 0.4694, + "step": 2026 + }, + { + "epoch": 0.19727493917274938, + "grad_norm": 1.2925159318336792, + "learning_rate": 9.28451002820301e-06, + "loss": 0.4438, + "step": 2027 + }, + { + "epoch": 0.19737226277372263, + "grad_norm": 1.1608723700468837, + "learning_rate": 9.283697339864635e-06, + "loss": 0.3899, + "step": 2028 + }, + { + "epoch": 0.19746958637469586, + "grad_norm": 1.0831308664734243, + "learning_rate": 9.282884225852625e-06, + "loss": 0.3594, + "step": 2029 + }, + { + "epoch": 0.19756690997566909, + "grad_norm": 1.3854325468066278, + "learning_rate": 9.282070686247773e-06, + "loss": 0.5111, + "step": 2030 + }, + { + "epoch": 0.19766423357664234, + "grad_norm": 1.2843702051671877, + "learning_rate": 9.281256721130927e-06, + "loss": 0.3298, + "step": 2031 + }, + { + "epoch": 0.19776155717761557, + "grad_norm": 1.4725158786403292, + "learning_rate": 9.280442330582968e-06, + "loss": 0.4776, + "step": 2032 + }, + { + "epoch": 0.19785888077858882, + "grad_norm": 1.2748346913452204, + "learning_rate": 9.279627514684826e-06, + "loss": 0.4438, + "step": 2033 + }, + { + "epoch": 0.19795620437956205, + "grad_norm": 1.406716290626126, + "learning_rate": 9.278812273517465e-06, + "loss": 0.2814, + "step": 2034 + }, + { + "epoch": 0.19805352798053527, + "grad_norm": 1.3303438388967537, + "learning_rate": 9.2779966071619e-06, + "loss": 0.4314, + "step": 2035 + }, + { + "epoch": 0.19815085158150852, + "grad_norm": 1.4134730169408085, + "learning_rate": 9.277180515699183e-06, + "loss": 0.2764, + "step": 2036 + }, + { + "epoch": 0.19824817518248175, + "grad_norm": 1.3255645305073551, + "learning_rate": 9.276363999210407e-06, + "loss": 0.4347, + "step": 2037 + }, + { + "epoch": 0.19834549878345498, + "grad_norm": 1.4369644328356708, + "learning_rate": 9.275547057776713e-06, + "loss": 0.3551, + "step": 2038 + }, + { + "epoch": 0.19844282238442823, + "grad_norm": 1.748281657046459, + "learning_rate": 9.27472969147928e-06, + "loss": 0.4372, + "step": 2039 + }, + { + "epoch": 0.19854014598540146, + "grad_norm": 1.2795189118800725, + "learning_rate": 9.273911900399331e-06, + "loss": 0.4431, + "step": 2040 + }, + { + "epoch": 0.1986374695863747, + "grad_norm": 1.165526474375854, + "learning_rate": 9.273093684618129e-06, + "loss": 0.2936, + "step": 2041 + }, + { + "epoch": 0.19873479318734794, + "grad_norm": 1.6068781771010836, + "learning_rate": 9.272275044216981e-06, + "loss": 0.5125, + "step": 2042 + }, + { + "epoch": 0.19883211678832116, + "grad_norm": 1.4210491087425543, + "learning_rate": 9.271455979277234e-06, + "loss": 0.4142, + "step": 2043 + }, + { + "epoch": 0.19892944038929442, + "grad_norm": 1.6609287753373938, + "learning_rate": 9.270636489880283e-06, + "loss": 0.6728, + "step": 2044 + }, + { + "epoch": 0.19902676399026764, + "grad_norm": 1.3902108507987736, + "learning_rate": 9.26981657610756e-06, + "loss": 0.3492, + "step": 2045 + }, + { + "epoch": 0.19912408759124087, + "grad_norm": 1.6316422644879316, + "learning_rate": 9.268996238040537e-06, + "loss": 0.5029, + "step": 2046 + }, + { + "epoch": 0.19922141119221412, + "grad_norm": 1.2841836791466006, + "learning_rate": 9.268175475760734e-06, + "loss": 0.3849, + "step": 2047 + }, + { + "epoch": 0.19931873479318735, + "grad_norm": 1.319713524379575, + "learning_rate": 9.267354289349712e-06, + "loss": 0.4439, + "step": 2048 + }, + { + "epoch": 0.19941605839416057, + "grad_norm": 1.3549935774985267, + "learning_rate": 9.266532678889071e-06, + "loss": 0.4382, + "step": 2049 + }, + { + "epoch": 0.19951338199513383, + "grad_norm": 1.8518976479625036, + "learning_rate": 9.265710644460455e-06, + "loss": 0.8216, + "step": 2050 + }, + { + "epoch": 0.19961070559610705, + "grad_norm": 1.9509154982810264, + "learning_rate": 9.26488818614555e-06, + "loss": 0.4607, + "step": 2051 + }, + { + "epoch": 0.1997080291970803, + "grad_norm": 1.2954164138913125, + "learning_rate": 9.264065304026087e-06, + "loss": 0.4257, + "step": 2052 + }, + { + "epoch": 0.19980535279805353, + "grad_norm": 1.925685176039115, + "learning_rate": 9.26324199818383e-06, + "loss": 0.6025, + "step": 2053 + }, + { + "epoch": 0.19990267639902676, + "grad_norm": 1.533947029174009, + "learning_rate": 9.262418268700596e-06, + "loss": 0.5443, + "step": 2054 + }, + { + "epoch": 0.2, + "grad_norm": 1.4995274594175463, + "learning_rate": 9.26159411565824e-06, + "loss": 0.5023, + "step": 2055 + }, + { + "epoch": 0.20009732360097324, + "grad_norm": 1.4350182215101954, + "learning_rate": 9.26076953913866e-06, + "loss": 0.3726, + "step": 2056 + }, + { + "epoch": 0.20019464720194646, + "grad_norm": 1.3019491914952392, + "learning_rate": 9.259944539223788e-06, + "loss": 0.4765, + "step": 2057 + }, + { + "epoch": 0.20029197080291972, + "grad_norm": 1.3884509805578256, + "learning_rate": 9.25911911599561e-06, + "loss": 0.338, + "step": 2058 + }, + { + "epoch": 0.20038929440389294, + "grad_norm": 1.488048064619486, + "learning_rate": 9.258293269536146e-06, + "loss": 0.5872, + "step": 2059 + }, + { + "epoch": 0.20048661800486617, + "grad_norm": 1.1548733119099643, + "learning_rate": 9.257466999927464e-06, + "loss": 0.3242, + "step": 2060 + }, + { + "epoch": 0.20058394160583942, + "grad_norm": 1.048222542797774, + "learning_rate": 9.25664030725167e-06, + "loss": 0.3253, + "step": 2061 + }, + { + "epoch": 0.20068126520681265, + "grad_norm": 1.211590892113714, + "learning_rate": 9.255813191590912e-06, + "loss": 0.3414, + "step": 2062 + }, + { + "epoch": 0.2007785888077859, + "grad_norm": 1.3770802107798175, + "learning_rate": 9.254985653027382e-06, + "loss": 0.4031, + "step": 2063 + }, + { + "epoch": 0.20087591240875913, + "grad_norm": 1.4503315973945832, + "learning_rate": 9.25415769164331e-06, + "loss": 0.4799, + "step": 2064 + }, + { + "epoch": 0.20097323600973235, + "grad_norm": 1.3613570222565128, + "learning_rate": 9.253329307520976e-06, + "loss": 0.3932, + "step": 2065 + }, + { + "epoch": 0.2010705596107056, + "grad_norm": 1.436956883536887, + "learning_rate": 9.252500500742692e-06, + "loss": 0.51, + "step": 2066 + }, + { + "epoch": 0.20116788321167883, + "grad_norm": 1.3042874208229347, + "learning_rate": 9.25167127139082e-06, + "loss": 0.3702, + "step": 2067 + }, + { + "epoch": 0.20126520681265206, + "grad_norm": 1.4601934649693376, + "learning_rate": 9.250841619547762e-06, + "loss": 0.3927, + "step": 2068 + }, + { + "epoch": 0.2013625304136253, + "grad_norm": 1.4877017036692342, + "learning_rate": 9.250011545295959e-06, + "loss": 0.5463, + "step": 2069 + }, + { + "epoch": 0.20145985401459854, + "grad_norm": 1.3385891837902342, + "learning_rate": 9.249181048717895e-06, + "loss": 0.3052, + "step": 2070 + }, + { + "epoch": 0.20155717761557176, + "grad_norm": 1.111892744483471, + "learning_rate": 9.2483501298961e-06, + "loss": 0.2342, + "step": 2071 + }, + { + "epoch": 0.20165450121654502, + "grad_norm": 1.4336755713622584, + "learning_rate": 9.247518788913141e-06, + "loss": 0.4416, + "step": 2072 + }, + { + "epoch": 0.20175182481751824, + "grad_norm": 1.4682039909825075, + "learning_rate": 9.246687025851629e-06, + "loss": 0.3044, + "step": 2073 + }, + { + "epoch": 0.2018491484184915, + "grad_norm": 1.1356161216510552, + "learning_rate": 9.245854840794217e-06, + "loss": 0.2913, + "step": 2074 + }, + { + "epoch": 0.20194647201946472, + "grad_norm": 1.2497989015941582, + "learning_rate": 9.2450222338236e-06, + "loss": 0.356, + "step": 2075 + }, + { + "epoch": 0.20204379562043795, + "grad_norm": 1.4662802201560914, + "learning_rate": 9.244189205022514e-06, + "loss": 0.5234, + "step": 2076 + }, + { + "epoch": 0.2021411192214112, + "grad_norm": 1.1493994388606168, + "learning_rate": 9.243355754473738e-06, + "loss": 0.3862, + "step": 2077 + }, + { + "epoch": 0.20223844282238443, + "grad_norm": 1.1352456631925198, + "learning_rate": 9.242521882260093e-06, + "loss": 0.3693, + "step": 2078 + }, + { + "epoch": 0.20233576642335765, + "grad_norm": 1.4112847797443164, + "learning_rate": 9.24168758846444e-06, + "loss": 0.4667, + "step": 2079 + }, + { + "epoch": 0.2024330900243309, + "grad_norm": 1.9587086933310962, + "learning_rate": 9.240852873169686e-06, + "loss": 0.5446, + "step": 2080 + }, + { + "epoch": 0.20253041362530413, + "grad_norm": 1.4532595336328356, + "learning_rate": 9.240017736458772e-06, + "loss": 0.56, + "step": 2081 + }, + { + "epoch": 0.20262773722627736, + "grad_norm": 1.1373158358211433, + "learning_rate": 9.239182178414694e-06, + "loss": 0.3998, + "step": 2082 + }, + { + "epoch": 0.2027250608272506, + "grad_norm": 1.4892855081953407, + "learning_rate": 9.238346199120473e-06, + "loss": 0.5564, + "step": 2083 + }, + { + "epoch": 0.20282238442822384, + "grad_norm": 1.4122351541601532, + "learning_rate": 9.237509798659188e-06, + "loss": 0.4407, + "step": 2084 + }, + { + "epoch": 0.2029197080291971, + "grad_norm": 1.266747153803517, + "learning_rate": 9.236672977113948e-06, + "loss": 0.3898, + "step": 2085 + }, + { + "epoch": 0.20301703163017032, + "grad_norm": 1.3972737248894866, + "learning_rate": 9.23583573456791e-06, + "loss": 0.4855, + "step": 2086 + }, + { + "epoch": 0.20311435523114355, + "grad_norm": 1.6424190339871019, + "learning_rate": 9.234998071104272e-06, + "loss": 0.732, + "step": 2087 + }, + { + "epoch": 0.2032116788321168, + "grad_norm": 1.4973722328869334, + "learning_rate": 9.234159986806275e-06, + "loss": 0.4796, + "step": 2088 + }, + { + "epoch": 0.20330900243309002, + "grad_norm": 1.5629802728678386, + "learning_rate": 9.233321481757196e-06, + "loss": 0.4762, + "step": 2089 + }, + { + "epoch": 0.20340632603406325, + "grad_norm": 1.5273353205689704, + "learning_rate": 9.23248255604036e-06, + "loss": 0.6446, + "step": 2090 + }, + { + "epoch": 0.2035036496350365, + "grad_norm": 1.3835329237350877, + "learning_rate": 9.231643209739128e-06, + "loss": 0.5297, + "step": 2091 + }, + { + "epoch": 0.20360097323600973, + "grad_norm": 1.2187102873763251, + "learning_rate": 9.230803442936911e-06, + "loss": 0.3727, + "step": 2092 + }, + { + "epoch": 0.20369829683698296, + "grad_norm": 1.325749011032711, + "learning_rate": 9.229963255717156e-06, + "loss": 0.5476, + "step": 2093 + }, + { + "epoch": 0.2037956204379562, + "grad_norm": 1.1246093495598513, + "learning_rate": 9.229122648163351e-06, + "loss": 0.3309, + "step": 2094 + }, + { + "epoch": 0.20389294403892944, + "grad_norm": 1.3415111254139396, + "learning_rate": 9.22828162035903e-06, + "loss": 0.4226, + "step": 2095 + }, + { + "epoch": 0.2039902676399027, + "grad_norm": 1.2431047519820402, + "learning_rate": 9.227440172387766e-06, + "loss": 0.2364, + "step": 2096 + }, + { + "epoch": 0.20408759124087592, + "grad_norm": 1.59824202042343, + "learning_rate": 9.226598304333175e-06, + "loss": 0.5713, + "step": 2097 + }, + { + "epoch": 0.20418491484184914, + "grad_norm": 1.3718145057357327, + "learning_rate": 9.22575601627891e-06, + "loss": 0.4366, + "step": 2098 + }, + { + "epoch": 0.2042822384428224, + "grad_norm": 1.8310954422547832, + "learning_rate": 9.224913308308672e-06, + "loss": 0.4098, + "step": 2099 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 1.3433956299970118, + "learning_rate": 9.224070180506202e-06, + "loss": 0.2959, + "step": 2100 + }, + { + "epoch": 0.20447688564476885, + "grad_norm": 1.0277615122037833, + "learning_rate": 9.223226632955283e-06, + "loss": 0.265, + "step": 2101 + }, + { + "epoch": 0.2045742092457421, + "grad_norm": 1.2285380399323877, + "learning_rate": 9.222382665739737e-06, + "loss": 0.3844, + "step": 2102 + }, + { + "epoch": 0.20467153284671533, + "grad_norm": 1.1151094116106592, + "learning_rate": 9.221538278943432e-06, + "loss": 0.2461, + "step": 2103 + }, + { + "epoch": 0.20476885644768855, + "grad_norm": 1.5239102143699876, + "learning_rate": 9.22069347265027e-06, + "loss": 0.4239, + "step": 2104 + }, + { + "epoch": 0.2048661800486618, + "grad_norm": 1.6502658051911525, + "learning_rate": 9.219848246944206e-06, + "loss": 0.6723, + "step": 2105 + }, + { + "epoch": 0.20496350364963503, + "grad_norm": 1.638974040274465, + "learning_rate": 9.219002601909229e-06, + "loss": 0.5068, + "step": 2106 + }, + { + "epoch": 0.20506082725060829, + "grad_norm": 1.4649352184984061, + "learning_rate": 9.218156537629368e-06, + "loss": 0.4698, + "step": 2107 + }, + { + "epoch": 0.2051581508515815, + "grad_norm": 1.5070786345583258, + "learning_rate": 9.217310054188699e-06, + "loss": 0.4654, + "step": 2108 + }, + { + "epoch": 0.20525547445255474, + "grad_norm": 1.2480947756940115, + "learning_rate": 9.216463151671338e-06, + "loss": 0.3614, + "step": 2109 + }, + { + "epoch": 0.205352798053528, + "grad_norm": 1.6536121595263205, + "learning_rate": 9.215615830161443e-06, + "loss": 0.5872, + "step": 2110 + }, + { + "epoch": 0.20545012165450122, + "grad_norm": 1.5559546132859907, + "learning_rate": 9.214768089743211e-06, + "loss": 0.5098, + "step": 2111 + }, + { + "epoch": 0.20554744525547444, + "grad_norm": 1.5691593927804695, + "learning_rate": 9.213919930500884e-06, + "loss": 0.3845, + "step": 2112 + }, + { + "epoch": 0.2056447688564477, + "grad_norm": 1.4385010923740136, + "learning_rate": 9.213071352518744e-06, + "loss": 0.4035, + "step": 2113 + }, + { + "epoch": 0.20574209245742092, + "grad_norm": 1.2415148755341134, + "learning_rate": 9.212222355881111e-06, + "loss": 0.2503, + "step": 2114 + }, + { + "epoch": 0.20583941605839415, + "grad_norm": 1.597224767194554, + "learning_rate": 9.211372940672356e-06, + "loss": 0.3831, + "step": 2115 + }, + { + "epoch": 0.2059367396593674, + "grad_norm": 1.3936071245663937, + "learning_rate": 9.210523106976884e-06, + "loss": 0.3664, + "step": 2116 + }, + { + "epoch": 0.20603406326034063, + "grad_norm": 1.4335641468120297, + "learning_rate": 9.209672854879142e-06, + "loss": 0.3182, + "step": 2117 + }, + { + "epoch": 0.20613138686131388, + "grad_norm": 1.2544256067640176, + "learning_rate": 9.20882218446362e-06, + "loss": 0.2678, + "step": 2118 + }, + { + "epoch": 0.2062287104622871, + "grad_norm": 1.4867246001264303, + "learning_rate": 9.207971095814852e-06, + "loss": 0.4934, + "step": 2119 + }, + { + "epoch": 0.20632603406326033, + "grad_norm": 1.5387304887069146, + "learning_rate": 9.207119589017408e-06, + "loss": 0.4552, + "step": 2120 + }, + { + "epoch": 0.2064233576642336, + "grad_norm": 1.507156387441411, + "learning_rate": 9.206267664155906e-06, + "loss": 0.4209, + "step": 2121 + }, + { + "epoch": 0.2065206812652068, + "grad_norm": 1.3407732350308024, + "learning_rate": 9.205415321315e-06, + "loss": 0.4256, + "step": 2122 + }, + { + "epoch": 0.20661800486618004, + "grad_norm": 1.6313949345186305, + "learning_rate": 9.20456256057939e-06, + "loss": 0.4727, + "step": 2123 + }, + { + "epoch": 0.2067153284671533, + "grad_norm": 1.695026004332969, + "learning_rate": 9.203709382033814e-06, + "loss": 0.6547, + "step": 2124 + }, + { + "epoch": 0.20681265206812652, + "grad_norm": 1.5677721722384952, + "learning_rate": 9.202855785763053e-06, + "loss": 0.4469, + "step": 2125 + }, + { + "epoch": 0.20690997566909974, + "grad_norm": 1.4276579746412523, + "learning_rate": 9.202001771851928e-06, + "loss": 0.4511, + "step": 2126 + }, + { + "epoch": 0.207007299270073, + "grad_norm": 1.365652083209099, + "learning_rate": 9.201147340385304e-06, + "loss": 0.4435, + "step": 2127 + }, + { + "epoch": 0.20710462287104622, + "grad_norm": 1.4014399599326692, + "learning_rate": 9.200292491448086e-06, + "loss": 0.4017, + "step": 2128 + }, + { + "epoch": 0.20720194647201948, + "grad_norm": 1.4131798281318602, + "learning_rate": 9.199437225125223e-06, + "loss": 0.2781, + "step": 2129 + }, + { + "epoch": 0.2072992700729927, + "grad_norm": 1.3392698432345278, + "learning_rate": 9.198581541501702e-06, + "loss": 0.3576, + "step": 2130 + }, + { + "epoch": 0.20739659367396593, + "grad_norm": 1.2859171090531423, + "learning_rate": 9.197725440662552e-06, + "loss": 0.4505, + "step": 2131 + }, + { + "epoch": 0.20749391727493918, + "grad_norm": 1.3075221898254676, + "learning_rate": 9.196868922692845e-06, + "loss": 0.42, + "step": 2132 + }, + { + "epoch": 0.2075912408759124, + "grad_norm": 1.3120969425940014, + "learning_rate": 9.196011987677693e-06, + "loss": 0.3918, + "step": 2133 + }, + { + "epoch": 0.20768856447688563, + "grad_norm": 1.2917866907447901, + "learning_rate": 9.19515463570225e-06, + "loss": 0.4515, + "step": 2134 + }, + { + "epoch": 0.2077858880778589, + "grad_norm": 1.4964227937052923, + "learning_rate": 9.194296866851714e-06, + "loss": 0.4007, + "step": 2135 + }, + { + "epoch": 0.2078832116788321, + "grad_norm": 1.4096694486456338, + "learning_rate": 9.19343868121132e-06, + "loss": 0.5684, + "step": 2136 + }, + { + "epoch": 0.20798053527980534, + "grad_norm": 1.1303877036272907, + "learning_rate": 9.192580078866346e-06, + "loss": 0.2661, + "step": 2137 + }, + { + "epoch": 0.2080778588807786, + "grad_norm": 1.4056619474271335, + "learning_rate": 9.191721059902112e-06, + "loss": 0.4174, + "step": 2138 + }, + { + "epoch": 0.20817518248175182, + "grad_norm": 1.7142064467904727, + "learning_rate": 9.190861624403981e-06, + "loss": 0.4453, + "step": 2139 + }, + { + "epoch": 0.20827250608272507, + "grad_norm": 1.3293557691236777, + "learning_rate": 9.190001772457356e-06, + "loss": 0.4541, + "step": 2140 + }, + { + "epoch": 0.2083698296836983, + "grad_norm": 1.6131133576379075, + "learning_rate": 9.189141504147676e-06, + "loss": 0.3751, + "step": 2141 + }, + { + "epoch": 0.20846715328467152, + "grad_norm": 1.509737357483189, + "learning_rate": 9.188280819560431e-06, + "loss": 0.4757, + "step": 2142 + }, + { + "epoch": 0.20856447688564478, + "grad_norm": 1.479538114231473, + "learning_rate": 9.187419718781149e-06, + "loss": 0.3243, + "step": 2143 + }, + { + "epoch": 0.208661800486618, + "grad_norm": 1.4973982658919327, + "learning_rate": 9.186558201895395e-06, + "loss": 0.3732, + "step": 2144 + }, + { + "epoch": 0.20875912408759123, + "grad_norm": 1.5121453838943797, + "learning_rate": 9.185696268988777e-06, + "loss": 0.5435, + "step": 2145 + }, + { + "epoch": 0.20885644768856448, + "grad_norm": 1.7349033410138828, + "learning_rate": 9.18483392014695e-06, + "loss": 0.6415, + "step": 2146 + }, + { + "epoch": 0.2089537712895377, + "grad_norm": 1.4812330220855032, + "learning_rate": 9.183971155455602e-06, + "loss": 0.4961, + "step": 2147 + }, + { + "epoch": 0.20905109489051094, + "grad_norm": 1.5121767597167877, + "learning_rate": 9.183107975000472e-06, + "loss": 0.5298, + "step": 2148 + }, + { + "epoch": 0.2091484184914842, + "grad_norm": 1.5424817825799644, + "learning_rate": 9.18224437886733e-06, + "loss": 0.4577, + "step": 2149 + }, + { + "epoch": 0.20924574209245742, + "grad_norm": 1.2733853569354763, + "learning_rate": 9.181380367141991e-06, + "loss": 0.3306, + "step": 2150 + }, + { + "epoch": 0.20934306569343067, + "grad_norm": 1.1384650904715041, + "learning_rate": 9.180515939910317e-06, + "loss": 0.3831, + "step": 2151 + }, + { + "epoch": 0.2094403892944039, + "grad_norm": 1.3798308474076018, + "learning_rate": 9.179651097258204e-06, + "loss": 0.4629, + "step": 2152 + }, + { + "epoch": 0.20953771289537712, + "grad_norm": 1.4059733648531154, + "learning_rate": 9.178785839271593e-06, + "loss": 0.4526, + "step": 2153 + }, + { + "epoch": 0.20963503649635037, + "grad_norm": 1.581039004516103, + "learning_rate": 9.177920166036464e-06, + "loss": 0.5397, + "step": 2154 + }, + { + "epoch": 0.2097323600973236, + "grad_norm": 1.4851118969101265, + "learning_rate": 9.17705407763884e-06, + "loss": 0.5052, + "step": 2155 + }, + { + "epoch": 0.20982968369829683, + "grad_norm": 1.3633687775503893, + "learning_rate": 9.176187574164785e-06, + "loss": 0.4427, + "step": 2156 + }, + { + "epoch": 0.20992700729927008, + "grad_norm": 1.360319094739405, + "learning_rate": 9.175320655700407e-06, + "loss": 0.3649, + "step": 2157 + }, + { + "epoch": 0.2100243309002433, + "grad_norm": 1.3829673206277566, + "learning_rate": 9.174453322331844e-06, + "loss": 0.3536, + "step": 2158 + }, + { + "epoch": 0.21012165450121653, + "grad_norm": 1.5804059757696094, + "learning_rate": 9.173585574145292e-06, + "loss": 0.5937, + "step": 2159 + }, + { + "epoch": 0.21021897810218979, + "grad_norm": 1.4991084469228289, + "learning_rate": 9.172717411226975e-06, + "loss": 0.3523, + "step": 2160 + }, + { + "epoch": 0.210316301703163, + "grad_norm": 1.4762289487935065, + "learning_rate": 9.171848833663165e-06, + "loss": 0.4991, + "step": 2161 + }, + { + "epoch": 0.21041362530413626, + "grad_norm": 1.4858484283610454, + "learning_rate": 9.17097984154017e-06, + "loss": 0.5153, + "step": 2162 + }, + { + "epoch": 0.2105109489051095, + "grad_norm": 1.2647097068290445, + "learning_rate": 9.170110434944345e-06, + "loss": 0.3193, + "step": 2163 + }, + { + "epoch": 0.21060827250608272, + "grad_norm": 1.6889738075479466, + "learning_rate": 9.169240613962086e-06, + "loss": 0.4755, + "step": 2164 + }, + { + "epoch": 0.21070559610705597, + "grad_norm": 1.6464662019172414, + "learning_rate": 9.168370378679821e-06, + "loss": 0.5303, + "step": 2165 + }, + { + "epoch": 0.2108029197080292, + "grad_norm": 1.287927301108519, + "learning_rate": 9.16749972918403e-06, + "loss": 0.3231, + "step": 2166 + }, + { + "epoch": 0.21090024330900242, + "grad_norm": 1.378935902738664, + "learning_rate": 9.16662866556123e-06, + "loss": 0.4654, + "step": 2167 + }, + { + "epoch": 0.21099756690997568, + "grad_norm": 1.415652566603492, + "learning_rate": 9.16575718789798e-06, + "loss": 0.42, + "step": 2168 + }, + { + "epoch": 0.2110948905109489, + "grad_norm": 1.189498123796033, + "learning_rate": 9.164885296280875e-06, + "loss": 0.3529, + "step": 2169 + }, + { + "epoch": 0.21119221411192213, + "grad_norm": 1.5371351227791108, + "learning_rate": 9.16401299079656e-06, + "loss": 0.4679, + "step": 2170 + }, + { + "epoch": 0.21128953771289538, + "grad_norm": 1.2493790037654902, + "learning_rate": 9.163140271531714e-06, + "loss": 0.3793, + "step": 2171 + }, + { + "epoch": 0.2113868613138686, + "grad_norm": 1.3836947713855836, + "learning_rate": 9.16226713857306e-06, + "loss": 0.436, + "step": 2172 + }, + { + "epoch": 0.21148418491484186, + "grad_norm": 1.583280621035993, + "learning_rate": 9.161393592007364e-06, + "loss": 0.5673, + "step": 2173 + }, + { + "epoch": 0.2115815085158151, + "grad_norm": 1.336076606512916, + "learning_rate": 9.160519631921427e-06, + "loss": 0.418, + "step": 2174 + }, + { + "epoch": 0.2116788321167883, + "grad_norm": 1.5539773056945747, + "learning_rate": 9.159645258402098e-06, + "loss": 0.4417, + "step": 2175 + }, + { + "epoch": 0.21177615571776157, + "grad_norm": 1.35099904216899, + "learning_rate": 9.158770471536261e-06, + "loss": 0.4389, + "step": 2176 + }, + { + "epoch": 0.2118734793187348, + "grad_norm": 1.5960801985245197, + "learning_rate": 9.157895271410848e-06, + "loss": 0.4444, + "step": 2177 + }, + { + "epoch": 0.21197080291970802, + "grad_norm": 1.343338393224711, + "learning_rate": 9.157019658112825e-06, + "loss": 0.3867, + "step": 2178 + }, + { + "epoch": 0.21206812652068127, + "grad_norm": 1.573040163695098, + "learning_rate": 9.156143631729205e-06, + "loss": 0.5564, + "step": 2179 + }, + { + "epoch": 0.2121654501216545, + "grad_norm": 1.477194998770335, + "learning_rate": 9.155267192347037e-06, + "loss": 0.5053, + "step": 2180 + }, + { + "epoch": 0.21226277372262772, + "grad_norm": 1.4697445687746653, + "learning_rate": 9.154390340053414e-06, + "loss": 0.4462, + "step": 2181 + }, + { + "epoch": 0.21236009732360098, + "grad_norm": 1.2383233673923462, + "learning_rate": 9.15351307493547e-06, + "loss": 0.4023, + "step": 2182 + }, + { + "epoch": 0.2124574209245742, + "grad_norm": 1.73929160255024, + "learning_rate": 9.152635397080377e-06, + "loss": 0.456, + "step": 2183 + }, + { + "epoch": 0.21255474452554746, + "grad_norm": 1.814215933055299, + "learning_rate": 9.151757306575354e-06, + "loss": 0.5283, + "step": 2184 + }, + { + "epoch": 0.21265206812652068, + "grad_norm": 1.440140413882406, + "learning_rate": 9.150878803507655e-06, + "loss": 0.4754, + "step": 2185 + }, + { + "epoch": 0.2127493917274939, + "grad_norm": 1.4991761170210094, + "learning_rate": 9.149999887964577e-06, + "loss": 0.4244, + "step": 2186 + }, + { + "epoch": 0.21284671532846716, + "grad_norm": 1.6045542244692401, + "learning_rate": 9.149120560033461e-06, + "loss": 0.4149, + "step": 2187 + }, + { + "epoch": 0.2129440389294404, + "grad_norm": 1.6999406355422166, + "learning_rate": 9.148240819801684e-06, + "loss": 0.7227, + "step": 2188 + }, + { + "epoch": 0.2130413625304136, + "grad_norm": 1.5383336234101048, + "learning_rate": 9.147360667356667e-06, + "loss": 0.4102, + "step": 2189 + }, + { + "epoch": 0.21313868613138687, + "grad_norm": 1.3100772476716567, + "learning_rate": 9.146480102785871e-06, + "loss": 0.4001, + "step": 2190 + }, + { + "epoch": 0.2132360097323601, + "grad_norm": 1.2113504505529646, + "learning_rate": 9.1455991261768e-06, + "loss": 0.3906, + "step": 2191 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 2.1524156395732996, + "learning_rate": 9.144717737616994e-06, + "loss": 0.3722, + "step": 2192 + }, + { + "epoch": 0.21343065693430657, + "grad_norm": 1.3156410053212892, + "learning_rate": 9.143835937194039e-06, + "loss": 0.414, + "step": 2193 + }, + { + "epoch": 0.2135279805352798, + "grad_norm": 1.382537469614808, + "learning_rate": 9.14295372499556e-06, + "loss": 0.3687, + "step": 2194 + }, + { + "epoch": 0.21362530413625305, + "grad_norm": 1.4106617705657403, + "learning_rate": 9.142071101109224e-06, + "loss": 0.2515, + "step": 2195 + }, + { + "epoch": 0.21372262773722628, + "grad_norm": 1.4292530170893925, + "learning_rate": 9.141188065622736e-06, + "loss": 0.4671, + "step": 2196 + }, + { + "epoch": 0.2138199513381995, + "grad_norm": 1.371262803483025, + "learning_rate": 9.140304618623844e-06, + "loss": 0.4397, + "step": 2197 + }, + { + "epoch": 0.21391727493917276, + "grad_norm": 1.3337172412513854, + "learning_rate": 9.13942076020034e-06, + "loss": 0.4518, + "step": 2198 + }, + { + "epoch": 0.21401459854014598, + "grad_norm": 1.195478639712577, + "learning_rate": 9.138536490440046e-06, + "loss": 0.3236, + "step": 2199 + }, + { + "epoch": 0.2141119221411192, + "grad_norm": 1.6207375008593756, + "learning_rate": 9.13765180943084e-06, + "loss": 0.5147, + "step": 2200 + }, + { + "epoch": 0.21420924574209246, + "grad_norm": 1.457360033672521, + "learning_rate": 9.136766717260631e-06, + "loss": 0.3228, + "step": 2201 + }, + { + "epoch": 0.2143065693430657, + "grad_norm": 1.2314544120773039, + "learning_rate": 9.13588121401737e-06, + "loss": 0.3413, + "step": 2202 + }, + { + "epoch": 0.21440389294403894, + "grad_norm": 1.3614880154600904, + "learning_rate": 9.13499529978905e-06, + "loss": 0.3902, + "step": 2203 + }, + { + "epoch": 0.21450121654501217, + "grad_norm": 1.3431981306372034, + "learning_rate": 9.134108974663707e-06, + "loss": 0.4893, + "step": 2204 + }, + { + "epoch": 0.2145985401459854, + "grad_norm": 1.346114362934121, + "learning_rate": 9.133222238729414e-06, + "loss": 0.4195, + "step": 2205 + }, + { + "epoch": 0.21469586374695865, + "grad_norm": 1.2405202461045035, + "learning_rate": 9.132335092074285e-06, + "loss": 0.4373, + "step": 2206 + }, + { + "epoch": 0.21479318734793187, + "grad_norm": 1.2952176269832685, + "learning_rate": 9.131447534786478e-06, + "loss": 0.3253, + "step": 2207 + }, + { + "epoch": 0.2148905109489051, + "grad_norm": 1.3497804127584312, + "learning_rate": 9.130559566954191e-06, + "loss": 0.4401, + "step": 2208 + }, + { + "epoch": 0.21498783454987835, + "grad_norm": 1.6094605506454212, + "learning_rate": 9.129671188665661e-06, + "loss": 0.5943, + "step": 2209 + }, + { + "epoch": 0.21508515815085158, + "grad_norm": 1.7393737788578179, + "learning_rate": 9.128782400009167e-06, + "loss": 0.6832, + "step": 2210 + }, + { + "epoch": 0.2151824817518248, + "grad_norm": 1.2888456219960003, + "learning_rate": 9.127893201073028e-06, + "loss": 0.4449, + "step": 2211 + }, + { + "epoch": 0.21527980535279806, + "grad_norm": 1.6231451452368957, + "learning_rate": 9.127003591945605e-06, + "loss": 0.6579, + "step": 2212 + }, + { + "epoch": 0.21537712895377129, + "grad_norm": 1.4013330754504585, + "learning_rate": 9.126113572715296e-06, + "loss": 0.5072, + "step": 2213 + }, + { + "epoch": 0.21547445255474454, + "grad_norm": 1.1928349667862592, + "learning_rate": 9.125223143470547e-06, + "loss": 0.2896, + "step": 2214 + }, + { + "epoch": 0.21557177615571776, + "grad_norm": 1.3027255903002162, + "learning_rate": 9.124332304299838e-06, + "loss": 0.3076, + "step": 2215 + }, + { + "epoch": 0.215669099756691, + "grad_norm": 1.6527022746103417, + "learning_rate": 9.123441055291694e-06, + "loss": 0.4688, + "step": 2216 + }, + { + "epoch": 0.21576642335766424, + "grad_norm": 1.3197927862863625, + "learning_rate": 9.122549396534676e-06, + "loss": 0.318, + "step": 2217 + }, + { + "epoch": 0.21586374695863747, + "grad_norm": 1.5297610770776902, + "learning_rate": 9.121657328117392e-06, + "loss": 0.6176, + "step": 2218 + }, + { + "epoch": 0.2159610705596107, + "grad_norm": 1.338041823259507, + "learning_rate": 9.120764850128486e-06, + "loss": 0.3941, + "step": 2219 + }, + { + "epoch": 0.21605839416058395, + "grad_norm": 1.200858421054794, + "learning_rate": 9.119871962656644e-06, + "loss": 0.3758, + "step": 2220 + }, + { + "epoch": 0.21615571776155718, + "grad_norm": 1.5023816592412242, + "learning_rate": 9.118978665790592e-06, + "loss": 0.5032, + "step": 2221 + }, + { + "epoch": 0.2162530413625304, + "grad_norm": 1.2258656459952086, + "learning_rate": 9.118084959619099e-06, + "loss": 0.4489, + "step": 2222 + }, + { + "epoch": 0.21635036496350366, + "grad_norm": 1.717063075964899, + "learning_rate": 9.117190844230971e-06, + "loss": 0.7762, + "step": 2223 + }, + { + "epoch": 0.21644768856447688, + "grad_norm": 1.210140433555958, + "learning_rate": 9.11629631971506e-06, + "loss": 0.4431, + "step": 2224 + }, + { + "epoch": 0.21654501216545013, + "grad_norm": 1.4188251693910732, + "learning_rate": 9.115401386160252e-06, + "loss": 0.3495, + "step": 2225 + }, + { + "epoch": 0.21664233576642336, + "grad_norm": 2.073136961715272, + "learning_rate": 9.11450604365548e-06, + "loss": 0.4268, + "step": 2226 + }, + { + "epoch": 0.2167396593673966, + "grad_norm": 1.5265588328884594, + "learning_rate": 9.113610292289714e-06, + "loss": 0.4303, + "step": 2227 + }, + { + "epoch": 0.21683698296836984, + "grad_norm": 1.3220401272995868, + "learning_rate": 9.112714132151963e-06, + "loss": 0.4221, + "step": 2228 + }, + { + "epoch": 0.21693430656934307, + "grad_norm": 1.4088441022230214, + "learning_rate": 9.111817563331282e-06, + "loss": 0.1886, + "step": 2229 + }, + { + "epoch": 0.2170316301703163, + "grad_norm": 1.3947572498286958, + "learning_rate": 9.110920585916763e-06, + "loss": 0.353, + "step": 2230 + }, + { + "epoch": 0.21712895377128955, + "grad_norm": 1.2369368803593181, + "learning_rate": 9.110023199997537e-06, + "loss": 0.2576, + "step": 2231 + }, + { + "epoch": 0.21722627737226277, + "grad_norm": 1.1860471672244592, + "learning_rate": 9.10912540566278e-06, + "loss": 0.3994, + "step": 2232 + }, + { + "epoch": 0.217323600973236, + "grad_norm": 1.309576411449957, + "learning_rate": 9.108227203001708e-06, + "loss": 0.4453, + "step": 2233 + }, + { + "epoch": 0.21742092457420925, + "grad_norm": 1.6554896930775824, + "learning_rate": 9.10732859210357e-06, + "loss": 0.589, + "step": 2234 + }, + { + "epoch": 0.21751824817518248, + "grad_norm": 1.761859219992272, + "learning_rate": 9.106429573057666e-06, + "loss": 0.726, + "step": 2235 + }, + { + "epoch": 0.21761557177615573, + "grad_norm": 1.35833156484165, + "learning_rate": 9.105530145953335e-06, + "loss": 0.4012, + "step": 2236 + }, + { + "epoch": 0.21771289537712896, + "grad_norm": 3.4502529438559884, + "learning_rate": 9.104630310879944e-06, + "loss": 0.4621, + "step": 2237 + }, + { + "epoch": 0.21781021897810218, + "grad_norm": 1.3357957463599541, + "learning_rate": 9.103730067926922e-06, + "loss": 0.317, + "step": 2238 + }, + { + "epoch": 0.21790754257907544, + "grad_norm": 1.3566642568052916, + "learning_rate": 9.102829417183716e-06, + "loss": 0.4245, + "step": 2239 + }, + { + "epoch": 0.21800486618004866, + "grad_norm": 1.673808040965782, + "learning_rate": 9.10192835873983e-06, + "loss": 0.6908, + "step": 2240 + }, + { + "epoch": 0.2181021897810219, + "grad_norm": 1.8194308130790637, + "learning_rate": 9.101026892684804e-06, + "loss": 0.5157, + "step": 2241 + }, + { + "epoch": 0.21819951338199514, + "grad_norm": 1.4443029228393756, + "learning_rate": 9.100125019108214e-06, + "loss": 0.5417, + "step": 2242 + }, + { + "epoch": 0.21829683698296837, + "grad_norm": 1.4594341846039764, + "learning_rate": 9.099222738099682e-06, + "loss": 0.4297, + "step": 2243 + }, + { + "epoch": 0.2183941605839416, + "grad_norm": 1.3121064822320374, + "learning_rate": 9.098320049748864e-06, + "loss": 0.4646, + "step": 2244 + }, + { + "epoch": 0.21849148418491485, + "grad_norm": 1.5596348242175504, + "learning_rate": 9.097416954145467e-06, + "loss": 0.4877, + "step": 2245 + }, + { + "epoch": 0.21858880778588807, + "grad_norm": 1.1835003302943965, + "learning_rate": 9.096513451379225e-06, + "loss": 0.3548, + "step": 2246 + }, + { + "epoch": 0.21868613138686133, + "grad_norm": 1.4956699498169375, + "learning_rate": 9.095609541539925e-06, + "loss": 0.3958, + "step": 2247 + }, + { + "epoch": 0.21878345498783455, + "grad_norm": 1.3761247023142853, + "learning_rate": 9.094705224717388e-06, + "loss": 0.4076, + "step": 2248 + }, + { + "epoch": 0.21888077858880778, + "grad_norm": 1.2940624946938768, + "learning_rate": 9.093800501001476e-06, + "loss": 0.4989, + "step": 2249 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 1.1389229499303237, + "learning_rate": 9.092895370482091e-06, + "loss": 0.332, + "step": 2250 + }, + { + "epoch": 0.21907542579075426, + "grad_norm": 1.5338979130860617, + "learning_rate": 9.091989833249179e-06, + "loss": 0.5609, + "step": 2251 + }, + { + "epoch": 0.21917274939172748, + "grad_norm": 1.3736786128370664, + "learning_rate": 9.091083889392721e-06, + "loss": 0.3767, + "step": 2252 + }, + { + "epoch": 0.21927007299270074, + "grad_norm": 1.6001218689759074, + "learning_rate": 9.090177539002743e-06, + "loss": 0.5709, + "step": 2253 + }, + { + "epoch": 0.21936739659367396, + "grad_norm": 1.2578364778685514, + "learning_rate": 9.089270782169308e-06, + "loss": 0.3796, + "step": 2254 + }, + { + "epoch": 0.2194647201946472, + "grad_norm": 1.5508865589735865, + "learning_rate": 9.088363618982523e-06, + "loss": 0.5947, + "step": 2255 + }, + { + "epoch": 0.21956204379562044, + "grad_norm": 1.2646857650137902, + "learning_rate": 9.08745604953253e-06, + "loss": 0.3024, + "step": 2256 + }, + { + "epoch": 0.21965936739659367, + "grad_norm": 1.1168071392771144, + "learning_rate": 9.08654807390952e-06, + "loss": 0.3113, + "step": 2257 + }, + { + "epoch": 0.21975669099756692, + "grad_norm": 1.238369237619726, + "learning_rate": 9.085639692203713e-06, + "loss": 0.2179, + "step": 2258 + }, + { + "epoch": 0.21985401459854015, + "grad_norm": 1.2485790759653945, + "learning_rate": 9.084730904505381e-06, + "loss": 0.3763, + "step": 2259 + }, + { + "epoch": 0.21995133819951337, + "grad_norm": 1.6082877032407055, + "learning_rate": 9.083821710904827e-06, + "loss": 0.3831, + "step": 2260 + }, + { + "epoch": 0.22004866180048663, + "grad_norm": 1.3213256018887491, + "learning_rate": 9.082912111492401e-06, + "loss": 0.4091, + "step": 2261 + }, + { + "epoch": 0.22014598540145985, + "grad_norm": 1.5899440724355371, + "learning_rate": 9.08200210635849e-06, + "loss": 0.4491, + "step": 2262 + }, + { + "epoch": 0.22024330900243308, + "grad_norm": 1.30089465497526, + "learning_rate": 9.081091695593518e-06, + "loss": 0.3762, + "step": 2263 + }, + { + "epoch": 0.22034063260340633, + "grad_norm": 1.5403984971127525, + "learning_rate": 9.080180879287957e-06, + "loss": 0.438, + "step": 2264 + }, + { + "epoch": 0.22043795620437956, + "grad_norm": 1.5500984898931875, + "learning_rate": 9.079269657532312e-06, + "loss": 0.398, + "step": 2265 + }, + { + "epoch": 0.22053527980535279, + "grad_norm": 1.4834461719298844, + "learning_rate": 9.078358030417136e-06, + "loss": 0.6175, + "step": 2266 + }, + { + "epoch": 0.22063260340632604, + "grad_norm": 1.3553003212010182, + "learning_rate": 9.077445998033015e-06, + "loss": 0.2719, + "step": 2267 + }, + { + "epoch": 0.22072992700729926, + "grad_norm": 1.573783871238475, + "learning_rate": 9.07653356047058e-06, + "loss": 0.2328, + "step": 2268 + }, + { + "epoch": 0.22082725060827252, + "grad_norm": 1.54928316645126, + "learning_rate": 9.075620717820498e-06, + "loss": 0.3514, + "step": 2269 + }, + { + "epoch": 0.22092457420924574, + "grad_norm": 1.3616253433976528, + "learning_rate": 9.07470747017348e-06, + "loss": 0.4636, + "step": 2270 + }, + { + "epoch": 0.22102189781021897, + "grad_norm": 1.6741713680481711, + "learning_rate": 9.073793817620277e-06, + "loss": 0.6321, + "step": 2271 + }, + { + "epoch": 0.22111922141119222, + "grad_norm": 1.3794305685281492, + "learning_rate": 9.07287976025168e-06, + "loss": 0.3172, + "step": 2272 + }, + { + "epoch": 0.22121654501216545, + "grad_norm": 1.362894347632133, + "learning_rate": 9.071965298158516e-06, + "loss": 0.3989, + "step": 2273 + }, + { + "epoch": 0.22131386861313868, + "grad_norm": 1.4233131262232992, + "learning_rate": 9.071050431431658e-06, + "loss": 0.4922, + "step": 2274 + }, + { + "epoch": 0.22141119221411193, + "grad_norm": 1.4905332812995968, + "learning_rate": 9.070135160162016e-06, + "loss": 0.3952, + "step": 2275 + }, + { + "epoch": 0.22150851581508516, + "grad_norm": 1.4389307945528345, + "learning_rate": 9.069219484440541e-06, + "loss": 0.4364, + "step": 2276 + }, + { + "epoch": 0.22160583941605838, + "grad_norm": 1.4796907096594347, + "learning_rate": 9.068303404358226e-06, + "loss": 0.4842, + "step": 2277 + }, + { + "epoch": 0.22170316301703163, + "grad_norm": 1.6561415294899449, + "learning_rate": 9.0673869200061e-06, + "loss": 0.5595, + "step": 2278 + }, + { + "epoch": 0.22180048661800486, + "grad_norm": 1.4198474890784685, + "learning_rate": 9.066470031475236e-06, + "loss": 0.4762, + "step": 2279 + }, + { + "epoch": 0.22189781021897811, + "grad_norm": 1.437724469115563, + "learning_rate": 9.065552738856745e-06, + "loss": 0.3687, + "step": 2280 + }, + { + "epoch": 0.22199513381995134, + "grad_norm": 1.2431258010669888, + "learning_rate": 9.06463504224178e-06, + "loss": 0.3854, + "step": 2281 + }, + { + "epoch": 0.22209245742092457, + "grad_norm": 1.362042407967867, + "learning_rate": 9.063716941721534e-06, + "loss": 0.3981, + "step": 2282 + }, + { + "epoch": 0.22218978102189782, + "grad_norm": 1.3260780267557537, + "learning_rate": 9.062798437387236e-06, + "loss": 0.4304, + "step": 2283 + }, + { + "epoch": 0.22228710462287105, + "grad_norm": 1.2009742636293355, + "learning_rate": 9.06187952933016e-06, + "loss": 0.3441, + "step": 2284 + }, + { + "epoch": 0.22238442822384427, + "grad_norm": 1.7089934430562992, + "learning_rate": 9.060960217641618e-06, + "loss": 0.3488, + "step": 2285 + }, + { + "epoch": 0.22248175182481753, + "grad_norm": 1.3539106224768682, + "learning_rate": 9.060040502412965e-06, + "loss": 0.3617, + "step": 2286 + }, + { + "epoch": 0.22257907542579075, + "grad_norm": 1.3952537396094973, + "learning_rate": 9.05912038373559e-06, + "loss": 0.4507, + "step": 2287 + }, + { + "epoch": 0.22267639902676398, + "grad_norm": 1.201207552744405, + "learning_rate": 9.058199861700928e-06, + "loss": 0.3074, + "step": 2288 + }, + { + "epoch": 0.22277372262773723, + "grad_norm": 1.1918182161083974, + "learning_rate": 9.057278936400453e-06, + "loss": 0.3713, + "step": 2289 + }, + { + "epoch": 0.22287104622871046, + "grad_norm": 1.5864015097741249, + "learning_rate": 9.056357607925674e-06, + "loss": 0.4651, + "step": 2290 + }, + { + "epoch": 0.2229683698296837, + "grad_norm": 1.0855034708664277, + "learning_rate": 9.055435876368148e-06, + "loss": 0.2361, + "step": 2291 + }, + { + "epoch": 0.22306569343065694, + "grad_norm": 1.1945153364440069, + "learning_rate": 9.054513741819466e-06, + "loss": 0.2803, + "step": 2292 + }, + { + "epoch": 0.22316301703163016, + "grad_norm": 1.3734264039165323, + "learning_rate": 9.053591204371262e-06, + "loss": 0.3709, + "step": 2293 + }, + { + "epoch": 0.22326034063260342, + "grad_norm": 1.662571628719731, + "learning_rate": 9.052668264115206e-06, + "loss": 0.6615, + "step": 2294 + }, + { + "epoch": 0.22335766423357664, + "grad_norm": 1.4371203045482563, + "learning_rate": 9.051744921143015e-06, + "loss": 0.4082, + "step": 2295 + }, + { + "epoch": 0.22345498783454987, + "grad_norm": 1.5571182647752952, + "learning_rate": 9.050821175546442e-06, + "loss": 0.5338, + "step": 2296 + }, + { + "epoch": 0.22355231143552312, + "grad_norm": 1.4022335338581293, + "learning_rate": 9.049897027417277e-06, + "loss": 0.3933, + "step": 2297 + }, + { + "epoch": 0.22364963503649635, + "grad_norm": 1.2815006290096387, + "learning_rate": 9.048972476847356e-06, + "loss": 0.4662, + "step": 2298 + }, + { + "epoch": 0.22374695863746957, + "grad_norm": 1.4344706750679865, + "learning_rate": 9.04804752392855e-06, + "loss": 0.4422, + "step": 2299 + }, + { + "epoch": 0.22384428223844283, + "grad_norm": 1.2984999163116793, + "learning_rate": 9.047122168752775e-06, + "loss": 0.3659, + "step": 2300 + }, + { + "epoch": 0.22394160583941605, + "grad_norm": 1.1587669196843096, + "learning_rate": 9.046196411411982e-06, + "loss": 0.2974, + "step": 2301 + }, + { + "epoch": 0.2240389294403893, + "grad_norm": 2.322228254141064, + "learning_rate": 9.045270251998166e-06, + "loss": 0.5667, + "step": 2302 + }, + { + "epoch": 0.22413625304136253, + "grad_norm": 1.5137300738559605, + "learning_rate": 9.044343690603358e-06, + "loss": 0.3889, + "step": 2303 + }, + { + "epoch": 0.22423357664233576, + "grad_norm": 1.472679239189759, + "learning_rate": 9.04341672731963e-06, + "loss": 0.4875, + "step": 2304 + }, + { + "epoch": 0.224330900243309, + "grad_norm": 1.391957619608358, + "learning_rate": 9.042489362239097e-06, + "loss": 0.4513, + "step": 2305 + }, + { + "epoch": 0.22442822384428224, + "grad_norm": 1.5752423841676473, + "learning_rate": 9.041561595453914e-06, + "loss": 0.6021, + "step": 2306 + }, + { + "epoch": 0.22452554744525546, + "grad_norm": 1.340696458312585, + "learning_rate": 9.040633427056268e-06, + "loss": 0.36, + "step": 2307 + }, + { + "epoch": 0.22462287104622872, + "grad_norm": 1.319309191993897, + "learning_rate": 9.039704857138396e-06, + "loss": 0.2632, + "step": 2308 + }, + { + "epoch": 0.22472019464720194, + "grad_norm": 1.3567748798839634, + "learning_rate": 9.03877588579257e-06, + "loss": 0.4085, + "step": 2309 + }, + { + "epoch": 0.22481751824817517, + "grad_norm": 1.7234931003044007, + "learning_rate": 9.0378465131111e-06, + "loss": 0.5366, + "step": 2310 + }, + { + "epoch": 0.22491484184914842, + "grad_norm": 1.3431964443797024, + "learning_rate": 9.036916739186341e-06, + "loss": 0.3406, + "step": 2311 + }, + { + "epoch": 0.22501216545012165, + "grad_norm": 1.6143507102825565, + "learning_rate": 9.035986564110685e-06, + "loss": 0.6322, + "step": 2312 + }, + { + "epoch": 0.2251094890510949, + "grad_norm": 1.421713348254314, + "learning_rate": 9.035055987976563e-06, + "loss": 0.3963, + "step": 2313 + }, + { + "epoch": 0.22520681265206813, + "grad_norm": 1.5860325075452377, + "learning_rate": 9.034125010876447e-06, + "loss": 0.4722, + "step": 2314 + }, + { + "epoch": 0.22530413625304135, + "grad_norm": 1.633700480684755, + "learning_rate": 9.03319363290285e-06, + "loss": 0.2649, + "step": 2315 + }, + { + "epoch": 0.2254014598540146, + "grad_norm": 1.5598775600409591, + "learning_rate": 9.03226185414832e-06, + "loss": 0.4778, + "step": 2316 + }, + { + "epoch": 0.22549878345498783, + "grad_norm": 1.4413798673536165, + "learning_rate": 9.031329674705455e-06, + "loss": 0.3182, + "step": 2317 + }, + { + "epoch": 0.22559610705596106, + "grad_norm": 1.437989358950148, + "learning_rate": 9.03039709466688e-06, + "loss": 0.4297, + "step": 2318 + }, + { + "epoch": 0.2256934306569343, + "grad_norm": 1.3355568683760275, + "learning_rate": 9.029464114125267e-06, + "loss": 0.3393, + "step": 2319 + }, + { + "epoch": 0.22579075425790754, + "grad_norm": 1.353161962413978, + "learning_rate": 9.028530733173332e-06, + "loss": 0.3362, + "step": 2320 + }, + { + "epoch": 0.22588807785888076, + "grad_norm": 1.1699742479017108, + "learning_rate": 9.027596951903819e-06, + "loss": 0.3674, + "step": 2321 + }, + { + "epoch": 0.22598540145985402, + "grad_norm": 1.1235278882417843, + "learning_rate": 9.026662770409524e-06, + "loss": 0.3209, + "step": 2322 + }, + { + "epoch": 0.22608272506082724, + "grad_norm": 1.4951135995374567, + "learning_rate": 9.025728188783273e-06, + "loss": 0.4297, + "step": 2323 + }, + { + "epoch": 0.2261800486618005, + "grad_norm": 1.3046514997255336, + "learning_rate": 9.024793207117937e-06, + "loss": 0.3765, + "step": 2324 + }, + { + "epoch": 0.22627737226277372, + "grad_norm": 1.3346554142143854, + "learning_rate": 9.023857825506426e-06, + "loss": 0.5228, + "step": 2325 + }, + { + "epoch": 0.22637469586374695, + "grad_norm": 1.4309619163867682, + "learning_rate": 9.022922044041691e-06, + "loss": 0.4605, + "step": 2326 + }, + { + "epoch": 0.2264720194647202, + "grad_norm": 1.5152634651556307, + "learning_rate": 9.021985862816718e-06, + "loss": 0.5553, + "step": 2327 + }, + { + "epoch": 0.22656934306569343, + "grad_norm": 1.3885182055556289, + "learning_rate": 9.02104928192454e-06, + "loss": 0.4831, + "step": 2328 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 1.2729317064328092, + "learning_rate": 9.020112301458221e-06, + "loss": 0.4314, + "step": 2329 + }, + { + "epoch": 0.2267639902676399, + "grad_norm": 0.9679503678492228, + "learning_rate": 9.019174921510874e-06, + "loss": 0.1925, + "step": 2330 + }, + { + "epoch": 0.22686131386861313, + "grad_norm": 1.4513146393120597, + "learning_rate": 9.018237142175643e-06, + "loss": 0.5487, + "step": 2331 + }, + { + "epoch": 0.2269586374695864, + "grad_norm": 1.5377065039176208, + "learning_rate": 9.017298963545718e-06, + "loss": 0.4063, + "step": 2332 + }, + { + "epoch": 0.22705596107055961, + "grad_norm": 1.0180180453516632, + "learning_rate": 9.016360385714324e-06, + "loss": 0.2101, + "step": 2333 + }, + { + "epoch": 0.22715328467153284, + "grad_norm": 1.3145676629552665, + "learning_rate": 9.015421408774732e-06, + "loss": 0.4575, + "step": 2334 + }, + { + "epoch": 0.2272506082725061, + "grad_norm": 1.3213351651174845, + "learning_rate": 9.014482032820247e-06, + "loss": 0.3924, + "step": 2335 + }, + { + "epoch": 0.22734793187347932, + "grad_norm": 1.9370834148842127, + "learning_rate": 9.013542257944212e-06, + "loss": 0.4332, + "step": 2336 + }, + { + "epoch": 0.22744525547445255, + "grad_norm": 1.4754695985325648, + "learning_rate": 9.012602084240018e-06, + "loss": 0.4014, + "step": 2337 + }, + { + "epoch": 0.2275425790754258, + "grad_norm": 1.1124893316550342, + "learning_rate": 9.011661511801088e-06, + "loss": 0.2957, + "step": 2338 + }, + { + "epoch": 0.22763990267639903, + "grad_norm": 1.2537185195667433, + "learning_rate": 9.010720540720888e-06, + "loss": 0.3004, + "step": 2339 + }, + { + "epoch": 0.22773722627737225, + "grad_norm": 1.4597689601256807, + "learning_rate": 9.009779171092923e-06, + "loss": 0.2555, + "step": 2340 + }, + { + "epoch": 0.2278345498783455, + "grad_norm": 1.4737791439989423, + "learning_rate": 9.008837403010736e-06, + "loss": 0.5355, + "step": 2341 + }, + { + "epoch": 0.22793187347931873, + "grad_norm": 1.3795639069131398, + "learning_rate": 9.007895236567913e-06, + "loss": 0.3961, + "step": 2342 + }, + { + "epoch": 0.22802919708029198, + "grad_norm": 1.6364796903185053, + "learning_rate": 9.006952671858078e-06, + "loss": 0.444, + "step": 2343 + }, + { + "epoch": 0.2281265206812652, + "grad_norm": 1.1964346909925698, + "learning_rate": 9.006009708974892e-06, + "loss": 0.3297, + "step": 2344 + }, + { + "epoch": 0.22822384428223844, + "grad_norm": 1.343808771666808, + "learning_rate": 9.00506634801206e-06, + "loss": 0.4537, + "step": 2345 + }, + { + "epoch": 0.2283211678832117, + "grad_norm": 1.4003110727355261, + "learning_rate": 9.004122589063323e-06, + "loss": 0.3883, + "step": 2346 + }, + { + "epoch": 0.22841849148418492, + "grad_norm": 1.2435101838594087, + "learning_rate": 9.003178432222462e-06, + "loss": 0.4238, + "step": 2347 + }, + { + "epoch": 0.22851581508515814, + "grad_norm": 1.324643227390155, + "learning_rate": 9.0022338775833e-06, + "loss": 0.4139, + "step": 2348 + }, + { + "epoch": 0.2286131386861314, + "grad_norm": 1.7692069120616638, + "learning_rate": 9.001288925239698e-06, + "loss": 0.4719, + "step": 2349 + }, + { + "epoch": 0.22871046228710462, + "grad_norm": 1.223562422765287, + "learning_rate": 9.000343575285555e-06, + "loss": 0.3256, + "step": 2350 + }, + { + "epoch": 0.22880778588807785, + "grad_norm": 1.3407025045830592, + "learning_rate": 8.999397827814812e-06, + "loss": 0.3788, + "step": 2351 + }, + { + "epoch": 0.2289051094890511, + "grad_norm": 1.5281139100341292, + "learning_rate": 8.99845168292145e-06, + "loss": 0.5565, + "step": 2352 + }, + { + "epoch": 0.22900243309002433, + "grad_norm": 1.560155712083658, + "learning_rate": 8.997505140699488e-06, + "loss": 0.4957, + "step": 2353 + }, + { + "epoch": 0.22909975669099758, + "grad_norm": 1.290422773797366, + "learning_rate": 8.996558201242981e-06, + "loss": 0.4011, + "step": 2354 + }, + { + "epoch": 0.2291970802919708, + "grad_norm": 1.2847680894150124, + "learning_rate": 8.99561086464603e-06, + "loss": 0.4419, + "step": 2355 + }, + { + "epoch": 0.22929440389294403, + "grad_norm": 1.4625413220428547, + "learning_rate": 8.99466313100277e-06, + "loss": 0.2511, + "step": 2356 + }, + { + "epoch": 0.22939172749391729, + "grad_norm": 1.2882840667194135, + "learning_rate": 8.99371500040738e-06, + "loss": 0.3992, + "step": 2357 + }, + { + "epoch": 0.2294890510948905, + "grad_norm": 1.1997126453782205, + "learning_rate": 8.992766472954077e-06, + "loss": 0.2639, + "step": 2358 + }, + { + "epoch": 0.22958637469586374, + "grad_norm": 1.6688893120724655, + "learning_rate": 8.991817548737114e-06, + "loss": 0.3103, + "step": 2359 + }, + { + "epoch": 0.229683698296837, + "grad_norm": 1.4031771252981649, + "learning_rate": 8.990868227850788e-06, + "loss": 0.4245, + "step": 2360 + }, + { + "epoch": 0.22978102189781022, + "grad_norm": 1.4825462721346627, + "learning_rate": 8.989918510389432e-06, + "loss": 0.3973, + "step": 2361 + }, + { + "epoch": 0.22987834549878344, + "grad_norm": 1.7756990641125774, + "learning_rate": 8.988968396447424e-06, + "loss": 0.6091, + "step": 2362 + }, + { + "epoch": 0.2299756690997567, + "grad_norm": 1.5519381803018173, + "learning_rate": 8.988017886119172e-06, + "loss": 0.5849, + "step": 2363 + }, + { + "epoch": 0.23007299270072992, + "grad_norm": 1.5288537407748173, + "learning_rate": 8.987066979499133e-06, + "loss": 0.594, + "step": 2364 + }, + { + "epoch": 0.23017031630170318, + "grad_norm": 1.2519254160654887, + "learning_rate": 8.986115676681797e-06, + "loss": 0.3781, + "step": 2365 + }, + { + "epoch": 0.2302676399026764, + "grad_norm": 1.2118409754918265, + "learning_rate": 8.985163977761697e-06, + "loss": 0.3761, + "step": 2366 + }, + { + "epoch": 0.23036496350364963, + "grad_norm": 1.3123505825187787, + "learning_rate": 8.984211882833402e-06, + "loss": 0.405, + "step": 2367 + }, + { + "epoch": 0.23046228710462288, + "grad_norm": 1.6027642184293107, + "learning_rate": 8.983259391991524e-06, + "loss": 0.597, + "step": 2368 + }, + { + "epoch": 0.2305596107055961, + "grad_norm": 1.3646497443348367, + "learning_rate": 8.982306505330712e-06, + "loss": 0.4036, + "step": 2369 + }, + { + "epoch": 0.23065693430656933, + "grad_norm": 1.2894115553392402, + "learning_rate": 8.981353222945653e-06, + "loss": 0.2778, + "step": 2370 + }, + { + "epoch": 0.2307542579075426, + "grad_norm": 1.27883786418869, + "learning_rate": 8.98039954493108e-06, + "loss": 0.3803, + "step": 2371 + }, + { + "epoch": 0.2308515815085158, + "grad_norm": 1.5863647637061415, + "learning_rate": 8.979445471381755e-06, + "loss": 0.4716, + "step": 2372 + }, + { + "epoch": 0.23094890510948904, + "grad_norm": 1.1874137646332688, + "learning_rate": 8.97849100239249e-06, + "loss": 0.2846, + "step": 2373 + }, + { + "epoch": 0.2310462287104623, + "grad_norm": 1.6936318641369774, + "learning_rate": 8.977536138058126e-06, + "loss": 0.3418, + "step": 2374 + }, + { + "epoch": 0.23114355231143552, + "grad_norm": 1.0526167143851337, + "learning_rate": 8.976580878473553e-06, + "loss": 0.258, + "step": 2375 + }, + { + "epoch": 0.23124087591240877, + "grad_norm": 1.753799809070063, + "learning_rate": 8.975625223733693e-06, + "loss": 0.4764, + "step": 2376 + }, + { + "epoch": 0.231338199513382, + "grad_norm": 1.3814482775299988, + "learning_rate": 8.97466917393351e-06, + "loss": 0.3811, + "step": 2377 + }, + { + "epoch": 0.23143552311435522, + "grad_norm": 1.575424754678499, + "learning_rate": 8.97371272916801e-06, + "loss": 0.5028, + "step": 2378 + }, + { + "epoch": 0.23153284671532848, + "grad_norm": 1.5163540217481704, + "learning_rate": 8.972755889532234e-06, + "loss": 0.4055, + "step": 2379 + }, + { + "epoch": 0.2316301703163017, + "grad_norm": 1.1877796947964157, + "learning_rate": 8.971798655121264e-06, + "loss": 0.2978, + "step": 2380 + }, + { + "epoch": 0.23172749391727493, + "grad_norm": 1.6274909221671408, + "learning_rate": 8.970841026030218e-06, + "loss": 0.4319, + "step": 2381 + }, + { + "epoch": 0.23182481751824818, + "grad_norm": 1.413480143472021, + "learning_rate": 8.969883002354259e-06, + "loss": 0.4015, + "step": 2382 + }, + { + "epoch": 0.2319221411192214, + "grad_norm": 1.451327617189514, + "learning_rate": 8.968924584188587e-06, + "loss": 0.5107, + "step": 2383 + }, + { + "epoch": 0.23201946472019463, + "grad_norm": 1.4288160659587352, + "learning_rate": 8.96796577162844e-06, + "loss": 0.369, + "step": 2384 + }, + { + "epoch": 0.2321167883211679, + "grad_norm": 1.6469132304956866, + "learning_rate": 8.967006564769094e-06, + "loss": 0.5982, + "step": 2385 + }, + { + "epoch": 0.23221411192214111, + "grad_norm": 1.4887239693800984, + "learning_rate": 8.966046963705869e-06, + "loss": 0.4967, + "step": 2386 + }, + { + "epoch": 0.23231143552311437, + "grad_norm": 1.2469481884120308, + "learning_rate": 8.965086968534116e-06, + "loss": 0.4022, + "step": 2387 + }, + { + "epoch": 0.2324087591240876, + "grad_norm": 2.6320603198934047, + "learning_rate": 8.964126579349237e-06, + "loss": 0.2489, + "step": 2388 + }, + { + "epoch": 0.23250608272506082, + "grad_norm": 1.2339093742509784, + "learning_rate": 8.963165796246663e-06, + "loss": 0.3694, + "step": 2389 + }, + { + "epoch": 0.23260340632603407, + "grad_norm": 1.4634162966788549, + "learning_rate": 8.962204619321866e-06, + "loss": 0.5646, + "step": 2390 + }, + { + "epoch": 0.2327007299270073, + "grad_norm": 1.2919651066139786, + "learning_rate": 8.961243048670363e-06, + "loss": 0.3833, + "step": 2391 + }, + { + "epoch": 0.23279805352798053, + "grad_norm": 1.5273773111622013, + "learning_rate": 8.960281084387701e-06, + "loss": 0.5724, + "step": 2392 + }, + { + "epoch": 0.23289537712895378, + "grad_norm": 1.4704498843019616, + "learning_rate": 8.959318726569475e-06, + "loss": 0.5232, + "step": 2393 + }, + { + "epoch": 0.232992700729927, + "grad_norm": 1.52947786509823, + "learning_rate": 8.958355975311314e-06, + "loss": 0.5014, + "step": 2394 + }, + { + "epoch": 0.23309002433090023, + "grad_norm": 1.457234959002331, + "learning_rate": 8.957392830708886e-06, + "loss": 0.5401, + "step": 2395 + }, + { + "epoch": 0.23318734793187348, + "grad_norm": 1.5878948291380384, + "learning_rate": 8.9564292928579e-06, + "loss": 0.4481, + "step": 2396 + }, + { + "epoch": 0.2332846715328467, + "grad_norm": 1.3353181262068508, + "learning_rate": 8.955465361854103e-06, + "loss": 0.3668, + "step": 2397 + }, + { + "epoch": 0.23338199513381996, + "grad_norm": 2.023729457927684, + "learning_rate": 8.954501037793282e-06, + "loss": 0.256, + "step": 2398 + }, + { + "epoch": 0.2334793187347932, + "grad_norm": 1.3501136378744423, + "learning_rate": 8.953536320771264e-06, + "loss": 0.4288, + "step": 2399 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.9695156209886321, + "learning_rate": 8.95257121088391e-06, + "loss": 0.3313, + "step": 2400 + }, + { + "epoch": 0.23367396593673967, + "grad_norm": 1.6268089203048999, + "learning_rate": 8.951605708227125e-06, + "loss": 0.5031, + "step": 2401 + }, + { + "epoch": 0.2337712895377129, + "grad_norm": 1.3327356528771297, + "learning_rate": 8.950639812896852e-06, + "loss": 0.352, + "step": 2402 + }, + { + "epoch": 0.23386861313868612, + "grad_norm": 1.646158604731562, + "learning_rate": 8.949673524989074e-06, + "loss": 0.6143, + "step": 2403 + }, + { + "epoch": 0.23396593673965937, + "grad_norm": 1.4459398712277267, + "learning_rate": 8.948706844599809e-06, + "loss": 0.301, + "step": 2404 + }, + { + "epoch": 0.2340632603406326, + "grad_norm": 1.242464142709881, + "learning_rate": 8.947739771825118e-06, + "loss": 0.3867, + "step": 2405 + }, + { + "epoch": 0.23416058394160583, + "grad_norm": 1.283369590610404, + "learning_rate": 8.946772306761099e-06, + "loss": 0.3396, + "step": 2406 + }, + { + "epoch": 0.23425790754257908, + "grad_norm": 1.659051576981879, + "learning_rate": 8.94580444950389e-06, + "loss": 0.2985, + "step": 2407 + }, + { + "epoch": 0.2343552311435523, + "grad_norm": 1.5811557183787177, + "learning_rate": 8.944836200149669e-06, + "loss": 0.5412, + "step": 2408 + }, + { + "epoch": 0.23445255474452556, + "grad_norm": 1.5937284580345608, + "learning_rate": 8.943867558794648e-06, + "loss": 0.4562, + "step": 2409 + }, + { + "epoch": 0.23454987834549879, + "grad_norm": 1.179539450140548, + "learning_rate": 8.942898525535085e-06, + "loss": 0.2436, + "step": 2410 + }, + { + "epoch": 0.234647201946472, + "grad_norm": 1.2115140465312926, + "learning_rate": 8.941929100467272e-06, + "loss": 0.325, + "step": 2411 + }, + { + "epoch": 0.23474452554744527, + "grad_norm": 1.3228525862104779, + "learning_rate": 8.94095928368754e-06, + "loss": 0.4001, + "step": 2412 + }, + { + "epoch": 0.2348418491484185, + "grad_norm": 1.5093562470528878, + "learning_rate": 8.939989075292263e-06, + "loss": 0.3554, + "step": 2413 + }, + { + "epoch": 0.23493917274939172, + "grad_norm": 1.629660086616085, + "learning_rate": 8.93901847537785e-06, + "loss": 0.6349, + "step": 2414 + }, + { + "epoch": 0.23503649635036497, + "grad_norm": 1.0826348229158524, + "learning_rate": 8.938047484040749e-06, + "loss": 0.2681, + "step": 2415 + }, + { + "epoch": 0.2351338199513382, + "grad_norm": 1.2841520241198179, + "learning_rate": 8.93707610137745e-06, + "loss": 0.4081, + "step": 2416 + }, + { + "epoch": 0.23523114355231142, + "grad_norm": 1.913465881785096, + "learning_rate": 8.936104327484479e-06, + "loss": 0.7043, + "step": 2417 + }, + { + "epoch": 0.23532846715328468, + "grad_norm": 1.386306701477425, + "learning_rate": 8.935132162458401e-06, + "loss": 0.341, + "step": 2418 + }, + { + "epoch": 0.2354257907542579, + "grad_norm": 1.1278547518059516, + "learning_rate": 8.934159606395821e-06, + "loss": 0.3151, + "step": 2419 + }, + { + "epoch": 0.23552311435523116, + "grad_norm": 1.5540265542588236, + "learning_rate": 8.933186659393384e-06, + "loss": 0.6514, + "step": 2420 + }, + { + "epoch": 0.23562043795620438, + "grad_norm": 1.278787339635804, + "learning_rate": 8.932213321547769e-06, + "loss": 0.3423, + "step": 2421 + }, + { + "epoch": 0.2357177615571776, + "grad_norm": 1.2885094583361822, + "learning_rate": 8.931239592955701e-06, + "loss": 0.2958, + "step": 2422 + }, + { + "epoch": 0.23581508515815086, + "grad_norm": 1.5181901598500283, + "learning_rate": 8.930265473713939e-06, + "loss": 0.4212, + "step": 2423 + }, + { + "epoch": 0.2359124087591241, + "grad_norm": 1.2136160482551297, + "learning_rate": 8.92929096391928e-06, + "loss": 0.3982, + "step": 2424 + }, + { + "epoch": 0.2360097323600973, + "grad_norm": 1.5487072814518004, + "learning_rate": 8.928316063668562e-06, + "loss": 0.5676, + "step": 2425 + }, + { + "epoch": 0.23610705596107057, + "grad_norm": 1.430432818475582, + "learning_rate": 8.927340773058664e-06, + "loss": 0.4735, + "step": 2426 + }, + { + "epoch": 0.2362043795620438, + "grad_norm": 1.4586841524588252, + "learning_rate": 8.926365092186498e-06, + "loss": 0.5637, + "step": 2427 + }, + { + "epoch": 0.23630170316301702, + "grad_norm": 1.5364014523424565, + "learning_rate": 8.92538902114902e-06, + "loss": 0.4783, + "step": 2428 + }, + { + "epoch": 0.23639902676399027, + "grad_norm": 1.3896600182614345, + "learning_rate": 8.924412560043223e-06, + "loss": 0.3748, + "step": 2429 + }, + { + "epoch": 0.2364963503649635, + "grad_norm": 1.304447540327908, + "learning_rate": 8.923435708966135e-06, + "loss": 0.3373, + "step": 2430 + }, + { + "epoch": 0.23659367396593675, + "grad_norm": 1.3383082719469825, + "learning_rate": 8.922458468014833e-06, + "loss": 0.3089, + "step": 2431 + }, + { + "epoch": 0.23669099756690998, + "grad_norm": 1.4376693294142868, + "learning_rate": 8.921480837286418e-06, + "loss": 0.2665, + "step": 2432 + }, + { + "epoch": 0.2367883211678832, + "grad_norm": 1.3948368197200884, + "learning_rate": 8.920502816878045e-06, + "loss": 0.4349, + "step": 2433 + }, + { + "epoch": 0.23688564476885646, + "grad_norm": 1.5583938814663865, + "learning_rate": 8.919524406886897e-06, + "loss": 0.4528, + "step": 2434 + }, + { + "epoch": 0.23698296836982968, + "grad_norm": 1.455016515054737, + "learning_rate": 8.918545607410199e-06, + "loss": 0.416, + "step": 2435 + }, + { + "epoch": 0.2370802919708029, + "grad_norm": 1.5707414335423742, + "learning_rate": 8.917566418545215e-06, + "loss": 0.4269, + "step": 2436 + }, + { + "epoch": 0.23717761557177616, + "grad_norm": 1.6214497738286784, + "learning_rate": 8.916586840389248e-06, + "loss": 0.5531, + "step": 2437 + }, + { + "epoch": 0.2372749391727494, + "grad_norm": 1.5231468510302828, + "learning_rate": 8.91560687303964e-06, + "loss": 0.5464, + "step": 2438 + }, + { + "epoch": 0.23737226277372261, + "grad_norm": 1.5631657517225734, + "learning_rate": 8.91462651659377e-06, + "loss": 0.4098, + "step": 2439 + }, + { + "epoch": 0.23746958637469587, + "grad_norm": 1.5003582208774642, + "learning_rate": 8.913645771149058e-06, + "loss": 0.342, + "step": 2440 + }, + { + "epoch": 0.2375669099756691, + "grad_norm": 1.2703591332316027, + "learning_rate": 8.91266463680296e-06, + "loss": 0.3195, + "step": 2441 + }, + { + "epoch": 0.23766423357664235, + "grad_norm": 1.3910967851640175, + "learning_rate": 8.91168311365297e-06, + "loss": 0.334, + "step": 2442 + }, + { + "epoch": 0.23776155717761557, + "grad_norm": 1.5001053773105038, + "learning_rate": 8.910701201796625e-06, + "loss": 0.4665, + "step": 2443 + }, + { + "epoch": 0.2378588807785888, + "grad_norm": 1.6142849143926903, + "learning_rate": 8.9097189013315e-06, + "loss": 0.5276, + "step": 2444 + }, + { + "epoch": 0.23795620437956205, + "grad_norm": 1.2059866820401877, + "learning_rate": 8.908736212355202e-06, + "loss": 0.2936, + "step": 2445 + }, + { + "epoch": 0.23805352798053528, + "grad_norm": 1.4496663268694052, + "learning_rate": 8.907753134965387e-06, + "loss": 0.475, + "step": 2446 + }, + { + "epoch": 0.2381508515815085, + "grad_norm": 1.4184456855989886, + "learning_rate": 8.90676966925974e-06, + "loss": 0.4477, + "step": 2447 + }, + { + "epoch": 0.23824817518248176, + "grad_norm": 1.7126804340284862, + "learning_rate": 8.90578581533599e-06, + "loss": 0.6392, + "step": 2448 + }, + { + "epoch": 0.23834549878345498, + "grad_norm": 1.6085356958926766, + "learning_rate": 8.904801573291901e-06, + "loss": 0.4428, + "step": 2449 + }, + { + "epoch": 0.2384428223844282, + "grad_norm": 1.1724096477321129, + "learning_rate": 8.903816943225281e-06, + "loss": 0.23, + "step": 2450 + }, + { + "epoch": 0.23854014598540146, + "grad_norm": 1.2849397331978023, + "learning_rate": 8.902831925233972e-06, + "loss": 0.4315, + "step": 2451 + }, + { + "epoch": 0.2386374695863747, + "grad_norm": 1.3479724015628292, + "learning_rate": 8.901846519415856e-06, + "loss": 0.4528, + "step": 2452 + }, + { + "epoch": 0.23873479318734794, + "grad_norm": 1.5241447958707557, + "learning_rate": 8.900860725868852e-06, + "loss": 0.5638, + "step": 2453 + }, + { + "epoch": 0.23883211678832117, + "grad_norm": 1.3008951589753057, + "learning_rate": 8.899874544690921e-06, + "loss": 0.4364, + "step": 2454 + }, + { + "epoch": 0.2389294403892944, + "grad_norm": 1.3889516127516133, + "learning_rate": 8.89888797598006e-06, + "loss": 0.5968, + "step": 2455 + }, + { + "epoch": 0.23902676399026765, + "grad_norm": 1.3382384356293548, + "learning_rate": 8.8979010198343e-06, + "loss": 0.3423, + "step": 2456 + }, + { + "epoch": 0.23912408759124087, + "grad_norm": 1.3927455122024084, + "learning_rate": 8.896913676351726e-06, + "loss": 0.5291, + "step": 2457 + }, + { + "epoch": 0.2392214111922141, + "grad_norm": 1.3654704619725508, + "learning_rate": 8.895925945630441e-06, + "loss": 0.3224, + "step": 2458 + }, + { + "epoch": 0.23931873479318735, + "grad_norm": 2.420859240745107, + "learning_rate": 8.8949378277686e-06, + "loss": 0.4526, + "step": 2459 + }, + { + "epoch": 0.23941605839416058, + "grad_norm": 1.279171164356654, + "learning_rate": 8.893949322864394e-06, + "loss": 0.3452, + "step": 2460 + }, + { + "epoch": 0.2395133819951338, + "grad_norm": 1.4336845514712926, + "learning_rate": 8.89296043101605e-06, + "loss": 0.3891, + "step": 2461 + }, + { + "epoch": 0.23961070559610706, + "grad_norm": 1.2391493008048138, + "learning_rate": 8.891971152321836e-06, + "loss": 0.5135, + "step": 2462 + }, + { + "epoch": 0.23970802919708029, + "grad_norm": 1.2398633987802397, + "learning_rate": 8.890981486880057e-06, + "loss": 0.2688, + "step": 2463 + }, + { + "epoch": 0.23980535279805354, + "grad_norm": 1.1975725536626207, + "learning_rate": 8.889991434789054e-06, + "loss": 0.4181, + "step": 2464 + }, + { + "epoch": 0.23990267639902677, + "grad_norm": 1.5121790458693565, + "learning_rate": 8.889000996147213e-06, + "loss": 0.667, + "step": 2465 + }, + { + "epoch": 0.24, + "grad_norm": 1.2980809407294283, + "learning_rate": 8.888010171052951e-06, + "loss": 0.4025, + "step": 2466 + }, + { + "epoch": 0.24009732360097324, + "grad_norm": 1.3683247659037883, + "learning_rate": 8.887018959604731e-06, + "loss": 0.4195, + "step": 2467 + }, + { + "epoch": 0.24019464720194647, + "grad_norm": 1.6392091000056277, + "learning_rate": 8.886027361901045e-06, + "loss": 0.4464, + "step": 2468 + }, + { + "epoch": 0.2402919708029197, + "grad_norm": 1.4286158146093557, + "learning_rate": 8.885035378040435e-06, + "loss": 0.503, + "step": 2469 + }, + { + "epoch": 0.24038929440389295, + "grad_norm": 1.6249203295617591, + "learning_rate": 8.884043008121468e-06, + "loss": 0.5875, + "step": 2470 + }, + { + "epoch": 0.24048661800486618, + "grad_norm": 1.316531393288964, + "learning_rate": 8.883050252242762e-06, + "loss": 0.3225, + "step": 2471 + }, + { + "epoch": 0.24058394160583943, + "grad_norm": 1.3738066957140371, + "learning_rate": 8.882057110502964e-06, + "loss": 0.3863, + "step": 2472 + }, + { + "epoch": 0.24068126520681266, + "grad_norm": 1.6149562610100578, + "learning_rate": 8.881063583000766e-06, + "loss": 0.6899, + "step": 2473 + }, + { + "epoch": 0.24077858880778588, + "grad_norm": 1.1978996054498634, + "learning_rate": 8.880069669834895e-06, + "loss": 0.4647, + "step": 2474 + }, + { + "epoch": 0.24087591240875914, + "grad_norm": 1.3737195294986575, + "learning_rate": 8.879075371104114e-06, + "loss": 0.3404, + "step": 2475 + }, + { + "epoch": 0.24097323600973236, + "grad_norm": 1.3242090275500389, + "learning_rate": 8.878080686907231e-06, + "loss": 0.4923, + "step": 2476 + }, + { + "epoch": 0.2410705596107056, + "grad_norm": 1.295191211796917, + "learning_rate": 8.877085617343085e-06, + "loss": 0.4449, + "step": 2477 + }, + { + "epoch": 0.24116788321167884, + "grad_norm": 1.5068542914468723, + "learning_rate": 8.87609016251056e-06, + "loss": 0.5506, + "step": 2478 + }, + { + "epoch": 0.24126520681265207, + "grad_norm": 1.650040845654398, + "learning_rate": 8.87509432250857e-06, + "loss": 0.5715, + "step": 2479 + }, + { + "epoch": 0.2413625304136253, + "grad_norm": 1.5289429392674028, + "learning_rate": 8.874098097436078e-06, + "loss": 0.5626, + "step": 2480 + }, + { + "epoch": 0.24145985401459855, + "grad_norm": 1.3609358059405043, + "learning_rate": 8.873101487392078e-06, + "loss": 0.4096, + "step": 2481 + }, + { + "epoch": 0.24155717761557177, + "grad_norm": 1.5725676631470524, + "learning_rate": 8.8721044924756e-06, + "loss": 0.6597, + "step": 2482 + }, + { + "epoch": 0.24165450121654503, + "grad_norm": 1.094002939677081, + "learning_rate": 8.87110711278572e-06, + "loss": 0.3206, + "step": 2483 + }, + { + "epoch": 0.24175182481751825, + "grad_norm": 1.4551979783640236, + "learning_rate": 8.870109348421544e-06, + "loss": 0.445, + "step": 2484 + }, + { + "epoch": 0.24184914841849148, + "grad_norm": 1.534219781362636, + "learning_rate": 8.869111199482227e-06, + "loss": 0.6666, + "step": 2485 + }, + { + "epoch": 0.24194647201946473, + "grad_norm": 0.9530847884904149, + "learning_rate": 8.86811266606695e-06, + "loss": 0.2756, + "step": 2486 + }, + { + "epoch": 0.24204379562043796, + "grad_norm": 1.4859819247146357, + "learning_rate": 8.86711374827494e-06, + "loss": 0.4626, + "step": 2487 + }, + { + "epoch": 0.24214111922141118, + "grad_norm": 1.5336983239407425, + "learning_rate": 8.86611444620546e-06, + "loss": 0.5383, + "step": 2488 + }, + { + "epoch": 0.24223844282238444, + "grad_norm": 1.4073640437212571, + "learning_rate": 8.865114759957812e-06, + "loss": 0.4675, + "step": 2489 + }, + { + "epoch": 0.24233576642335766, + "grad_norm": 1.562895534043348, + "learning_rate": 8.864114689631334e-06, + "loss": 0.5641, + "step": 2490 + }, + { + "epoch": 0.2424330900243309, + "grad_norm": 1.643145414496213, + "learning_rate": 8.863114235325405e-06, + "loss": 0.5749, + "step": 2491 + }, + { + "epoch": 0.24253041362530414, + "grad_norm": 1.226721686463078, + "learning_rate": 8.862113397139437e-06, + "loss": 0.3432, + "step": 2492 + }, + { + "epoch": 0.24262773722627737, + "grad_norm": 1.2699959241996903, + "learning_rate": 8.86111217517289e-06, + "loss": 0.4203, + "step": 2493 + }, + { + "epoch": 0.24272506082725062, + "grad_norm": 1.4233705808484327, + "learning_rate": 8.860110569525253e-06, + "loss": 0.2601, + "step": 2494 + }, + { + "epoch": 0.24282238442822385, + "grad_norm": 1.3784035260656315, + "learning_rate": 8.859108580296055e-06, + "loss": 0.4973, + "step": 2495 + }, + { + "epoch": 0.24291970802919707, + "grad_norm": 1.2790024746236357, + "learning_rate": 8.858106207584864e-06, + "loss": 0.4067, + "step": 2496 + }, + { + "epoch": 0.24301703163017033, + "grad_norm": 1.4041054798155945, + "learning_rate": 8.857103451491292e-06, + "loss": 0.5228, + "step": 2497 + }, + { + "epoch": 0.24311435523114355, + "grad_norm": 1.6788565066048042, + "learning_rate": 8.856100312114975e-06, + "loss": 0.7133, + "step": 2498 + }, + { + "epoch": 0.24321167883211678, + "grad_norm": 1.2024623978380433, + "learning_rate": 8.855096789555602e-06, + "loss": 0.2507, + "step": 2499 + }, + { + "epoch": 0.24330900243309003, + "grad_norm": 1.3828470689148782, + "learning_rate": 8.85409288391289e-06, + "loss": 0.3993, + "step": 2500 + }, + { + "epoch": 0.24340632603406326, + "grad_norm": 1.427484284296059, + "learning_rate": 8.8530885952866e-06, + "loss": 0.3926, + "step": 2501 + }, + { + "epoch": 0.24350364963503648, + "grad_norm": 1.3193446567792235, + "learning_rate": 8.852083923776529e-06, + "loss": 0.2152, + "step": 2502 + }, + { + "epoch": 0.24360097323600974, + "grad_norm": 1.3297823718570532, + "learning_rate": 8.851078869482509e-06, + "loss": 0.4772, + "step": 2503 + }, + { + "epoch": 0.24369829683698296, + "grad_norm": 1.3044660003313646, + "learning_rate": 8.850073432504416e-06, + "loss": 0.3589, + "step": 2504 + }, + { + "epoch": 0.24379562043795622, + "grad_norm": 1.4488096250914715, + "learning_rate": 8.84906761294216e-06, + "loss": 0.3261, + "step": 2505 + }, + { + "epoch": 0.24389294403892944, + "grad_norm": 1.2778329641523152, + "learning_rate": 8.848061410895687e-06, + "loss": 0.3047, + "step": 2506 + }, + { + "epoch": 0.24399026763990267, + "grad_norm": 1.135638375757245, + "learning_rate": 8.847054826464988e-06, + "loss": 0.3173, + "step": 2507 + }, + { + "epoch": 0.24408759124087592, + "grad_norm": 1.5033745953013864, + "learning_rate": 8.846047859750086e-06, + "loss": 0.4813, + "step": 2508 + }, + { + "epoch": 0.24418491484184915, + "grad_norm": 1.1189501535394493, + "learning_rate": 8.845040510851044e-06, + "loss": 0.3359, + "step": 2509 + }, + { + "epoch": 0.24428223844282237, + "grad_norm": 1.4743455663494507, + "learning_rate": 8.844032779867966e-06, + "loss": 0.5354, + "step": 2510 + }, + { + "epoch": 0.24437956204379563, + "grad_norm": 1.2644405709657818, + "learning_rate": 8.843024666900983e-06, + "loss": 0.4019, + "step": 2511 + }, + { + "epoch": 0.24447688564476885, + "grad_norm": 1.5585250648144962, + "learning_rate": 8.84201617205028e-06, + "loss": 0.4977, + "step": 2512 + }, + { + "epoch": 0.24457420924574208, + "grad_norm": 1.5187811483320863, + "learning_rate": 8.841007295416069e-06, + "loss": 0.6282, + "step": 2513 + }, + { + "epoch": 0.24467153284671533, + "grad_norm": 1.461783750506842, + "learning_rate": 8.839998037098601e-06, + "loss": 0.6085, + "step": 2514 + }, + { + "epoch": 0.24476885644768856, + "grad_norm": 1.4235036556142022, + "learning_rate": 8.838988397198167e-06, + "loss": 0.5696, + "step": 2515 + }, + { + "epoch": 0.2448661800486618, + "grad_norm": 1.6731038078758624, + "learning_rate": 8.837978375815097e-06, + "loss": 0.5026, + "step": 2516 + }, + { + "epoch": 0.24496350364963504, + "grad_norm": 1.2803102163564937, + "learning_rate": 8.836967973049757e-06, + "loss": 0.2605, + "step": 2517 + }, + { + "epoch": 0.24506082725060827, + "grad_norm": 1.2869808613318177, + "learning_rate": 8.835957189002551e-06, + "loss": 0.3073, + "step": 2518 + }, + { + "epoch": 0.24515815085158152, + "grad_norm": 1.4129342483481067, + "learning_rate": 8.834946023773921e-06, + "loss": 0.334, + "step": 2519 + }, + { + "epoch": 0.24525547445255474, + "grad_norm": 1.6342111830003216, + "learning_rate": 8.833934477464348e-06, + "loss": 0.6127, + "step": 2520 + }, + { + "epoch": 0.24535279805352797, + "grad_norm": 1.6465764681762454, + "learning_rate": 8.83292255017435e-06, + "loss": 0.6432, + "step": 2521 + }, + { + "epoch": 0.24545012165450122, + "grad_norm": 1.4262158711234114, + "learning_rate": 8.83191024200448e-06, + "loss": 0.5224, + "step": 2522 + }, + { + "epoch": 0.24554744525547445, + "grad_norm": 1.593193256147642, + "learning_rate": 8.830897553055337e-06, + "loss": 0.5211, + "step": 2523 + }, + { + "epoch": 0.24564476885644768, + "grad_norm": 1.624031218270973, + "learning_rate": 8.829884483427547e-06, + "loss": 0.5128, + "step": 2524 + }, + { + "epoch": 0.24574209245742093, + "grad_norm": 1.991662408778961, + "learning_rate": 8.828871033221783e-06, + "loss": 0.3025, + "step": 2525 + }, + { + "epoch": 0.24583941605839416, + "grad_norm": 1.4390691402915812, + "learning_rate": 8.82785720253875e-06, + "loss": 0.5088, + "step": 2526 + }, + { + "epoch": 0.2459367396593674, + "grad_norm": 1.4179406701872763, + "learning_rate": 8.826842991479197e-06, + "loss": 0.3887, + "step": 2527 + }, + { + "epoch": 0.24603406326034064, + "grad_norm": 1.460230365502962, + "learning_rate": 8.825828400143902e-06, + "loss": 0.3316, + "step": 2528 + }, + { + "epoch": 0.24613138686131386, + "grad_norm": 1.4924241123043909, + "learning_rate": 8.824813428633685e-06, + "loss": 0.4989, + "step": 2529 + }, + { + "epoch": 0.24622871046228711, + "grad_norm": 1.593556186634644, + "learning_rate": 8.82379807704941e-06, + "loss": 0.525, + "step": 2530 + }, + { + "epoch": 0.24632603406326034, + "grad_norm": 1.6809566227650843, + "learning_rate": 8.822782345491968e-06, + "loss": 0.3421, + "step": 2531 + }, + { + "epoch": 0.24642335766423357, + "grad_norm": 1.4773288736144092, + "learning_rate": 8.821766234062294e-06, + "loss": 0.534, + "step": 2532 + }, + { + "epoch": 0.24652068126520682, + "grad_norm": 1.4001059355846526, + "learning_rate": 8.820749742861363e-06, + "loss": 0.3887, + "step": 2533 + }, + { + "epoch": 0.24661800486618005, + "grad_norm": 1.349012582441713, + "learning_rate": 8.81973287199018e-06, + "loss": 0.2852, + "step": 2534 + }, + { + "epoch": 0.24671532846715327, + "grad_norm": 1.651550318908522, + "learning_rate": 8.818715621549794e-06, + "loss": 0.4967, + "step": 2535 + }, + { + "epoch": 0.24681265206812653, + "grad_norm": 1.5932669562049986, + "learning_rate": 8.817697991641289e-06, + "loss": 0.4173, + "step": 2536 + }, + { + "epoch": 0.24690997566909975, + "grad_norm": 1.3550488264007063, + "learning_rate": 8.816679982365787e-06, + "loss": 0.3404, + "step": 2537 + }, + { + "epoch": 0.247007299270073, + "grad_norm": 1.571341106532058, + "learning_rate": 8.815661593824451e-06, + "loss": 0.5666, + "step": 2538 + }, + { + "epoch": 0.24710462287104623, + "grad_norm": 1.5685299297246114, + "learning_rate": 8.814642826118477e-06, + "loss": 0.4521, + "step": 2539 + }, + { + "epoch": 0.24720194647201946, + "grad_norm": 1.5355691524375334, + "learning_rate": 8.8136236793491e-06, + "loss": 0.3452, + "step": 2540 + }, + { + "epoch": 0.2472992700729927, + "grad_norm": 1.4490992247448509, + "learning_rate": 8.812604153617594e-06, + "loss": 0.3046, + "step": 2541 + }, + { + "epoch": 0.24739659367396594, + "grad_norm": 1.5790493967738255, + "learning_rate": 8.81158424902527e-06, + "loss": 0.5957, + "step": 2542 + }, + { + "epoch": 0.24749391727493916, + "grad_norm": 1.8299083651337236, + "learning_rate": 8.810563965673478e-06, + "loss": 0.529, + "step": 2543 + }, + { + "epoch": 0.24759124087591242, + "grad_norm": 1.336357630649535, + "learning_rate": 8.8095433036636e-06, + "loss": 0.2498, + "step": 2544 + }, + { + "epoch": 0.24768856447688564, + "grad_norm": 3.272954864246679, + "learning_rate": 8.808522263097065e-06, + "loss": 0.3439, + "step": 2545 + }, + { + "epoch": 0.24778588807785887, + "grad_norm": 1.5948700054852, + "learning_rate": 8.80750084407533e-06, + "loss": 0.5754, + "step": 2546 + }, + { + "epoch": 0.24788321167883212, + "grad_norm": 1.2457293034288246, + "learning_rate": 8.806479046699896e-06, + "loss": 0.3355, + "step": 2547 + }, + { + "epoch": 0.24798053527980535, + "grad_norm": 1.4118835775208534, + "learning_rate": 8.8054568710723e-06, + "loss": 0.4843, + "step": 2548 + }, + { + "epoch": 0.2480778588807786, + "grad_norm": 2.0167817337794745, + "learning_rate": 8.804434317294115e-06, + "loss": 0.4781, + "step": 2549 + }, + { + "epoch": 0.24817518248175183, + "grad_norm": 1.630746510877536, + "learning_rate": 8.803411385466954e-06, + "loss": 0.5226, + "step": 2550 + }, + { + "epoch": 0.24827250608272505, + "grad_norm": 1.0942598516950242, + "learning_rate": 8.802388075692465e-06, + "loss": 0.1843, + "step": 2551 + }, + { + "epoch": 0.2483698296836983, + "grad_norm": 1.8060042956650721, + "learning_rate": 8.801364388072336e-06, + "loss": 0.705, + "step": 2552 + }, + { + "epoch": 0.24846715328467153, + "grad_norm": 1.632331667833736, + "learning_rate": 8.800340322708291e-06, + "loss": 0.4964, + "step": 2553 + }, + { + "epoch": 0.24856447688564476, + "grad_norm": 1.539098206701319, + "learning_rate": 8.799315879702095e-06, + "loss": 0.3962, + "step": 2554 + }, + { + "epoch": 0.248661800486618, + "grad_norm": 1.2219114137184675, + "learning_rate": 8.798291059155543e-06, + "loss": 0.2497, + "step": 2555 + }, + { + "epoch": 0.24875912408759124, + "grad_norm": 1.4540964796439875, + "learning_rate": 8.797265861170471e-06, + "loss": 0.5159, + "step": 2556 + }, + { + "epoch": 0.24885644768856446, + "grad_norm": 1.554150512584087, + "learning_rate": 8.796240285848761e-06, + "loss": 0.4412, + "step": 2557 + }, + { + "epoch": 0.24895377128953772, + "grad_norm": 1.7004545782091594, + "learning_rate": 8.795214333292318e-06, + "loss": 0.5179, + "step": 2558 + }, + { + "epoch": 0.24905109489051094, + "grad_norm": 1.726524110945535, + "learning_rate": 8.794188003603095e-06, + "loss": 0.4071, + "step": 2559 + }, + { + "epoch": 0.2491484184914842, + "grad_norm": 1.27126477948415, + "learning_rate": 8.793161296883077e-06, + "loss": 0.2268, + "step": 2560 + }, + { + "epoch": 0.24924574209245742, + "grad_norm": 1.9752049062158858, + "learning_rate": 8.79213421323429e-06, + "loss": 0.3632, + "step": 2561 + }, + { + "epoch": 0.24934306569343065, + "grad_norm": 1.0556825817929254, + "learning_rate": 8.791106752758796e-06, + "loss": 0.3627, + "step": 2562 + }, + { + "epoch": 0.2494403892944039, + "grad_norm": 1.6452772754401714, + "learning_rate": 8.790078915558693e-06, + "loss": 0.6043, + "step": 2563 + }, + { + "epoch": 0.24953771289537713, + "grad_norm": 1.278547180886592, + "learning_rate": 8.789050701736117e-06, + "loss": 0.3768, + "step": 2564 + }, + { + "epoch": 0.24963503649635035, + "grad_norm": 1.3443028399521961, + "learning_rate": 8.788022111393247e-06, + "loss": 0.3856, + "step": 2565 + }, + { + "epoch": 0.2497323600973236, + "grad_norm": 1.2774166354695482, + "learning_rate": 8.78699314463229e-06, + "loss": 0.4391, + "step": 2566 + }, + { + "epoch": 0.24982968369829683, + "grad_norm": 1.2231715277397497, + "learning_rate": 8.785963801555497e-06, + "loss": 0.4128, + "step": 2567 + }, + { + "epoch": 0.24992700729927006, + "grad_norm": 1.4012153782510572, + "learning_rate": 8.784934082265154e-06, + "loss": 0.4683, + "step": 2568 + }, + { + "epoch": 0.2500243309002433, + "grad_norm": 1.1954060436870173, + "learning_rate": 8.783903986863583e-06, + "loss": 0.2786, + "step": 2569 + }, + { + "epoch": 0.25012165450121654, + "grad_norm": 1.7116998515615807, + "learning_rate": 8.782873515453148e-06, + "loss": 0.6004, + "step": 2570 + }, + { + "epoch": 0.2502189781021898, + "grad_norm": 1.5712719922889962, + "learning_rate": 8.781842668136247e-06, + "loss": 0.6172, + "step": 2571 + }, + { + "epoch": 0.250316301703163, + "grad_norm": 1.246915874910697, + "learning_rate": 8.780811445015316e-06, + "loss": 0.4335, + "step": 2572 + }, + { + "epoch": 0.25041362530413624, + "grad_norm": 1.341456518636559, + "learning_rate": 8.779779846192827e-06, + "loss": 0.4187, + "step": 2573 + }, + { + "epoch": 0.2505109489051095, + "grad_norm": 1.1323562755710477, + "learning_rate": 8.778747871771293e-06, + "loss": 0.2832, + "step": 2574 + }, + { + "epoch": 0.25060827250608275, + "grad_norm": 1.4401083791532063, + "learning_rate": 8.777715521853258e-06, + "loss": 0.3779, + "step": 2575 + }, + { + "epoch": 0.25070559610705595, + "grad_norm": 1.4784987737181619, + "learning_rate": 8.77668279654131e-06, + "loss": 0.3129, + "step": 2576 + }, + { + "epoch": 0.2508029197080292, + "grad_norm": 1.1394717513462493, + "learning_rate": 8.775649695938074e-06, + "loss": 0.3162, + "step": 2577 + }, + { + "epoch": 0.25090024330900246, + "grad_norm": 1.4625556674372375, + "learning_rate": 8.774616220146204e-06, + "loss": 0.4605, + "step": 2578 + }, + { + "epoch": 0.25099756690997566, + "grad_norm": 3.1521808341091875, + "learning_rate": 8.773582369268402e-06, + "loss": 0.3485, + "step": 2579 + }, + { + "epoch": 0.2510948905109489, + "grad_norm": 1.3578124438111323, + "learning_rate": 8.7725481434074e-06, + "loss": 0.4693, + "step": 2580 + }, + { + "epoch": 0.25119221411192216, + "grad_norm": 1.63411664215404, + "learning_rate": 8.771513542665969e-06, + "loss": 0.4956, + "step": 2581 + }, + { + "epoch": 0.25128953771289536, + "grad_norm": 1.5098765580454843, + "learning_rate": 8.77047856714692e-06, + "loss": 0.4657, + "step": 2582 + }, + { + "epoch": 0.2513868613138686, + "grad_norm": 1.2801786921613054, + "learning_rate": 8.7694432169531e-06, + "loss": 0.3369, + "step": 2583 + }, + { + "epoch": 0.25148418491484187, + "grad_norm": 1.4360422953324754, + "learning_rate": 8.768407492187388e-06, + "loss": 0.4907, + "step": 2584 + }, + { + "epoch": 0.25158150851581507, + "grad_norm": 1.4560406874169747, + "learning_rate": 8.767371392952708e-06, + "loss": 0.3157, + "step": 2585 + }, + { + "epoch": 0.2516788321167883, + "grad_norm": 1.934211832538441, + "learning_rate": 8.766334919352018e-06, + "loss": 0.7151, + "step": 2586 + }, + { + "epoch": 0.2517761557177616, + "grad_norm": 1.6767044903158872, + "learning_rate": 8.76529807148831e-06, + "loss": 0.331, + "step": 2587 + }, + { + "epoch": 0.25187347931873477, + "grad_norm": 1.4698852047894042, + "learning_rate": 8.76426084946462e-06, + "loss": 0.3951, + "step": 2588 + }, + { + "epoch": 0.251970802919708, + "grad_norm": 1.3539539414605721, + "learning_rate": 8.763223253384015e-06, + "loss": 0.4011, + "step": 2589 + }, + { + "epoch": 0.2520681265206813, + "grad_norm": 1.506242240790805, + "learning_rate": 8.762185283349603e-06, + "loss": 0.5274, + "step": 2590 + }, + { + "epoch": 0.2521654501216545, + "grad_norm": 1.3140936667503142, + "learning_rate": 8.761146939464527e-06, + "loss": 0.3198, + "step": 2591 + }, + { + "epoch": 0.25226277372262773, + "grad_norm": 1.1404767919952752, + "learning_rate": 8.760108221831967e-06, + "loss": 0.4013, + "step": 2592 + }, + { + "epoch": 0.252360097323601, + "grad_norm": 1.4693477307137552, + "learning_rate": 8.759069130555142e-06, + "loss": 0.4783, + "step": 2593 + }, + { + "epoch": 0.2524574209245742, + "grad_norm": 1.3352582665983712, + "learning_rate": 8.75802966573731e-06, + "loss": 0.4617, + "step": 2594 + }, + { + "epoch": 0.25255474452554744, + "grad_norm": 1.2824428866870197, + "learning_rate": 8.756989827481756e-06, + "loss": 0.3352, + "step": 2595 + }, + { + "epoch": 0.2526520681265207, + "grad_norm": 1.4774059328965283, + "learning_rate": 8.755949615891814e-06, + "loss": 0.4635, + "step": 2596 + }, + { + "epoch": 0.25274939172749394, + "grad_norm": 1.6875827910282526, + "learning_rate": 8.754909031070852e-06, + "loss": 0.6222, + "step": 2597 + }, + { + "epoch": 0.25284671532846714, + "grad_norm": 1.2063205441417741, + "learning_rate": 8.75386807312227e-06, + "loss": 0.2455, + "step": 2598 + }, + { + "epoch": 0.2529440389294404, + "grad_norm": 1.3021547323360578, + "learning_rate": 8.752826742149512e-06, + "loss": 0.4329, + "step": 2599 + }, + { + "epoch": 0.25304136253041365, + "grad_norm": 1.1835878076183852, + "learning_rate": 8.751785038256054e-06, + "loss": 0.3662, + "step": 2600 + }, + { + "epoch": 0.25313868613138685, + "grad_norm": 1.544717999496196, + "learning_rate": 8.750742961545409e-06, + "loss": 0.3971, + "step": 2601 + }, + { + "epoch": 0.2532360097323601, + "grad_norm": 1.3629505007649398, + "learning_rate": 8.749700512121131e-06, + "loss": 0.5107, + "step": 2602 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 1.5737001686599814, + "learning_rate": 8.74865769008681e-06, + "loss": 0.5279, + "step": 2603 + }, + { + "epoch": 0.25343065693430655, + "grad_norm": 1.4784815997261378, + "learning_rate": 8.747614495546069e-06, + "loss": 0.4792, + "step": 2604 + }, + { + "epoch": 0.2535279805352798, + "grad_norm": 1.3236076722973804, + "learning_rate": 8.74657092860257e-06, + "loss": 0.3975, + "step": 2605 + }, + { + "epoch": 0.25362530413625306, + "grad_norm": 1.0968595172191475, + "learning_rate": 8.745526989360018e-06, + "loss": 0.269, + "step": 2606 + }, + { + "epoch": 0.25372262773722626, + "grad_norm": 1.562505340567045, + "learning_rate": 8.744482677922147e-06, + "loss": 0.5157, + "step": 2607 + }, + { + "epoch": 0.2538199513381995, + "grad_norm": 1.656826278908397, + "learning_rate": 8.743437994392729e-06, + "loss": 0.4867, + "step": 2608 + }, + { + "epoch": 0.25391727493917277, + "grad_norm": 1.3948672448161548, + "learning_rate": 8.742392938875577e-06, + "loss": 0.5279, + "step": 2609 + }, + { + "epoch": 0.25401459854014596, + "grad_norm": 1.5892338813179163, + "learning_rate": 8.741347511474539e-06, + "loss": 0.5611, + "step": 2610 + }, + { + "epoch": 0.2541119221411192, + "grad_norm": 1.6074249897923194, + "learning_rate": 8.740301712293498e-06, + "loss": 0.351, + "step": 2611 + }, + { + "epoch": 0.25420924574209247, + "grad_norm": 1.6505540033315536, + "learning_rate": 8.739255541436379e-06, + "loss": 0.5747, + "step": 2612 + }, + { + "epoch": 0.25430656934306567, + "grad_norm": 1.3314247428577628, + "learning_rate": 8.738208999007137e-06, + "loss": 0.3779, + "step": 2613 + }, + { + "epoch": 0.2544038929440389, + "grad_norm": 1.2796270745139389, + "learning_rate": 8.737162085109768e-06, + "loss": 0.3557, + "step": 2614 + }, + { + "epoch": 0.2545012165450122, + "grad_norm": 1.602637102567401, + "learning_rate": 8.736114799848307e-06, + "loss": 0.2882, + "step": 2615 + }, + { + "epoch": 0.2545985401459854, + "grad_norm": 1.4207119219562419, + "learning_rate": 8.735067143326821e-06, + "loss": 0.3881, + "step": 2616 + }, + { + "epoch": 0.25469586374695863, + "grad_norm": 1.4305110706379638, + "learning_rate": 8.73401911564942e-06, + "loss": 0.3486, + "step": 2617 + }, + { + "epoch": 0.2547931873479319, + "grad_norm": 1.434428707272536, + "learning_rate": 8.732970716920242e-06, + "loss": 0.3169, + "step": 2618 + }, + { + "epoch": 0.25489051094890514, + "grad_norm": 1.3228470441064362, + "learning_rate": 8.73192194724347e-06, + "loss": 0.4485, + "step": 2619 + }, + { + "epoch": 0.25498783454987833, + "grad_norm": 1.3897030806906485, + "learning_rate": 8.730872806723318e-06, + "loss": 0.4172, + "step": 2620 + }, + { + "epoch": 0.2550851581508516, + "grad_norm": 1.3840681937318722, + "learning_rate": 8.729823295464045e-06, + "loss": 0.251, + "step": 2621 + }, + { + "epoch": 0.25518248175182484, + "grad_norm": 1.775278354364079, + "learning_rate": 8.728773413569938e-06, + "loss": 0.4811, + "step": 2622 + }, + { + "epoch": 0.25527980535279804, + "grad_norm": 1.2701408917829737, + "learning_rate": 8.727723161145325e-06, + "loss": 0.2827, + "step": 2623 + }, + { + "epoch": 0.2553771289537713, + "grad_norm": 1.5528362504659363, + "learning_rate": 8.72667253829457e-06, + "loss": 0.5084, + "step": 2624 + }, + { + "epoch": 0.25547445255474455, + "grad_norm": 1.3793988523162408, + "learning_rate": 8.725621545122074e-06, + "loss": 0.3979, + "step": 2625 + }, + { + "epoch": 0.25557177615571774, + "grad_norm": 1.70282889775673, + "learning_rate": 8.724570181732275e-06, + "loss": 0.5983, + "step": 2626 + }, + { + "epoch": 0.255669099756691, + "grad_norm": 1.28105292316495, + "learning_rate": 8.723518448229649e-06, + "loss": 0.4756, + "step": 2627 + }, + { + "epoch": 0.25576642335766425, + "grad_norm": 1.3826686116158597, + "learning_rate": 8.722466344718705e-06, + "loss": 0.2978, + "step": 2628 + }, + { + "epoch": 0.25586374695863745, + "grad_norm": 1.460242284502631, + "learning_rate": 8.721413871303992e-06, + "loss": 0.4036, + "step": 2629 + }, + { + "epoch": 0.2559610705596107, + "grad_norm": 1.4181157816170762, + "learning_rate": 8.720361028090095e-06, + "loss": 0.4224, + "step": 2630 + }, + { + "epoch": 0.25605839416058396, + "grad_norm": 1.7898330028782403, + "learning_rate": 8.719307815181638e-06, + "loss": 0.7314, + "step": 2631 + }, + { + "epoch": 0.25615571776155716, + "grad_norm": 1.6886124652733636, + "learning_rate": 8.718254232683276e-06, + "loss": 0.3513, + "step": 2632 + }, + { + "epoch": 0.2562530413625304, + "grad_norm": 1.2562027575971086, + "learning_rate": 8.717200280699705e-06, + "loss": 0.284, + "step": 2633 + }, + { + "epoch": 0.25635036496350366, + "grad_norm": 1.4899596514775177, + "learning_rate": 8.716145959335658e-06, + "loss": 0.2778, + "step": 2634 + }, + { + "epoch": 0.25644768856447686, + "grad_norm": 1.1699021581347986, + "learning_rate": 8.715091268695903e-06, + "loss": 0.3163, + "step": 2635 + }, + { + "epoch": 0.2565450121654501, + "grad_norm": 1.020653527182934, + "learning_rate": 8.714036208885243e-06, + "loss": 0.2191, + "step": 2636 + }, + { + "epoch": 0.25664233576642337, + "grad_norm": 1.5373942827305265, + "learning_rate": 8.712980780008526e-06, + "loss": 0.4183, + "step": 2637 + }, + { + "epoch": 0.25673965936739657, + "grad_norm": 1.1268355971062876, + "learning_rate": 8.711924982170623e-06, + "loss": 0.2851, + "step": 2638 + }, + { + "epoch": 0.2568369829683698, + "grad_norm": 1.25228244300652, + "learning_rate": 8.710868815476456e-06, + "loss": 0.1963, + "step": 2639 + }, + { + "epoch": 0.2569343065693431, + "grad_norm": 1.3905442460862172, + "learning_rate": 8.709812280030971e-06, + "loss": 0.3648, + "step": 2640 + }, + { + "epoch": 0.2570316301703163, + "grad_norm": 1.5078176389616522, + "learning_rate": 8.708755375939162e-06, + "loss": 0.4131, + "step": 2641 + }, + { + "epoch": 0.2571289537712895, + "grad_norm": 1.4441200079463874, + "learning_rate": 8.70769810330605e-06, + "loss": 0.4047, + "step": 2642 + }, + { + "epoch": 0.2572262773722628, + "grad_norm": 1.3883503516178042, + "learning_rate": 8.7066404622367e-06, + "loss": 0.3308, + "step": 2643 + }, + { + "epoch": 0.25732360097323603, + "grad_norm": 1.7851696055640995, + "learning_rate": 8.705582452836208e-06, + "loss": 0.336, + "step": 2644 + }, + { + "epoch": 0.25742092457420923, + "grad_norm": 1.309628752016819, + "learning_rate": 8.70452407520971e-06, + "loss": 0.3462, + "step": 2645 + }, + { + "epoch": 0.2575182481751825, + "grad_norm": 1.3618437175125289, + "learning_rate": 8.703465329462379e-06, + "loss": 0.3047, + "step": 2646 + }, + { + "epoch": 0.25761557177615574, + "grad_norm": 1.5821297320572192, + "learning_rate": 8.702406215699421e-06, + "loss": 0.2318, + "step": 2647 + }, + { + "epoch": 0.25771289537712894, + "grad_norm": 1.4729014225467234, + "learning_rate": 8.701346734026082e-06, + "loss": 0.3147, + "step": 2648 + }, + { + "epoch": 0.2578102189781022, + "grad_norm": 1.6287249640343295, + "learning_rate": 8.700286884547642e-06, + "loss": 0.5808, + "step": 2649 + }, + { + "epoch": 0.25790754257907544, + "grad_norm": 1.2824109098190504, + "learning_rate": 8.69922666736942e-06, + "loss": 0.3836, + "step": 2650 + }, + { + "epoch": 0.25800486618004864, + "grad_norm": 1.5096397594183033, + "learning_rate": 8.69816608259677e-06, + "loss": 0.3804, + "step": 2651 + }, + { + "epoch": 0.2581021897810219, + "grad_norm": 1.7247008216261863, + "learning_rate": 8.697105130335084e-06, + "loss": 0.3378, + "step": 2652 + }, + { + "epoch": 0.25819951338199515, + "grad_norm": 1.5872130127065738, + "learning_rate": 8.69604381068979e-06, + "loss": 0.4369, + "step": 2653 + }, + { + "epoch": 0.25829683698296835, + "grad_norm": 1.5909295650502344, + "learning_rate": 8.694982123766348e-06, + "loss": 0.3554, + "step": 2654 + }, + { + "epoch": 0.2583941605839416, + "grad_norm": 1.7135035115393307, + "learning_rate": 8.693920069670265e-06, + "loss": 0.4869, + "step": 2655 + }, + { + "epoch": 0.25849148418491485, + "grad_norm": 1.3366492087792976, + "learning_rate": 8.692857648507071e-06, + "loss": 0.3102, + "step": 2656 + }, + { + "epoch": 0.25858880778588805, + "grad_norm": 1.2478048122674565, + "learning_rate": 8.691794860382345e-06, + "loss": 0.3722, + "step": 2657 + }, + { + "epoch": 0.2586861313868613, + "grad_norm": 1.5080776475601503, + "learning_rate": 8.690731705401694e-06, + "loss": 0.316, + "step": 2658 + }, + { + "epoch": 0.25878345498783456, + "grad_norm": 1.443811575497146, + "learning_rate": 8.689668183670763e-06, + "loss": 0.2875, + "step": 2659 + }, + { + "epoch": 0.25888077858880776, + "grad_norm": 1.7036441396737687, + "learning_rate": 8.688604295295238e-06, + "loss": 0.4025, + "step": 2660 + }, + { + "epoch": 0.258978102189781, + "grad_norm": 1.4234806259439374, + "learning_rate": 8.687540040380838e-06, + "loss": 0.4452, + "step": 2661 + }, + { + "epoch": 0.25907542579075427, + "grad_norm": 1.2741393980838642, + "learning_rate": 8.686475419033315e-06, + "loss": 0.2237, + "step": 2662 + }, + { + "epoch": 0.2591727493917275, + "grad_norm": 1.1826384563722763, + "learning_rate": 8.685410431358464e-06, + "loss": 0.3398, + "step": 2663 + }, + { + "epoch": 0.2592700729927007, + "grad_norm": 1.5757741509023746, + "learning_rate": 8.684345077462117e-06, + "loss": 0.3846, + "step": 2664 + }, + { + "epoch": 0.25936739659367397, + "grad_norm": 1.475707275733763, + "learning_rate": 8.683279357450131e-06, + "loss": 0.2804, + "step": 2665 + }, + { + "epoch": 0.2594647201946472, + "grad_norm": 1.4241797244636094, + "learning_rate": 8.682213271428415e-06, + "loss": 0.2553, + "step": 2666 + }, + { + "epoch": 0.2595620437956204, + "grad_norm": 1.1548194283365685, + "learning_rate": 8.6811468195029e-06, + "loss": 0.3118, + "step": 2667 + }, + { + "epoch": 0.2596593673965937, + "grad_norm": 1.5918458521510486, + "learning_rate": 8.680080001779564e-06, + "loss": 0.4525, + "step": 2668 + }, + { + "epoch": 0.25975669099756693, + "grad_norm": 1.5508802560099362, + "learning_rate": 8.679012818364416e-06, + "loss": 0.4163, + "step": 2669 + }, + { + "epoch": 0.25985401459854013, + "grad_norm": 2.4434630008376232, + "learning_rate": 8.677945269363504e-06, + "loss": 0.4372, + "step": 2670 + }, + { + "epoch": 0.2599513381995134, + "grad_norm": 1.5324792404386718, + "learning_rate": 8.676877354882907e-06, + "loss": 0.3514, + "step": 2671 + }, + { + "epoch": 0.26004866180048664, + "grad_norm": 2.0012246197360493, + "learning_rate": 8.67580907502875e-06, + "loss": 0.5067, + "step": 2672 + }, + { + "epoch": 0.26014598540145983, + "grad_norm": 1.5232176793280576, + "learning_rate": 8.674740429907186e-06, + "loss": 0.4174, + "step": 2673 + }, + { + "epoch": 0.2602433090024331, + "grad_norm": 1.3322865976928646, + "learning_rate": 8.673671419624405e-06, + "loss": 0.4095, + "step": 2674 + }, + { + "epoch": 0.26034063260340634, + "grad_norm": 1.514406481268828, + "learning_rate": 8.672602044286638e-06, + "loss": 0.5915, + "step": 2675 + }, + { + "epoch": 0.26043795620437954, + "grad_norm": 1.528467413797325, + "learning_rate": 8.67153230400015e-06, + "loss": 0.4018, + "step": 2676 + }, + { + "epoch": 0.2605352798053528, + "grad_norm": 1.4367698805538582, + "learning_rate": 8.670462198871237e-06, + "loss": 0.4115, + "step": 2677 + }, + { + "epoch": 0.26063260340632605, + "grad_norm": 1.6984444092554742, + "learning_rate": 8.66939172900624e-06, + "loss": 0.59, + "step": 2678 + }, + { + "epoch": 0.26072992700729924, + "grad_norm": 1.4698751482200727, + "learning_rate": 8.668320894511534e-06, + "loss": 0.4144, + "step": 2679 + }, + { + "epoch": 0.2608272506082725, + "grad_norm": 1.5003641004534345, + "learning_rate": 8.667249695493525e-06, + "loss": 0.4294, + "step": 2680 + }, + { + "epoch": 0.26092457420924575, + "grad_norm": 1.3123452231563197, + "learning_rate": 8.666178132058659e-06, + "loss": 0.3408, + "step": 2681 + }, + { + "epoch": 0.261021897810219, + "grad_norm": 1.5184535738040659, + "learning_rate": 8.665106204313418e-06, + "loss": 0.3662, + "step": 2682 + }, + { + "epoch": 0.2611192214111922, + "grad_norm": 1.0623024588559944, + "learning_rate": 8.664033912364321e-06, + "loss": 0.2953, + "step": 2683 + }, + { + "epoch": 0.26121654501216546, + "grad_norm": 1.4112725317400583, + "learning_rate": 8.662961256317923e-06, + "loss": 0.3825, + "step": 2684 + }, + { + "epoch": 0.2613138686131387, + "grad_norm": 2.2729536767377065, + "learning_rate": 8.661888236280813e-06, + "loss": 0.5791, + "step": 2685 + }, + { + "epoch": 0.2614111922141119, + "grad_norm": 2.2747614305768504, + "learning_rate": 8.660814852359617e-06, + "loss": 0.4859, + "step": 2686 + }, + { + "epoch": 0.26150851581508516, + "grad_norm": 1.6069562939941755, + "learning_rate": 8.659741104661002e-06, + "loss": 0.5254, + "step": 2687 + }, + { + "epoch": 0.2616058394160584, + "grad_norm": 1.3624858995460438, + "learning_rate": 8.658666993291662e-06, + "loss": 0.3904, + "step": 2688 + }, + { + "epoch": 0.2617031630170316, + "grad_norm": 1.2954398797770197, + "learning_rate": 8.657592518358332e-06, + "loss": 0.3789, + "step": 2689 + }, + { + "epoch": 0.26180048661800487, + "grad_norm": 1.4158991903907718, + "learning_rate": 8.656517679967788e-06, + "loss": 0.3732, + "step": 2690 + }, + { + "epoch": 0.2618978102189781, + "grad_norm": 1.3754641009755615, + "learning_rate": 8.655442478226835e-06, + "loss": 0.3035, + "step": 2691 + }, + { + "epoch": 0.2619951338199513, + "grad_norm": 1.3522608722257456, + "learning_rate": 8.654366913242316e-06, + "loss": 0.347, + "step": 2692 + }, + { + "epoch": 0.2620924574209246, + "grad_norm": 1.2764013704656585, + "learning_rate": 8.65329098512111e-06, + "loss": 0.4207, + "step": 2693 + }, + { + "epoch": 0.2621897810218978, + "grad_norm": 1.4009476621873176, + "learning_rate": 8.652214693970133e-06, + "loss": 0.4628, + "step": 2694 + }, + { + "epoch": 0.262287104622871, + "grad_norm": 1.3860597575903169, + "learning_rate": 8.65113803989634e-06, + "loss": 0.3844, + "step": 2695 + }, + { + "epoch": 0.2623844282238443, + "grad_norm": 1.5636622874346966, + "learning_rate": 8.650061023006711e-06, + "loss": 0.6239, + "step": 2696 + }, + { + "epoch": 0.26248175182481753, + "grad_norm": 1.3677003606993399, + "learning_rate": 8.648983643408276e-06, + "loss": 0.4319, + "step": 2697 + }, + { + "epoch": 0.26257907542579073, + "grad_norm": 1.4720449620822884, + "learning_rate": 8.647905901208096e-06, + "loss": 0.4824, + "step": 2698 + }, + { + "epoch": 0.262676399026764, + "grad_norm": 1.4180687903221385, + "learning_rate": 8.646827796513262e-06, + "loss": 0.539, + "step": 2699 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 1.3679667840460958, + "learning_rate": 8.64574932943091e-06, + "loss": 0.4588, + "step": 2700 + }, + { + "epoch": 0.26287104622871044, + "grad_norm": 1.125542933529368, + "learning_rate": 8.644670500068205e-06, + "loss": 0.3441, + "step": 2701 + }, + { + "epoch": 0.2629683698296837, + "grad_norm": 1.5641789380613262, + "learning_rate": 8.643591308532353e-06, + "loss": 0.4998, + "step": 2702 + }, + { + "epoch": 0.26306569343065694, + "grad_norm": 1.3425870342919086, + "learning_rate": 8.642511754930592e-06, + "loss": 0.4678, + "step": 2703 + }, + { + "epoch": 0.2631630170316302, + "grad_norm": 1.3010588112101855, + "learning_rate": 8.641431839370199e-06, + "loss": 0.4005, + "step": 2704 + }, + { + "epoch": 0.2632603406326034, + "grad_norm": 1.0067860306988832, + "learning_rate": 8.640351561958487e-06, + "loss": 0.2243, + "step": 2705 + }, + { + "epoch": 0.26335766423357665, + "grad_norm": 1.4713856201410829, + "learning_rate": 8.639270922802802e-06, + "loss": 0.4325, + "step": 2706 + }, + { + "epoch": 0.2634549878345499, + "grad_norm": 1.55962351192921, + "learning_rate": 8.63818992201053e-06, + "loss": 0.5307, + "step": 2707 + }, + { + "epoch": 0.2635523114355231, + "grad_norm": 1.4073629002175063, + "learning_rate": 8.637108559689088e-06, + "loss": 0.3329, + "step": 2708 + }, + { + "epoch": 0.26364963503649635, + "grad_norm": 1.2827086170801953, + "learning_rate": 8.636026835945933e-06, + "loss": 0.3095, + "step": 2709 + }, + { + "epoch": 0.2637469586374696, + "grad_norm": 1.4100209855486194, + "learning_rate": 8.634944750888556e-06, + "loss": 0.3033, + "step": 2710 + }, + { + "epoch": 0.2638442822384428, + "grad_norm": 1.343279822840104, + "learning_rate": 8.633862304624484e-06, + "loss": 0.402, + "step": 2711 + }, + { + "epoch": 0.26394160583941606, + "grad_norm": 1.4374516520455163, + "learning_rate": 8.632779497261284e-06, + "loss": 0.4574, + "step": 2712 + }, + { + "epoch": 0.2640389294403893, + "grad_norm": 1.1554648336740065, + "learning_rate": 8.63169632890655e-06, + "loss": 0.3091, + "step": 2713 + }, + { + "epoch": 0.2641362530413625, + "grad_norm": 1.5304191047752203, + "learning_rate": 8.630612799667923e-06, + "loss": 0.5392, + "step": 2714 + }, + { + "epoch": 0.26423357664233577, + "grad_norm": 1.9364214941018973, + "learning_rate": 8.629528909653067e-06, + "loss": 0.4705, + "step": 2715 + }, + { + "epoch": 0.264330900243309, + "grad_norm": 1.5176007479008755, + "learning_rate": 8.628444658969694e-06, + "loss": 0.3969, + "step": 2716 + }, + { + "epoch": 0.2644282238442822, + "grad_norm": 1.3882529784475808, + "learning_rate": 8.627360047725543e-06, + "loss": 0.4672, + "step": 2717 + }, + { + "epoch": 0.26452554744525547, + "grad_norm": 1.0419873824719341, + "learning_rate": 8.626275076028397e-06, + "loss": 0.2247, + "step": 2718 + }, + { + "epoch": 0.2646228710462287, + "grad_norm": 1.4147177174052021, + "learning_rate": 8.625189743986068e-06, + "loss": 0.3922, + "step": 2719 + }, + { + "epoch": 0.2647201946472019, + "grad_norm": 1.3513629744004096, + "learning_rate": 8.624104051706405e-06, + "loss": 0.415, + "step": 2720 + }, + { + "epoch": 0.2648175182481752, + "grad_norm": 1.3701041364422066, + "learning_rate": 8.623017999297294e-06, + "loss": 0.4329, + "step": 2721 + }, + { + "epoch": 0.26491484184914843, + "grad_norm": 1.5102917148163044, + "learning_rate": 8.621931586866658e-06, + "loss": 0.4104, + "step": 2722 + }, + { + "epoch": 0.26501216545012163, + "grad_norm": 1.4836677874290423, + "learning_rate": 8.620844814522455e-06, + "loss": 0.5131, + "step": 2723 + }, + { + "epoch": 0.2651094890510949, + "grad_norm": 1.2607364196409017, + "learning_rate": 8.619757682372675e-06, + "loss": 0.3856, + "step": 2724 + }, + { + "epoch": 0.26520681265206814, + "grad_norm": 1.4082529003642341, + "learning_rate": 8.61867019052535e-06, + "loss": 0.4719, + "step": 2725 + }, + { + "epoch": 0.2653041362530414, + "grad_norm": 1.4276001080419702, + "learning_rate": 8.617582339088545e-06, + "loss": 0.2825, + "step": 2726 + }, + { + "epoch": 0.2654014598540146, + "grad_norm": 1.4331001450603844, + "learning_rate": 8.61649412817036e-06, + "loss": 0.5104, + "step": 2727 + }, + { + "epoch": 0.26549878345498784, + "grad_norm": 1.358868383954866, + "learning_rate": 8.615405557878929e-06, + "loss": 0.4359, + "step": 2728 + }, + { + "epoch": 0.2655961070559611, + "grad_norm": 1.678463370024911, + "learning_rate": 8.614316628322427e-06, + "loss": 0.4658, + "step": 2729 + }, + { + "epoch": 0.2656934306569343, + "grad_norm": 1.2268291596580612, + "learning_rate": 8.61322733960906e-06, + "loss": 0.2337, + "step": 2730 + }, + { + "epoch": 0.26579075425790755, + "grad_norm": 0.9437944818586388, + "learning_rate": 8.61213769184707e-06, + "loss": 0.2525, + "step": 2731 + }, + { + "epoch": 0.2658880778588808, + "grad_norm": 1.2480121542051432, + "learning_rate": 8.611047685144737e-06, + "loss": 0.2656, + "step": 2732 + }, + { + "epoch": 0.265985401459854, + "grad_norm": 1.5255853623894704, + "learning_rate": 8.609957319610377e-06, + "loss": 0.5071, + "step": 2733 + }, + { + "epoch": 0.26608272506082725, + "grad_norm": 1.5847632660353408, + "learning_rate": 8.60886659535234e-06, + "loss": 0.4018, + "step": 2734 + }, + { + "epoch": 0.2661800486618005, + "grad_norm": 1.3469310633769445, + "learning_rate": 8.60777551247901e-06, + "loss": 0.451, + "step": 2735 + }, + { + "epoch": 0.2662773722627737, + "grad_norm": 1.3995570810499534, + "learning_rate": 8.60668407109881e-06, + "loss": 0.4991, + "step": 2736 + }, + { + "epoch": 0.26637469586374696, + "grad_norm": 1.5198269828404072, + "learning_rate": 8.605592271320199e-06, + "loss": 0.4266, + "step": 2737 + }, + { + "epoch": 0.2664720194647202, + "grad_norm": 1.3040716122405567, + "learning_rate": 8.604500113251666e-06, + "loss": 0.3465, + "step": 2738 + }, + { + "epoch": 0.2665693430656934, + "grad_norm": 1.3643506509353014, + "learning_rate": 8.60340759700174e-06, + "loss": 0.4355, + "step": 2739 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.026074804296968, + "learning_rate": 8.602314722678989e-06, + "loss": 0.2507, + "step": 2740 + }, + { + "epoch": 0.2667639902676399, + "grad_norm": 1.3894972782664292, + "learning_rate": 8.601221490392009e-06, + "loss": 0.3981, + "step": 2741 + }, + { + "epoch": 0.2668613138686131, + "grad_norm": 1.3071238902768438, + "learning_rate": 8.600127900249435e-06, + "loss": 0.5138, + "step": 2742 + }, + { + "epoch": 0.26695863746958637, + "grad_norm": 1.61583752885221, + "learning_rate": 8.59903395235994e-06, + "loss": 0.5072, + "step": 2743 + }, + { + "epoch": 0.2670559610705596, + "grad_norm": 1.3679578518174673, + "learning_rate": 8.597939646832227e-06, + "loss": 0.3754, + "step": 2744 + }, + { + "epoch": 0.2671532846715328, + "grad_norm": 1.0943121419181938, + "learning_rate": 8.596844983775042e-06, + "loss": 0.2457, + "step": 2745 + }, + { + "epoch": 0.2672506082725061, + "grad_norm": 1.176479145152164, + "learning_rate": 8.59574996329716e-06, + "loss": 0.3687, + "step": 2746 + }, + { + "epoch": 0.2673479318734793, + "grad_norm": 1.2666642902167933, + "learning_rate": 8.594654585507393e-06, + "loss": 0.2664, + "step": 2747 + }, + { + "epoch": 0.2674452554744526, + "grad_norm": 1.3951377938692817, + "learning_rate": 8.59355885051459e-06, + "loss": 0.4035, + "step": 2748 + }, + { + "epoch": 0.2675425790754258, + "grad_norm": 1.2722832533001889, + "learning_rate": 8.592462758427635e-06, + "loss": 0.4643, + "step": 2749 + }, + { + "epoch": 0.26763990267639903, + "grad_norm": 1.2157588835981379, + "learning_rate": 8.59136630935545e-06, + "loss": 0.3612, + "step": 2750 + }, + { + "epoch": 0.2677372262773723, + "grad_norm": 1.0785566378114326, + "learning_rate": 8.590269503406986e-06, + "loss": 0.3403, + "step": 2751 + }, + { + "epoch": 0.2678345498783455, + "grad_norm": 1.2447292785758555, + "learning_rate": 8.589172340691235e-06, + "loss": 0.3873, + "step": 2752 + }, + { + "epoch": 0.26793187347931874, + "grad_norm": 1.166378916722292, + "learning_rate": 8.588074821317222e-06, + "loss": 0.3264, + "step": 2753 + }, + { + "epoch": 0.268029197080292, + "grad_norm": 1.2197572995933224, + "learning_rate": 8.586976945394008e-06, + "loss": 0.3793, + "step": 2754 + }, + { + "epoch": 0.2681265206812652, + "grad_norm": 1.6234832434134598, + "learning_rate": 8.58587871303069e-06, + "loss": 0.5521, + "step": 2755 + }, + { + "epoch": 0.26822384428223844, + "grad_norm": 1.4760533014923396, + "learning_rate": 8.584780124336403e-06, + "loss": 0.5024, + "step": 2756 + }, + { + "epoch": 0.2683211678832117, + "grad_norm": 1.4156240197993037, + "learning_rate": 8.58368117942031e-06, + "loss": 0.2848, + "step": 2757 + }, + { + "epoch": 0.2684184914841849, + "grad_norm": 1.9092848960981135, + "learning_rate": 8.582581878391614e-06, + "loss": 0.4053, + "step": 2758 + }, + { + "epoch": 0.26851581508515815, + "grad_norm": 1.2158050168465575, + "learning_rate": 8.581482221359557e-06, + "loss": 0.2709, + "step": 2759 + }, + { + "epoch": 0.2686131386861314, + "grad_norm": 1.5515245630825936, + "learning_rate": 8.580382208433408e-06, + "loss": 0.4549, + "step": 2760 + }, + { + "epoch": 0.2687104622871046, + "grad_norm": 1.6603384837941395, + "learning_rate": 8.57928183972248e-06, + "loss": 0.3316, + "step": 2761 + }, + { + "epoch": 0.26880778588807785, + "grad_norm": 1.5595744401068579, + "learning_rate": 8.578181115336114e-06, + "loss": 0.5733, + "step": 2762 + }, + { + "epoch": 0.2689051094890511, + "grad_norm": 1.3547786308004384, + "learning_rate": 8.577080035383693e-06, + "loss": 0.5295, + "step": 2763 + }, + { + "epoch": 0.2690024330900243, + "grad_norm": 1.2889595684224195, + "learning_rate": 8.57597859997463e-06, + "loss": 0.3876, + "step": 2764 + }, + { + "epoch": 0.26909975669099756, + "grad_norm": 1.5401948742368967, + "learning_rate": 8.574876809218375e-06, + "loss": 0.4847, + "step": 2765 + }, + { + "epoch": 0.2691970802919708, + "grad_norm": 1.5886773556984544, + "learning_rate": 8.573774663224414e-06, + "loss": 0.4746, + "step": 2766 + }, + { + "epoch": 0.269294403892944, + "grad_norm": 1.2747463684628804, + "learning_rate": 8.572672162102269e-06, + "loss": 0.2568, + "step": 2767 + }, + { + "epoch": 0.26939172749391727, + "grad_norm": 1.1674673988315882, + "learning_rate": 8.571569305961495e-06, + "loss": 0.4329, + "step": 2768 + }, + { + "epoch": 0.2694890510948905, + "grad_norm": 1.6882113617461265, + "learning_rate": 8.570466094911684e-06, + "loss": 0.6891, + "step": 2769 + }, + { + "epoch": 0.2695863746958638, + "grad_norm": 1.6660737969996857, + "learning_rate": 8.569362529062461e-06, + "loss": 0.5887, + "step": 2770 + }, + { + "epoch": 0.26968369829683697, + "grad_norm": 1.1653044559020052, + "learning_rate": 8.568258608523491e-06, + "loss": 0.2452, + "step": 2771 + }, + { + "epoch": 0.2697810218978102, + "grad_norm": 1.5681206888540218, + "learning_rate": 8.567154333404471e-06, + "loss": 0.4952, + "step": 2772 + }, + { + "epoch": 0.2698783454987835, + "grad_norm": 1.3994591247160806, + "learning_rate": 8.56604970381513e-06, + "loss": 0.2848, + "step": 2773 + }, + { + "epoch": 0.2699756690997567, + "grad_norm": 1.300192393224716, + "learning_rate": 8.564944719865238e-06, + "loss": 0.3924, + "step": 2774 + }, + { + "epoch": 0.27007299270072993, + "grad_norm": 1.4412015443912716, + "learning_rate": 8.5638393816646e-06, + "loss": 0.4531, + "step": 2775 + }, + { + "epoch": 0.2701703163017032, + "grad_norm": 1.4360872043281558, + "learning_rate": 8.56273368932305e-06, + "loss": 0.4571, + "step": 2776 + }, + { + "epoch": 0.2702676399026764, + "grad_norm": 1.5811581309774965, + "learning_rate": 8.561627642950465e-06, + "loss": 0.4638, + "step": 2777 + }, + { + "epoch": 0.27036496350364964, + "grad_norm": 1.7924696283680308, + "learning_rate": 8.560521242656751e-06, + "loss": 0.2922, + "step": 2778 + }, + { + "epoch": 0.2704622871046229, + "grad_norm": 1.7929283253885162, + "learning_rate": 8.559414488551854e-06, + "loss": 0.6197, + "step": 2779 + }, + { + "epoch": 0.2705596107055961, + "grad_norm": 1.5593955671219286, + "learning_rate": 8.558307380745751e-06, + "loss": 0.5448, + "step": 2780 + }, + { + "epoch": 0.27065693430656934, + "grad_norm": 1.3760682204767343, + "learning_rate": 8.557199919348455e-06, + "loss": 0.4434, + "step": 2781 + }, + { + "epoch": 0.2707542579075426, + "grad_norm": 3.203989647256839, + "learning_rate": 8.556092104470019e-06, + "loss": 0.4323, + "step": 2782 + }, + { + "epoch": 0.2708515815085158, + "grad_norm": 1.3460764595466628, + "learning_rate": 8.554983936220525e-06, + "loss": 0.3367, + "step": 2783 + }, + { + "epoch": 0.27094890510948905, + "grad_norm": 1.6160732245190643, + "learning_rate": 8.553875414710088e-06, + "loss": 0.5301, + "step": 2784 + }, + { + "epoch": 0.2710462287104623, + "grad_norm": 1.5749454761331767, + "learning_rate": 8.552766540048872e-06, + "loss": 0.3741, + "step": 2785 + }, + { + "epoch": 0.2711435523114355, + "grad_norm": 1.150423059184381, + "learning_rate": 8.551657312347057e-06, + "loss": 0.2796, + "step": 2786 + }, + { + "epoch": 0.27124087591240875, + "grad_norm": 1.4217054664233575, + "learning_rate": 8.550547731714874e-06, + "loss": 0.4543, + "step": 2787 + }, + { + "epoch": 0.271338199513382, + "grad_norm": 1.470206005686861, + "learning_rate": 8.54943779826258e-06, + "loss": 0.438, + "step": 2788 + }, + { + "epoch": 0.2714355231143552, + "grad_norm": 1.5766219733733982, + "learning_rate": 8.54832751210047e-06, + "loss": 0.4966, + "step": 2789 + }, + { + "epoch": 0.27153284671532846, + "grad_norm": 1.2135102045567707, + "learning_rate": 8.547216873338876e-06, + "loss": 0.358, + "step": 2790 + }, + { + "epoch": 0.2716301703163017, + "grad_norm": 1.4595225616938101, + "learning_rate": 8.546105882088158e-06, + "loss": 0.2225, + "step": 2791 + }, + { + "epoch": 0.27172749391727496, + "grad_norm": 1.3363330099445299, + "learning_rate": 8.54499453845872e-06, + "loss": 0.3914, + "step": 2792 + }, + { + "epoch": 0.27182481751824816, + "grad_norm": 1.3646141902938869, + "learning_rate": 8.543882842560997e-06, + "loss": 0.4558, + "step": 2793 + }, + { + "epoch": 0.2719221411192214, + "grad_norm": 1.3464180828493995, + "learning_rate": 8.542770794505456e-06, + "loss": 0.4786, + "step": 2794 + }, + { + "epoch": 0.27201946472019467, + "grad_norm": 1.044551377255888, + "learning_rate": 8.541658394402606e-06, + "loss": 0.303, + "step": 2795 + }, + { + "epoch": 0.27211678832116787, + "grad_norm": 1.6706499263846184, + "learning_rate": 8.540545642362982e-06, + "loss": 0.4033, + "step": 2796 + }, + { + "epoch": 0.2722141119221411, + "grad_norm": 1.3164784669169094, + "learning_rate": 8.539432538497162e-06, + "loss": 0.4343, + "step": 2797 + }, + { + "epoch": 0.2723114355231144, + "grad_norm": 1.6044535524867656, + "learning_rate": 8.538319082915757e-06, + "loss": 0.3641, + "step": 2798 + }, + { + "epoch": 0.2724087591240876, + "grad_norm": 1.9897822202433566, + "learning_rate": 8.537205275729406e-06, + "loss": 0.48, + "step": 2799 + }, + { + "epoch": 0.2725060827250608, + "grad_norm": 1.4110579632506512, + "learning_rate": 8.536091117048794e-06, + "loss": 0.4798, + "step": 2800 + }, + { + "epoch": 0.2726034063260341, + "grad_norm": 1.4415607317920478, + "learning_rate": 8.534976606984636e-06, + "loss": 0.343, + "step": 2801 + }, + { + "epoch": 0.2727007299270073, + "grad_norm": 1.6363482727427716, + "learning_rate": 8.53386174564768e-06, + "loss": 0.6087, + "step": 2802 + }, + { + "epoch": 0.27279805352798053, + "grad_norm": 1.1272383780084416, + "learning_rate": 8.532746533148708e-06, + "loss": 0.2444, + "step": 2803 + }, + { + "epoch": 0.2728953771289538, + "grad_norm": 1.712140222332907, + "learning_rate": 8.531630969598544e-06, + "loss": 0.6702, + "step": 2804 + }, + { + "epoch": 0.272992700729927, + "grad_norm": 1.474485197586056, + "learning_rate": 8.530515055108038e-06, + "loss": 0.3876, + "step": 2805 + }, + { + "epoch": 0.27309002433090024, + "grad_norm": 1.2926370708159094, + "learning_rate": 8.529398789788082e-06, + "loss": 0.3239, + "step": 2806 + }, + { + "epoch": 0.2731873479318735, + "grad_norm": 1.1171205940753008, + "learning_rate": 8.528282173749599e-06, + "loss": 0.3135, + "step": 2807 + }, + { + "epoch": 0.2732846715328467, + "grad_norm": 1.3561762741371761, + "learning_rate": 8.527165207103546e-06, + "loss": 0.4686, + "step": 2808 + }, + { + "epoch": 0.27338199513381994, + "grad_norm": 1.3082129080843141, + "learning_rate": 8.52604788996092e-06, + "loss": 0.4274, + "step": 2809 + }, + { + "epoch": 0.2734793187347932, + "grad_norm": 1.2958697823961909, + "learning_rate": 8.524930222432748e-06, + "loss": 0.4334, + "step": 2810 + }, + { + "epoch": 0.2735766423357664, + "grad_norm": 1.4541266485936315, + "learning_rate": 8.523812204630093e-06, + "loss": 0.5685, + "step": 2811 + }, + { + "epoch": 0.27367396593673965, + "grad_norm": 1.3303596097899522, + "learning_rate": 8.522693836664052e-06, + "loss": 0.4305, + "step": 2812 + }, + { + "epoch": 0.2737712895377129, + "grad_norm": 1.220005269273729, + "learning_rate": 8.521575118645761e-06, + "loss": 0.4281, + "step": 2813 + }, + { + "epoch": 0.27386861313868616, + "grad_norm": 1.0981673276035366, + "learning_rate": 8.520456050686384e-06, + "loss": 0.3641, + "step": 2814 + }, + { + "epoch": 0.27396593673965935, + "grad_norm": 1.4310281439998578, + "learning_rate": 8.519336632897128e-06, + "loss": 0.557, + "step": 2815 + }, + { + "epoch": 0.2740632603406326, + "grad_norm": 1.345841620727785, + "learning_rate": 8.518216865389227e-06, + "loss": 0.3991, + "step": 2816 + }, + { + "epoch": 0.27416058394160586, + "grad_norm": 1.6650753610183784, + "learning_rate": 8.517096748273951e-06, + "loss": 0.3624, + "step": 2817 + }, + { + "epoch": 0.27425790754257906, + "grad_norm": 1.2633026385457689, + "learning_rate": 8.515976281662613e-06, + "loss": 0.349, + "step": 2818 + }, + { + "epoch": 0.2743552311435523, + "grad_norm": 1.392024932172172, + "learning_rate": 8.514855465666546e-06, + "loss": 0.4514, + "step": 2819 + }, + { + "epoch": 0.27445255474452557, + "grad_norm": 1.4295145565971665, + "learning_rate": 8.513734300397135e-06, + "loss": 0.5668, + "step": 2820 + }, + { + "epoch": 0.27454987834549877, + "grad_norm": 1.0967459926110283, + "learning_rate": 8.512612785965787e-06, + "loss": 0.1808, + "step": 2821 + }, + { + "epoch": 0.274647201946472, + "grad_norm": 1.4843839946273536, + "learning_rate": 8.511490922483946e-06, + "loss": 0.4352, + "step": 2822 + }, + { + "epoch": 0.2747445255474453, + "grad_norm": 1.339649820333997, + "learning_rate": 8.510368710063093e-06, + "loss": 0.3137, + "step": 2823 + }, + { + "epoch": 0.27484184914841847, + "grad_norm": 1.32567882782868, + "learning_rate": 8.509246148814745e-06, + "loss": 0.4089, + "step": 2824 + }, + { + "epoch": 0.2749391727493917, + "grad_norm": 1.2497731956714773, + "learning_rate": 8.50812323885045e-06, + "loss": 0.382, + "step": 2825 + }, + { + "epoch": 0.275036496350365, + "grad_norm": 1.5771259884963846, + "learning_rate": 8.506999980281791e-06, + "loss": 0.501, + "step": 2826 + }, + { + "epoch": 0.2751338199513382, + "grad_norm": 1.3295615561309837, + "learning_rate": 8.505876373220393e-06, + "loss": 0.3635, + "step": 2827 + }, + { + "epoch": 0.27523114355231143, + "grad_norm": 1.55543645713159, + "learning_rate": 8.504752417777899e-06, + "loss": 0.2986, + "step": 2828 + }, + { + "epoch": 0.2753284671532847, + "grad_norm": 1.421283473121396, + "learning_rate": 8.503628114066008e-06, + "loss": 0.4931, + "step": 2829 + }, + { + "epoch": 0.2754257907542579, + "grad_norm": 1.1988827610585986, + "learning_rate": 8.502503462196435e-06, + "loss": 0.3272, + "step": 2830 + }, + { + "epoch": 0.27552311435523114, + "grad_norm": 1.6163491550131937, + "learning_rate": 8.501378462280941e-06, + "loss": 0.5794, + "step": 2831 + }, + { + "epoch": 0.2756204379562044, + "grad_norm": 1.6499795796835799, + "learning_rate": 8.500253114431316e-06, + "loss": 0.3668, + "step": 2832 + }, + { + "epoch": 0.27571776155717764, + "grad_norm": 1.7305434923413188, + "learning_rate": 8.499127418759388e-06, + "loss": 0.5291, + "step": 2833 + }, + { + "epoch": 0.27581508515815084, + "grad_norm": 1.4062980643641485, + "learning_rate": 8.498001375377018e-06, + "loss": 0.4645, + "step": 2834 + }, + { + "epoch": 0.2759124087591241, + "grad_norm": 1.2961260919749351, + "learning_rate": 8.496874984396101e-06, + "loss": 0.2517, + "step": 2835 + }, + { + "epoch": 0.27600973236009735, + "grad_norm": 1.4273972641674804, + "learning_rate": 8.495748245928568e-06, + "loss": 0.4705, + "step": 2836 + }, + { + "epoch": 0.27610705596107055, + "grad_norm": 1.1525746776855315, + "learning_rate": 8.494621160086383e-06, + "loss": 0.3747, + "step": 2837 + }, + { + "epoch": 0.2762043795620438, + "grad_norm": 1.6083708658269757, + "learning_rate": 8.493493726981545e-06, + "loss": 0.5754, + "step": 2838 + }, + { + "epoch": 0.27630170316301705, + "grad_norm": 1.6380932846987073, + "learning_rate": 8.492365946726087e-06, + "loss": 0.4668, + "step": 2839 + }, + { + "epoch": 0.27639902676399025, + "grad_norm": 1.3587028332396105, + "learning_rate": 8.491237819432081e-06, + "loss": 0.3466, + "step": 2840 + }, + { + "epoch": 0.2764963503649635, + "grad_norm": 1.5812508624530597, + "learning_rate": 8.490109345211625e-06, + "loss": 0.628, + "step": 2841 + }, + { + "epoch": 0.27659367396593676, + "grad_norm": 1.359461682943084, + "learning_rate": 8.48898052417686e-06, + "loss": 0.4799, + "step": 2842 + }, + { + "epoch": 0.27669099756690996, + "grad_norm": 1.3773089875645015, + "learning_rate": 8.487851356439953e-06, + "loss": 0.3064, + "step": 2843 + }, + { + "epoch": 0.2767883211678832, + "grad_norm": 1.445505572645753, + "learning_rate": 8.486721842113114e-06, + "loss": 0.4629, + "step": 2844 + }, + { + "epoch": 0.27688564476885646, + "grad_norm": 2.1729540442826796, + "learning_rate": 8.485591981308584e-06, + "loss": 0.501, + "step": 2845 + }, + { + "epoch": 0.27698296836982966, + "grad_norm": 1.2698072866971275, + "learning_rate": 8.484461774138635e-06, + "loss": 0.3354, + "step": 2846 + }, + { + "epoch": 0.2770802919708029, + "grad_norm": 1.2270792461817257, + "learning_rate": 8.483331220715578e-06, + "loss": 0.2925, + "step": 2847 + }, + { + "epoch": 0.27717761557177617, + "grad_norm": 1.4982940191444252, + "learning_rate": 8.482200321151757e-06, + "loss": 0.4372, + "step": 2848 + }, + { + "epoch": 0.27727493917274937, + "grad_norm": 1.7962422459275051, + "learning_rate": 8.48106907555955e-06, + "loss": 0.2514, + "step": 2849 + }, + { + "epoch": 0.2773722627737226, + "grad_norm": 1.1765428275481227, + "learning_rate": 8.479937484051368e-06, + "loss": 0.2466, + "step": 2850 + }, + { + "epoch": 0.2774695863746959, + "grad_norm": 1.3671035304850088, + "learning_rate": 8.47880554673966e-06, + "loss": 0.4388, + "step": 2851 + }, + { + "epoch": 0.2775669099756691, + "grad_norm": 1.584083262413021, + "learning_rate": 8.477673263736908e-06, + "loss": 0.3117, + "step": 2852 + }, + { + "epoch": 0.2776642335766423, + "grad_norm": 1.6251518472003594, + "learning_rate": 8.476540635155623e-06, + "loss": 0.4661, + "step": 2853 + }, + { + "epoch": 0.2777615571776156, + "grad_norm": 1.6392857489539867, + "learning_rate": 8.475407661108361e-06, + "loss": 0.354, + "step": 2854 + }, + { + "epoch": 0.27785888077858883, + "grad_norm": 1.3195625296951223, + "learning_rate": 8.474274341707702e-06, + "loss": 0.3744, + "step": 2855 + }, + { + "epoch": 0.27795620437956203, + "grad_norm": 1.34410915454318, + "learning_rate": 8.473140677066267e-06, + "loss": 0.4069, + "step": 2856 + }, + { + "epoch": 0.2780535279805353, + "grad_norm": 1.0527413957181246, + "learning_rate": 8.472006667296709e-06, + "loss": 0.2776, + "step": 2857 + }, + { + "epoch": 0.27815085158150854, + "grad_norm": 1.496471387248685, + "learning_rate": 8.470872312511714e-06, + "loss": 0.3642, + "step": 2858 + }, + { + "epoch": 0.27824817518248174, + "grad_norm": 1.532429299396127, + "learning_rate": 8.469737612824001e-06, + "loss": 0.44, + "step": 2859 + }, + { + "epoch": 0.278345498783455, + "grad_norm": 1.601112711944827, + "learning_rate": 8.468602568346332e-06, + "loss": 0.421, + "step": 2860 + }, + { + "epoch": 0.27844282238442825, + "grad_norm": 1.5148720198103927, + "learning_rate": 8.467467179191493e-06, + "loss": 0.5258, + "step": 2861 + }, + { + "epoch": 0.27854014598540144, + "grad_norm": 1.573048120862393, + "learning_rate": 8.466331445472308e-06, + "loss": 0.4507, + "step": 2862 + }, + { + "epoch": 0.2786374695863747, + "grad_norm": 1.3938890789758775, + "learning_rate": 8.465195367301639e-06, + "loss": 0.3365, + "step": 2863 + }, + { + "epoch": 0.27873479318734795, + "grad_norm": 1.6895380781567202, + "learning_rate": 8.464058944792375e-06, + "loss": 0.4132, + "step": 2864 + }, + { + "epoch": 0.27883211678832115, + "grad_norm": 1.6880546647255488, + "learning_rate": 8.462922178057444e-06, + "loss": 0.2605, + "step": 2865 + }, + { + "epoch": 0.2789294403892944, + "grad_norm": 1.491755717654464, + "learning_rate": 8.46178506720981e-06, + "loss": 0.3983, + "step": 2866 + }, + { + "epoch": 0.27902676399026766, + "grad_norm": 1.5848666178901887, + "learning_rate": 8.460647612362464e-06, + "loss": 0.5101, + "step": 2867 + }, + { + "epoch": 0.27912408759124085, + "grad_norm": 1.3442317187907376, + "learning_rate": 8.459509813628437e-06, + "loss": 0.458, + "step": 2868 + }, + { + "epoch": 0.2792214111922141, + "grad_norm": 1.8095809186860319, + "learning_rate": 8.458371671120795e-06, + "loss": 0.382, + "step": 2869 + }, + { + "epoch": 0.27931873479318736, + "grad_norm": 0.9909926300929587, + "learning_rate": 8.457233184952635e-06, + "loss": 0.2292, + "step": 2870 + }, + { + "epoch": 0.27941605839416056, + "grad_norm": 1.7013118787018624, + "learning_rate": 8.456094355237086e-06, + "loss": 0.6861, + "step": 2871 + }, + { + "epoch": 0.2795133819951338, + "grad_norm": 3.4293212695090025, + "learning_rate": 8.45495518208732e-06, + "loss": 0.3233, + "step": 2872 + }, + { + "epoch": 0.27961070559610707, + "grad_norm": 1.4903797163776311, + "learning_rate": 8.45381566561653e-06, + "loss": 0.3231, + "step": 2873 + }, + { + "epoch": 0.27970802919708027, + "grad_norm": 1.5615177882070261, + "learning_rate": 8.452675805937956e-06, + "loss": 0.4125, + "step": 2874 + }, + { + "epoch": 0.2798053527980535, + "grad_norm": 1.4099046900170047, + "learning_rate": 8.451535603164865e-06, + "loss": 0.4967, + "step": 2875 + }, + { + "epoch": 0.2799026763990268, + "grad_norm": 1.383217014263479, + "learning_rate": 8.450395057410561e-06, + "loss": 0.3411, + "step": 2876 + }, + { + "epoch": 0.28, + "grad_norm": 1.2661588037606646, + "learning_rate": 8.449254168788377e-06, + "loss": 0.3734, + "step": 2877 + }, + { + "epoch": 0.2800973236009732, + "grad_norm": 1.4107359648240771, + "learning_rate": 8.448112937411689e-06, + "loss": 0.4765, + "step": 2878 + }, + { + "epoch": 0.2801946472019465, + "grad_norm": 1.567373989947911, + "learning_rate": 8.446971363393897e-06, + "loss": 0.5806, + "step": 2879 + }, + { + "epoch": 0.28029197080291973, + "grad_norm": 1.5980994022663064, + "learning_rate": 8.445829446848442e-06, + "loss": 0.3765, + "step": 2880 + }, + { + "epoch": 0.28038929440389293, + "grad_norm": 1.5582627635759285, + "learning_rate": 8.444687187888798e-06, + "loss": 0.3838, + "step": 2881 + }, + { + "epoch": 0.2804866180048662, + "grad_norm": 2.097365147798996, + "learning_rate": 8.44354458662847e-06, + "loss": 0.6467, + "step": 2882 + }, + { + "epoch": 0.28058394160583944, + "grad_norm": 1.5302257615618868, + "learning_rate": 8.442401643181e-06, + "loss": 0.4415, + "step": 2883 + }, + { + "epoch": 0.28068126520681264, + "grad_norm": 1.1646338986978766, + "learning_rate": 8.441258357659962e-06, + "loss": 0.3176, + "step": 2884 + }, + { + "epoch": 0.2807785888077859, + "grad_norm": 1.2287928718701633, + "learning_rate": 8.440114730178968e-06, + "loss": 0.4175, + "step": 2885 + }, + { + "epoch": 0.28087591240875914, + "grad_norm": 1.4416072881006319, + "learning_rate": 8.438970760851658e-06, + "loss": 0.4838, + "step": 2886 + }, + { + "epoch": 0.28097323600973234, + "grad_norm": 1.319870372533973, + "learning_rate": 8.437826449791709e-06, + "loss": 0.3421, + "step": 2887 + }, + { + "epoch": 0.2810705596107056, + "grad_norm": 1.6261475252650914, + "learning_rate": 8.436681797112833e-06, + "loss": 0.5019, + "step": 2888 + }, + { + "epoch": 0.28116788321167885, + "grad_norm": 1.6203143716652342, + "learning_rate": 8.435536802928774e-06, + "loss": 0.4282, + "step": 2889 + }, + { + "epoch": 0.28126520681265205, + "grad_norm": 1.4127079920263665, + "learning_rate": 8.434391467353312e-06, + "loss": 0.4542, + "step": 2890 + }, + { + "epoch": 0.2813625304136253, + "grad_norm": 1.1756885783532405, + "learning_rate": 8.433245790500258e-06, + "loss": 0.3563, + "step": 2891 + }, + { + "epoch": 0.28145985401459855, + "grad_norm": 1.1824997482138238, + "learning_rate": 8.43209977248346e-06, + "loss": 0.3628, + "step": 2892 + }, + { + "epoch": 0.28155717761557175, + "grad_norm": 1.4280724079623635, + "learning_rate": 8.430953413416798e-06, + "loss": 0.446, + "step": 2893 + }, + { + "epoch": 0.281654501216545, + "grad_norm": 1.0710350994410123, + "learning_rate": 8.429806713414188e-06, + "loss": 0.2016, + "step": 2894 + }, + { + "epoch": 0.28175182481751826, + "grad_norm": 1.453985226232095, + "learning_rate": 8.428659672589574e-06, + "loss": 0.4325, + "step": 2895 + }, + { + "epoch": 0.28184914841849146, + "grad_norm": 1.3045306996673216, + "learning_rate": 8.427512291056943e-06, + "loss": 0.3838, + "step": 2896 + }, + { + "epoch": 0.2819464720194647, + "grad_norm": 1.483337521636422, + "learning_rate": 8.426364568930309e-06, + "loss": 0.4212, + "step": 2897 + }, + { + "epoch": 0.28204379562043796, + "grad_norm": 1.0901324802348065, + "learning_rate": 8.425216506323721e-06, + "loss": 0.2392, + "step": 2898 + }, + { + "epoch": 0.2821411192214112, + "grad_norm": 1.3761268679827663, + "learning_rate": 8.424068103351264e-06, + "loss": 0.4459, + "step": 2899 + }, + { + "epoch": 0.2822384428223844, + "grad_norm": 1.461105500215717, + "learning_rate": 8.422919360127053e-06, + "loss": 0.5018, + "step": 2900 + }, + { + "epoch": 0.28233576642335767, + "grad_norm": 1.4314465150478046, + "learning_rate": 8.421770276765245e-06, + "loss": 0.4474, + "step": 2901 + }, + { + "epoch": 0.2824330900243309, + "grad_norm": 1.6060806185106393, + "learning_rate": 8.420620853380018e-06, + "loss": 0.5798, + "step": 2902 + }, + { + "epoch": 0.2825304136253041, + "grad_norm": 1.4468000025910832, + "learning_rate": 8.419471090085596e-06, + "loss": 0.5597, + "step": 2903 + }, + { + "epoch": 0.2826277372262774, + "grad_norm": 5.585104457387235, + "learning_rate": 8.41832098699623e-06, + "loss": 0.3493, + "step": 2904 + }, + { + "epoch": 0.28272506082725063, + "grad_norm": 1.3577816273786794, + "learning_rate": 8.417170544226205e-06, + "loss": 0.3262, + "step": 2905 + }, + { + "epoch": 0.2828223844282238, + "grad_norm": 1.1546363912171016, + "learning_rate": 8.416019761889845e-06, + "loss": 0.3691, + "step": 2906 + }, + { + "epoch": 0.2829197080291971, + "grad_norm": 1.3224407401265832, + "learning_rate": 8.4148686401015e-06, + "loss": 0.3079, + "step": 2907 + }, + { + "epoch": 0.28301703163017033, + "grad_norm": 1.5947860641264806, + "learning_rate": 8.413717178975558e-06, + "loss": 0.277, + "step": 2908 + }, + { + "epoch": 0.28311435523114353, + "grad_norm": 1.343045870800707, + "learning_rate": 8.412565378626442e-06, + "loss": 0.3448, + "step": 2909 + }, + { + "epoch": 0.2832116788321168, + "grad_norm": 1.5567901041780798, + "learning_rate": 8.411413239168609e-06, + "loss": 0.3954, + "step": 2910 + }, + { + "epoch": 0.28330900243309004, + "grad_norm": 1.5232536009297208, + "learning_rate": 8.410260760716545e-06, + "loss": 0.5103, + "step": 2911 + }, + { + "epoch": 0.28340632603406324, + "grad_norm": 1.2493384040941995, + "learning_rate": 8.409107943384773e-06, + "loss": 0.3671, + "step": 2912 + }, + { + "epoch": 0.2835036496350365, + "grad_norm": 1.246217249188392, + "learning_rate": 8.407954787287848e-06, + "loss": 0.4112, + "step": 2913 + }, + { + "epoch": 0.28360097323600975, + "grad_norm": 1.2012340002353967, + "learning_rate": 8.406801292540364e-06, + "loss": 0.3769, + "step": 2914 + }, + { + "epoch": 0.28369829683698294, + "grad_norm": 1.51749407168492, + "learning_rate": 8.405647459256939e-06, + "loss": 0.5515, + "step": 2915 + }, + { + "epoch": 0.2837956204379562, + "grad_norm": 1.1589770762667257, + "learning_rate": 8.404493287552232e-06, + "loss": 0.2577, + "step": 2916 + }, + { + "epoch": 0.28389294403892945, + "grad_norm": 1.5139932402052954, + "learning_rate": 8.403338777540936e-06, + "loss": 0.4796, + "step": 2917 + }, + { + "epoch": 0.28399026763990265, + "grad_norm": 1.5544290759133006, + "learning_rate": 8.402183929337774e-06, + "loss": 0.4594, + "step": 2918 + }, + { + "epoch": 0.2840875912408759, + "grad_norm": 1.3525572627526583, + "learning_rate": 8.401028743057503e-06, + "loss": 0.3978, + "step": 2919 + }, + { + "epoch": 0.28418491484184916, + "grad_norm": 1.3610916698563846, + "learning_rate": 8.399873218814916e-06, + "loss": 0.4308, + "step": 2920 + }, + { + "epoch": 0.2842822384428224, + "grad_norm": 1.2060322500759533, + "learning_rate": 8.398717356724837e-06, + "loss": 0.482, + "step": 2921 + }, + { + "epoch": 0.2843795620437956, + "grad_norm": 1.152727586861314, + "learning_rate": 8.397561156902126e-06, + "loss": 0.3862, + "step": 2922 + }, + { + "epoch": 0.28447688564476886, + "grad_norm": 1.6371195081735355, + "learning_rate": 8.396404619461673e-06, + "loss": 0.684, + "step": 2923 + }, + { + "epoch": 0.2845742092457421, + "grad_norm": 1.4756480619833048, + "learning_rate": 8.395247744518407e-06, + "loss": 0.4432, + "step": 2924 + }, + { + "epoch": 0.2846715328467153, + "grad_norm": 1.3495353534897125, + "learning_rate": 8.394090532187286e-06, + "loss": 0.4574, + "step": 2925 + }, + { + "epoch": 0.28476885644768857, + "grad_norm": 1.361248347874279, + "learning_rate": 8.392932982583301e-06, + "loss": 0.3117, + "step": 2926 + }, + { + "epoch": 0.2848661800486618, + "grad_norm": 1.5493409509214389, + "learning_rate": 8.391775095821481e-06, + "loss": 0.5949, + "step": 2927 + }, + { + "epoch": 0.284963503649635, + "grad_norm": 1.5159424124979992, + "learning_rate": 8.390616872016886e-06, + "loss": 0.612, + "step": 2928 + }, + { + "epoch": 0.2850608272506083, + "grad_norm": 0.9819694068633834, + "learning_rate": 8.389458311284606e-06, + "loss": 0.2407, + "step": 2929 + }, + { + "epoch": 0.2851581508515815, + "grad_norm": 1.4338313849048412, + "learning_rate": 8.388299413739772e-06, + "loss": 0.504, + "step": 2930 + }, + { + "epoch": 0.2852554744525547, + "grad_norm": 1.6033282710660985, + "learning_rate": 8.387140179497541e-06, + "loss": 0.4686, + "step": 2931 + }, + { + "epoch": 0.285352798053528, + "grad_norm": 1.4339139409278308, + "learning_rate": 8.38598060867311e-06, + "loss": 0.5885, + "step": 2932 + }, + { + "epoch": 0.28545012165450123, + "grad_norm": 1.6962944035069916, + "learning_rate": 8.384820701381705e-06, + "loss": 0.6325, + "step": 2933 + }, + { + "epoch": 0.28554744525547443, + "grad_norm": 1.2380931242026982, + "learning_rate": 8.383660457738585e-06, + "loss": 0.3528, + "step": 2934 + }, + { + "epoch": 0.2856447688564477, + "grad_norm": 1.4958548492045998, + "learning_rate": 8.382499877859046e-06, + "loss": 0.5261, + "step": 2935 + }, + { + "epoch": 0.28574209245742094, + "grad_norm": 1.2493863506860636, + "learning_rate": 8.381338961858417e-06, + "loss": 0.295, + "step": 2936 + }, + { + "epoch": 0.28583941605839414, + "grad_norm": 1.0264542939220365, + "learning_rate": 8.380177709852055e-06, + "loss": 0.2736, + "step": 2937 + }, + { + "epoch": 0.2859367396593674, + "grad_norm": 1.3694559515073481, + "learning_rate": 8.379016121955358e-06, + "loss": 0.2437, + "step": 2938 + }, + { + "epoch": 0.28603406326034064, + "grad_norm": 1.3958652644514353, + "learning_rate": 8.377854198283751e-06, + "loss": 0.5162, + "step": 2939 + }, + { + "epoch": 0.28613138686131384, + "grad_norm": 1.3188642877167738, + "learning_rate": 8.376691938952694e-06, + "loss": 0.4403, + "step": 2940 + }, + { + "epoch": 0.2862287104622871, + "grad_norm": 1.5563883463328907, + "learning_rate": 8.375529344077686e-06, + "loss": 0.3871, + "step": 2941 + }, + { + "epoch": 0.28632603406326035, + "grad_norm": 1.7106139691477682, + "learning_rate": 8.37436641377425e-06, + "loss": 0.5998, + "step": 2942 + }, + { + "epoch": 0.2864233576642336, + "grad_norm": 1.8227768617334648, + "learning_rate": 8.373203148157953e-06, + "loss": 0.4192, + "step": 2943 + }, + { + "epoch": 0.2865206812652068, + "grad_norm": 1.3645142496496503, + "learning_rate": 8.372039547344383e-06, + "loss": 0.4301, + "step": 2944 + }, + { + "epoch": 0.28661800486618005, + "grad_norm": 1.4644520960794265, + "learning_rate": 8.370875611449173e-06, + "loss": 0.4333, + "step": 2945 + }, + { + "epoch": 0.2867153284671533, + "grad_norm": 1.3686778637415178, + "learning_rate": 8.369711340587981e-06, + "loss": 0.4735, + "step": 2946 + }, + { + "epoch": 0.2868126520681265, + "grad_norm": 1.7752150982830557, + "learning_rate": 8.368546734876499e-06, + "loss": 0.605, + "step": 2947 + }, + { + "epoch": 0.28690997566909976, + "grad_norm": 1.6349896239905135, + "learning_rate": 8.36738179443046e-06, + "loss": 0.4521, + "step": 2948 + }, + { + "epoch": 0.287007299270073, + "grad_norm": 1.7001103309282906, + "learning_rate": 8.366216519365623e-06, + "loss": 0.5243, + "step": 2949 + }, + { + "epoch": 0.2871046228710462, + "grad_norm": 1.3288526449094853, + "learning_rate": 8.365050909797779e-06, + "loss": 0.4226, + "step": 2950 + }, + { + "epoch": 0.28720194647201946, + "grad_norm": 1.0609308885865543, + "learning_rate": 8.36388496584276e-06, + "loss": 0.2761, + "step": 2951 + }, + { + "epoch": 0.2872992700729927, + "grad_norm": 1.3048762567541314, + "learning_rate": 8.362718687616422e-06, + "loss": 0.3166, + "step": 2952 + }, + { + "epoch": 0.2873965936739659, + "grad_norm": 1.5602591658770568, + "learning_rate": 8.361552075234664e-06, + "loss": 0.1814, + "step": 2953 + }, + { + "epoch": 0.28749391727493917, + "grad_norm": 1.261612878851385, + "learning_rate": 8.360385128813409e-06, + "loss": 0.3431, + "step": 2954 + }, + { + "epoch": 0.2875912408759124, + "grad_norm": 1.6502840086679433, + "learning_rate": 8.359217848468617e-06, + "loss": 0.5688, + "step": 2955 + }, + { + "epoch": 0.2876885644768856, + "grad_norm": 1.1758618501430975, + "learning_rate": 8.358050234316283e-06, + "loss": 0.376, + "step": 2956 + }, + { + "epoch": 0.2877858880778589, + "grad_norm": 1.3748216513361973, + "learning_rate": 8.356882286472433e-06, + "loss": 0.4893, + "step": 2957 + }, + { + "epoch": 0.28788321167883213, + "grad_norm": 1.490557754247365, + "learning_rate": 8.35571400505313e-06, + "loss": 0.4322, + "step": 2958 + }, + { + "epoch": 0.2879805352798053, + "grad_norm": 1.2474734521766377, + "learning_rate": 8.35454539017446e-06, + "loss": 0.249, + "step": 2959 + }, + { + "epoch": 0.2880778588807786, + "grad_norm": 1.3041956082790018, + "learning_rate": 8.353376441952554e-06, + "loss": 0.3629, + "step": 2960 + }, + { + "epoch": 0.28817518248175183, + "grad_norm": 1.1813542799359134, + "learning_rate": 8.352207160503572e-06, + "loss": 0.2541, + "step": 2961 + }, + { + "epoch": 0.2882725060827251, + "grad_norm": 1.6196703441196314, + "learning_rate": 8.351037545943702e-06, + "loss": 0.5863, + "step": 2962 + }, + { + "epoch": 0.2883698296836983, + "grad_norm": 1.6020435634219072, + "learning_rate": 8.34986759838917e-06, + "loss": 0.5539, + "step": 2963 + }, + { + "epoch": 0.28846715328467154, + "grad_norm": 1.6170521555116952, + "learning_rate": 8.348697317956238e-06, + "loss": 0.4234, + "step": 2964 + }, + { + "epoch": 0.2885644768856448, + "grad_norm": 1.2300623631368495, + "learning_rate": 8.347526704761193e-06, + "loss": 0.2784, + "step": 2965 + }, + { + "epoch": 0.288661800486618, + "grad_norm": 2.179168092375873, + "learning_rate": 8.346355758920364e-06, + "loss": 0.4561, + "step": 2966 + }, + { + "epoch": 0.28875912408759125, + "grad_norm": 1.5135423174141494, + "learning_rate": 8.345184480550104e-06, + "loss": 0.3807, + "step": 2967 + }, + { + "epoch": 0.2888564476885645, + "grad_norm": 1.7005351963186346, + "learning_rate": 8.344012869766808e-06, + "loss": 0.538, + "step": 2968 + }, + { + "epoch": 0.2889537712895377, + "grad_norm": 1.2789157911351394, + "learning_rate": 8.342840926686898e-06, + "loss": 0.2623, + "step": 2969 + }, + { + "epoch": 0.28905109489051095, + "grad_norm": 1.304761873055631, + "learning_rate": 8.34166865142683e-06, + "loss": 0.4219, + "step": 2970 + }, + { + "epoch": 0.2891484184914842, + "grad_norm": 1.6192760894025877, + "learning_rate": 8.340496044103095e-06, + "loss": 0.4378, + "step": 2971 + }, + { + "epoch": 0.2892457420924574, + "grad_norm": 1.4363442626245757, + "learning_rate": 8.339323104832214e-06, + "loss": 0.3819, + "step": 2972 + }, + { + "epoch": 0.28934306569343066, + "grad_norm": 1.5094300127764981, + "learning_rate": 8.338149833730742e-06, + "loss": 0.2769, + "step": 2973 + }, + { + "epoch": 0.2894403892944039, + "grad_norm": 1.6047897202306092, + "learning_rate": 8.33697623091527e-06, + "loss": 0.424, + "step": 2974 + }, + { + "epoch": 0.2895377128953771, + "grad_norm": 1.3129110600868221, + "learning_rate": 8.33580229650242e-06, + "loss": 0.5053, + "step": 2975 + }, + { + "epoch": 0.28963503649635036, + "grad_norm": 1.1812562932245452, + "learning_rate": 8.334628030608845e-06, + "loss": 0.3835, + "step": 2976 + }, + { + "epoch": 0.2897323600973236, + "grad_norm": 1.2211203388582414, + "learning_rate": 8.333453433351233e-06, + "loss": 0.3531, + "step": 2977 + }, + { + "epoch": 0.2898296836982968, + "grad_norm": 1.4620903484748373, + "learning_rate": 8.332278504846303e-06, + "loss": 0.4771, + "step": 2978 + }, + { + "epoch": 0.28992700729927007, + "grad_norm": 0.9704255718501243, + "learning_rate": 8.331103245210812e-06, + "loss": 0.2618, + "step": 2979 + }, + { + "epoch": 0.2900243309002433, + "grad_norm": 1.2827724622455963, + "learning_rate": 8.329927654561544e-06, + "loss": 0.3052, + "step": 2980 + }, + { + "epoch": 0.2901216545012165, + "grad_norm": 1.378581411338256, + "learning_rate": 8.328751733015316e-06, + "loss": 0.3568, + "step": 2981 + }, + { + "epoch": 0.2902189781021898, + "grad_norm": 1.769807570821765, + "learning_rate": 8.327575480688985e-06, + "loss": 0.3102, + "step": 2982 + }, + { + "epoch": 0.290316301703163, + "grad_norm": 1.4326301683333176, + "learning_rate": 8.32639889769943e-06, + "loss": 0.3218, + "step": 2983 + }, + { + "epoch": 0.2904136253041363, + "grad_norm": 1.5418816322088151, + "learning_rate": 8.325221984163575e-06, + "loss": 0.3257, + "step": 2984 + }, + { + "epoch": 0.2905109489051095, + "grad_norm": 1.573484642436306, + "learning_rate": 8.324044740198366e-06, + "loss": 0.5401, + "step": 2985 + }, + { + "epoch": 0.29060827250608273, + "grad_norm": 1.2270555416429247, + "learning_rate": 8.322867165920789e-06, + "loss": 0.3914, + "step": 2986 + }, + { + "epoch": 0.290705596107056, + "grad_norm": 1.1838846887742434, + "learning_rate": 8.321689261447858e-06, + "loss": 0.3282, + "step": 2987 + }, + { + "epoch": 0.2908029197080292, + "grad_norm": 1.5077214188811954, + "learning_rate": 8.320511026896624e-06, + "loss": 0.5279, + "step": 2988 + }, + { + "epoch": 0.29090024330900244, + "grad_norm": 1.1784061774291985, + "learning_rate": 8.31933246238417e-06, + "loss": 0.403, + "step": 2989 + }, + { + "epoch": 0.2909975669099757, + "grad_norm": 1.2176703537151474, + "learning_rate": 8.318153568027607e-06, + "loss": 0.4213, + "step": 2990 + }, + { + "epoch": 0.2910948905109489, + "grad_norm": 1.3475262123063816, + "learning_rate": 8.316974343944085e-06, + "loss": 0.4059, + "step": 2991 + }, + { + "epoch": 0.29119221411192214, + "grad_norm": 1.2398233047847593, + "learning_rate": 8.315794790250784e-06, + "loss": 0.2626, + "step": 2992 + }, + { + "epoch": 0.2912895377128954, + "grad_norm": 1.3862498175549538, + "learning_rate": 8.314614907064915e-06, + "loss": 0.4535, + "step": 2993 + }, + { + "epoch": 0.2913868613138686, + "grad_norm": 1.455622096437578, + "learning_rate": 8.313434694503727e-06, + "loss": 0.4067, + "step": 2994 + }, + { + "epoch": 0.29148418491484185, + "grad_norm": 1.4755183973829757, + "learning_rate": 8.312254152684496e-06, + "loss": 0.6493, + "step": 2995 + }, + { + "epoch": 0.2915815085158151, + "grad_norm": 1.0399713771806027, + "learning_rate": 8.311073281724536e-06, + "loss": 0.3051, + "step": 2996 + }, + { + "epoch": 0.2916788321167883, + "grad_norm": 1.3151300509583979, + "learning_rate": 8.309892081741186e-06, + "loss": 0.3982, + "step": 2997 + }, + { + "epoch": 0.29177615571776155, + "grad_norm": 1.376541833798208, + "learning_rate": 8.308710552851826e-06, + "loss": 0.4749, + "step": 2998 + }, + { + "epoch": 0.2918734793187348, + "grad_norm": 1.2551786912554768, + "learning_rate": 8.307528695173865e-06, + "loss": 0.3118, + "step": 2999 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 2.1707038191553463, + "learning_rate": 8.306346508824746e-06, + "loss": 0.3438, + "step": 3000 + }, + { + "epoch": 0.29206812652068126, + "grad_norm": 1.4299459588569998, + "learning_rate": 8.30516399392194e-06, + "loss": 0.4838, + "step": 3001 + }, + { + "epoch": 0.2921654501216545, + "grad_norm": 1.378341342959643, + "learning_rate": 8.303981150582958e-06, + "loss": 0.5055, + "step": 3002 + }, + { + "epoch": 0.2922627737226277, + "grad_norm": 1.4826508798742193, + "learning_rate": 8.302797978925338e-06, + "loss": 0.3737, + "step": 3003 + }, + { + "epoch": 0.29236009732360096, + "grad_norm": 1.222513403789782, + "learning_rate": 8.301614479066653e-06, + "loss": 0.4587, + "step": 3004 + }, + { + "epoch": 0.2924574209245742, + "grad_norm": 1.3819233250029228, + "learning_rate": 8.300430651124508e-06, + "loss": 0.4021, + "step": 3005 + }, + { + "epoch": 0.29255474452554747, + "grad_norm": 1.2846536784172882, + "learning_rate": 8.29924649521654e-06, + "loss": 0.3609, + "step": 3006 + }, + { + "epoch": 0.29265206812652067, + "grad_norm": 1.4274226525457885, + "learning_rate": 8.298062011460419e-06, + "loss": 0.5267, + "step": 3007 + }, + { + "epoch": 0.2927493917274939, + "grad_norm": 1.4642655922839627, + "learning_rate": 8.296877199973849e-06, + "loss": 0.3499, + "step": 3008 + }, + { + "epoch": 0.2928467153284672, + "grad_norm": 1.4317302181421974, + "learning_rate": 8.295692060874568e-06, + "loss": 0.4979, + "step": 3009 + }, + { + "epoch": 0.2929440389294404, + "grad_norm": 1.3191877461185262, + "learning_rate": 8.294506594280338e-06, + "loss": 0.2835, + "step": 3010 + }, + { + "epoch": 0.29304136253041363, + "grad_norm": 1.0943861065294986, + "learning_rate": 8.293320800308964e-06, + "loss": 0.2138, + "step": 3011 + }, + { + "epoch": 0.2931386861313869, + "grad_norm": 1.2621219805575281, + "learning_rate": 8.292134679078277e-06, + "loss": 0.3027, + "step": 3012 + }, + { + "epoch": 0.2932360097323601, + "grad_norm": 1.556172337566686, + "learning_rate": 8.290948230706145e-06, + "loss": 0.4462, + "step": 3013 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 1.3363658374504028, + "learning_rate": 8.289761455310463e-06, + "loss": 0.373, + "step": 3014 + }, + { + "epoch": 0.2934306569343066, + "grad_norm": 1.4458593210455408, + "learning_rate": 8.288574353009164e-06, + "loss": 0.5566, + "step": 3015 + }, + { + "epoch": 0.2935279805352798, + "grad_norm": 1.5034274044899172, + "learning_rate": 8.287386923920211e-06, + "loss": 0.3837, + "step": 3016 + }, + { + "epoch": 0.29362530413625304, + "grad_norm": 1.484769748600726, + "learning_rate": 8.286199168161598e-06, + "loss": 0.3173, + "step": 3017 + }, + { + "epoch": 0.2937226277372263, + "grad_norm": 1.4336064725306121, + "learning_rate": 8.285011085851353e-06, + "loss": 0.4005, + "step": 3018 + }, + { + "epoch": 0.2938199513381995, + "grad_norm": 1.3857231757141482, + "learning_rate": 8.283822677107539e-06, + "loss": 0.481, + "step": 3019 + }, + { + "epoch": 0.29391727493917275, + "grad_norm": 1.4086307294395457, + "learning_rate": 8.282633942048244e-06, + "loss": 0.4181, + "step": 3020 + }, + { + "epoch": 0.294014598540146, + "grad_norm": 1.4701075671537391, + "learning_rate": 8.2814448807916e-06, + "loss": 0.4041, + "step": 3021 + }, + { + "epoch": 0.2941119221411192, + "grad_norm": 1.5925621393078395, + "learning_rate": 8.28025549345576e-06, + "loss": 0.3062, + "step": 3022 + }, + { + "epoch": 0.29420924574209245, + "grad_norm": 1.6058911141553376, + "learning_rate": 8.279065780158914e-06, + "loss": 0.5534, + "step": 3023 + }, + { + "epoch": 0.2943065693430657, + "grad_norm": 1.4134575830281486, + "learning_rate": 8.277875741019289e-06, + "loss": 0.5017, + "step": 3024 + }, + { + "epoch": 0.2944038929440389, + "grad_norm": 1.6163740830610969, + "learning_rate": 8.276685376155133e-06, + "loss": 0.5513, + "step": 3025 + }, + { + "epoch": 0.29450121654501216, + "grad_norm": 1.3415920762045879, + "learning_rate": 8.275494685684739e-06, + "loss": 0.4209, + "step": 3026 + }, + { + "epoch": 0.2945985401459854, + "grad_norm": 1.699522776097275, + "learning_rate": 8.274303669726427e-06, + "loss": 0.2444, + "step": 3027 + }, + { + "epoch": 0.29469586374695866, + "grad_norm": 1.3118143561432465, + "learning_rate": 8.273112328398545e-06, + "loss": 0.3282, + "step": 3028 + }, + { + "epoch": 0.29479318734793186, + "grad_norm": 1.3608335365502384, + "learning_rate": 8.271920661819479e-06, + "loss": 0.4625, + "step": 3029 + }, + { + "epoch": 0.2948905109489051, + "grad_norm": 1.320965035708582, + "learning_rate": 8.270728670107645e-06, + "loss": 0.4161, + "step": 3030 + }, + { + "epoch": 0.29498783454987837, + "grad_norm": 1.2315684415049128, + "learning_rate": 8.269536353381493e-06, + "loss": 0.3264, + "step": 3031 + }, + { + "epoch": 0.29508515815085157, + "grad_norm": 1.2397754210481065, + "learning_rate": 8.268343711759505e-06, + "loss": 0.3184, + "step": 3032 + }, + { + "epoch": 0.2951824817518248, + "grad_norm": 1.4717261820272485, + "learning_rate": 8.267150745360194e-06, + "loss": 0.381, + "step": 3033 + }, + { + "epoch": 0.2952798053527981, + "grad_norm": 1.7364842416546407, + "learning_rate": 8.265957454302102e-06, + "loss": 0.3639, + "step": 3034 + }, + { + "epoch": 0.2953771289537713, + "grad_norm": 1.6249980192905973, + "learning_rate": 8.264763838703813e-06, + "loss": 0.5112, + "step": 3035 + }, + { + "epoch": 0.2954744525547445, + "grad_norm": 1.682249094263979, + "learning_rate": 8.263569898683934e-06, + "loss": 0.4894, + "step": 3036 + }, + { + "epoch": 0.2955717761557178, + "grad_norm": 1.9200248186176307, + "learning_rate": 8.262375634361108e-06, + "loss": 0.529, + "step": 3037 + }, + { + "epoch": 0.295669099756691, + "grad_norm": 1.4426650259998133, + "learning_rate": 8.261181045854011e-06, + "loss": 0.5037, + "step": 3038 + }, + { + "epoch": 0.29576642335766423, + "grad_norm": 1.6904227765149746, + "learning_rate": 8.259986133281348e-06, + "loss": 0.3632, + "step": 3039 + }, + { + "epoch": 0.2958637469586375, + "grad_norm": 1.3863799205056755, + "learning_rate": 8.25879089676186e-06, + "loss": 0.4148, + "step": 3040 + }, + { + "epoch": 0.2959610705596107, + "grad_norm": 1.627436205526306, + "learning_rate": 8.257595336414317e-06, + "loss": 0.4558, + "step": 3041 + }, + { + "epoch": 0.29605839416058394, + "grad_norm": 1.3163567598814478, + "learning_rate": 8.256399452357524e-06, + "loss": 0.2713, + "step": 3042 + }, + { + "epoch": 0.2961557177615572, + "grad_norm": 1.6072179171276018, + "learning_rate": 8.255203244710316e-06, + "loss": 0.353, + "step": 3043 + }, + { + "epoch": 0.2962530413625304, + "grad_norm": 1.4217719575203627, + "learning_rate": 8.254006713591559e-06, + "loss": 0.3744, + "step": 3044 + }, + { + "epoch": 0.29635036496350364, + "grad_norm": 1.9013012922141048, + "learning_rate": 8.252809859120154e-06, + "loss": 0.209, + "step": 3045 + }, + { + "epoch": 0.2964476885644769, + "grad_norm": 1.390657831725977, + "learning_rate": 8.251612681415035e-06, + "loss": 0.3722, + "step": 3046 + }, + { + "epoch": 0.2965450121654501, + "grad_norm": 1.4478686848472833, + "learning_rate": 8.250415180595167e-06, + "loss": 0.3869, + "step": 3047 + }, + { + "epoch": 0.29664233576642335, + "grad_norm": 1.1443911522017596, + "learning_rate": 8.249217356779544e-06, + "loss": 0.3385, + "step": 3048 + }, + { + "epoch": 0.2967396593673966, + "grad_norm": 1.7245119786652503, + "learning_rate": 8.248019210087195e-06, + "loss": 0.3023, + "step": 3049 + }, + { + "epoch": 0.29683698296836986, + "grad_norm": 1.8030337728763741, + "learning_rate": 8.24682074063718e-06, + "loss": 0.3784, + "step": 3050 + }, + { + "epoch": 0.29693430656934305, + "grad_norm": 1.299417141317702, + "learning_rate": 8.245621948548593e-06, + "loss": 0.2963, + "step": 3051 + }, + { + "epoch": 0.2970316301703163, + "grad_norm": 1.3334468356141627, + "learning_rate": 8.244422833940558e-06, + "loss": 0.3671, + "step": 3052 + }, + { + "epoch": 0.29712895377128956, + "grad_norm": 1.6168488226188178, + "learning_rate": 8.24322339693223e-06, + "loss": 0.5497, + "step": 3053 + }, + { + "epoch": 0.29722627737226276, + "grad_norm": 1.49700230831562, + "learning_rate": 8.242023637642802e-06, + "loss": 0.4567, + "step": 3054 + }, + { + "epoch": 0.297323600973236, + "grad_norm": 1.0494586888942983, + "learning_rate": 8.24082355619149e-06, + "loss": 0.2186, + "step": 3055 + }, + { + "epoch": 0.29742092457420927, + "grad_norm": 1.372792205417397, + "learning_rate": 8.239623152697553e-06, + "loss": 0.5083, + "step": 3056 + }, + { + "epoch": 0.29751824817518246, + "grad_norm": 1.266230497219453, + "learning_rate": 8.238422427280269e-06, + "loss": 0.461, + "step": 3057 + }, + { + "epoch": 0.2976155717761557, + "grad_norm": 1.5041389582539588, + "learning_rate": 8.237221380058959e-06, + "loss": 0.3813, + "step": 3058 + }, + { + "epoch": 0.29771289537712897, + "grad_norm": 1.4593593621079823, + "learning_rate": 8.23602001115297e-06, + "loss": 0.473, + "step": 3059 + }, + { + "epoch": 0.29781021897810217, + "grad_norm": 1.3666083716931219, + "learning_rate": 8.234818320681685e-06, + "loss": 0.4822, + "step": 3060 + }, + { + "epoch": 0.2979075425790754, + "grad_norm": 1.407870228183954, + "learning_rate": 8.233616308764513e-06, + "loss": 0.4012, + "step": 3061 + }, + { + "epoch": 0.2980048661800487, + "grad_norm": 1.4404350668596586, + "learning_rate": 8.232413975520903e-06, + "loss": 0.5057, + "step": 3062 + }, + { + "epoch": 0.2981021897810219, + "grad_norm": 1.3912456713229528, + "learning_rate": 8.231211321070329e-06, + "loss": 0.4578, + "step": 3063 + }, + { + "epoch": 0.29819951338199513, + "grad_norm": 1.3191795228165797, + "learning_rate": 8.2300083455323e-06, + "loss": 0.3888, + "step": 3064 + }, + { + "epoch": 0.2982968369829684, + "grad_norm": 1.4258248936492355, + "learning_rate": 8.228805049026355e-06, + "loss": 0.5108, + "step": 3065 + }, + { + "epoch": 0.2983941605839416, + "grad_norm": 1.4850835614825084, + "learning_rate": 8.22760143167207e-06, + "loss": 0.5968, + "step": 3066 + }, + { + "epoch": 0.29849148418491483, + "grad_norm": 1.2696050534436827, + "learning_rate": 8.226397493589044e-06, + "loss": 0.3328, + "step": 3067 + }, + { + "epoch": 0.2985888077858881, + "grad_norm": 1.1993181516723008, + "learning_rate": 8.225193234896918e-06, + "loss": 0.2682, + "step": 3068 + }, + { + "epoch": 0.2986861313868613, + "grad_norm": 1.3420953543565923, + "learning_rate": 8.223988655715355e-06, + "loss": 0.3865, + "step": 3069 + }, + { + "epoch": 0.29878345498783454, + "grad_norm": 1.305913976862295, + "learning_rate": 8.222783756164061e-06, + "loss": 0.3551, + "step": 3070 + }, + { + "epoch": 0.2988807785888078, + "grad_norm": 1.3385899852932626, + "learning_rate": 8.221578536362764e-06, + "loss": 0.4203, + "step": 3071 + }, + { + "epoch": 0.29897810218978105, + "grad_norm": 1.189534251886867, + "learning_rate": 8.220372996431228e-06, + "loss": 0.2937, + "step": 3072 + }, + { + "epoch": 0.29907542579075425, + "grad_norm": 1.5982329206910104, + "learning_rate": 8.219167136489245e-06, + "loss": 0.6064, + "step": 3073 + }, + { + "epoch": 0.2991727493917275, + "grad_norm": 1.775024980718492, + "learning_rate": 8.217960956656648e-06, + "loss": 0.5517, + "step": 3074 + }, + { + "epoch": 0.29927007299270075, + "grad_norm": 1.4818012612095348, + "learning_rate": 8.216754457053291e-06, + "loss": 0.3574, + "step": 3075 + }, + { + "epoch": 0.29936739659367395, + "grad_norm": 1.5621403089409462, + "learning_rate": 8.215547637799068e-06, + "loss": 0.4108, + "step": 3076 + }, + { + "epoch": 0.2994647201946472, + "grad_norm": 1.4983847186167278, + "learning_rate": 8.214340499013899e-06, + "loss": 0.4644, + "step": 3077 + }, + { + "epoch": 0.29956204379562046, + "grad_norm": 1.5897848132407382, + "learning_rate": 8.213133040817738e-06, + "loss": 0.4894, + "step": 3078 + }, + { + "epoch": 0.29965936739659366, + "grad_norm": 1.6354640621760643, + "learning_rate": 8.211925263330573e-06, + "loss": 0.4583, + "step": 3079 + }, + { + "epoch": 0.2997566909975669, + "grad_norm": 1.4952024987397354, + "learning_rate": 8.21071716667242e-06, + "loss": 0.5976, + "step": 3080 + }, + { + "epoch": 0.29985401459854016, + "grad_norm": 1.0095340308225043, + "learning_rate": 8.20950875096333e-06, + "loss": 0.2524, + "step": 3081 + }, + { + "epoch": 0.29995133819951336, + "grad_norm": 1.4197678935056404, + "learning_rate": 8.208300016323381e-06, + "loss": 0.5514, + "step": 3082 + }, + { + "epoch": 0.3000486618004866, + "grad_norm": 1.249287306745543, + "learning_rate": 8.207090962872688e-06, + "loss": 0.2683, + "step": 3083 + }, + { + "epoch": 0.30014598540145987, + "grad_norm": 1.2420194980085992, + "learning_rate": 8.205881590731394e-06, + "loss": 0.3941, + "step": 3084 + }, + { + "epoch": 0.30024330900243307, + "grad_norm": 1.0228818593574307, + "learning_rate": 8.204671900019676e-06, + "loss": 0.2158, + "step": 3085 + }, + { + "epoch": 0.3003406326034063, + "grad_norm": 1.4988207950368069, + "learning_rate": 8.203461890857743e-06, + "loss": 0.4833, + "step": 3086 + }, + { + "epoch": 0.3004379562043796, + "grad_norm": 1.3402746636459373, + "learning_rate": 8.20225156336583e-06, + "loss": 0.437, + "step": 3087 + }, + { + "epoch": 0.3005352798053528, + "grad_norm": 1.3071666622302105, + "learning_rate": 8.201040917664214e-06, + "loss": 0.3667, + "step": 3088 + }, + { + "epoch": 0.300632603406326, + "grad_norm": 2.001934665501785, + "learning_rate": 8.199829953873192e-06, + "loss": 0.346, + "step": 3089 + }, + { + "epoch": 0.3007299270072993, + "grad_norm": 1.50451394225963, + "learning_rate": 8.198618672113104e-06, + "loss": 0.4897, + "step": 3090 + }, + { + "epoch": 0.3008272506082725, + "grad_norm": 1.5127622960173581, + "learning_rate": 8.197407072504309e-06, + "loss": 0.4301, + "step": 3091 + }, + { + "epoch": 0.30092457420924573, + "grad_norm": 1.409495275402236, + "learning_rate": 8.196195155167211e-06, + "loss": 0.4954, + "step": 3092 + }, + { + "epoch": 0.301021897810219, + "grad_norm": 1.3458224438835962, + "learning_rate": 8.194982920222233e-06, + "loss": 0.5023, + "step": 3093 + }, + { + "epoch": 0.30111922141119224, + "grad_norm": 1.484836707336815, + "learning_rate": 8.19377036778984e-06, + "loss": 0.4471, + "step": 3094 + }, + { + "epoch": 0.30121654501216544, + "grad_norm": 1.4314600061658445, + "learning_rate": 8.192557497990522e-06, + "loss": 0.4519, + "step": 3095 + }, + { + "epoch": 0.3013138686131387, + "grad_norm": 1.228152077257465, + "learning_rate": 8.191344310944803e-06, + "loss": 0.2338, + "step": 3096 + }, + { + "epoch": 0.30141119221411194, + "grad_norm": 1.4025619039626473, + "learning_rate": 8.19013080677324e-06, + "loss": 0.3748, + "step": 3097 + }, + { + "epoch": 0.30150851581508514, + "grad_norm": 1.535338102251852, + "learning_rate": 8.188916985596415e-06, + "loss": 0.3129, + "step": 3098 + }, + { + "epoch": 0.3016058394160584, + "grad_norm": 1.7024230210298346, + "learning_rate": 8.187702847534952e-06, + "loss": 0.5525, + "step": 3099 + }, + { + "epoch": 0.30170316301703165, + "grad_norm": 1.4950690283515784, + "learning_rate": 8.186488392709495e-06, + "loss": 0.5258, + "step": 3100 + }, + { + "epoch": 0.30180048661800485, + "grad_norm": 1.589216178732189, + "learning_rate": 8.18527362124073e-06, + "loss": 0.5745, + "step": 3101 + }, + { + "epoch": 0.3018978102189781, + "grad_norm": 1.5942675928105552, + "learning_rate": 8.184058533249367e-06, + "loss": 0.6344, + "step": 3102 + }, + { + "epoch": 0.30199513381995136, + "grad_norm": 1.3981131065521017, + "learning_rate": 8.18284312885615e-06, + "loss": 0.3369, + "step": 3103 + }, + { + "epoch": 0.30209245742092455, + "grad_norm": 1.6180199585993311, + "learning_rate": 8.181627408181854e-06, + "loss": 0.4014, + "step": 3104 + }, + { + "epoch": 0.3021897810218978, + "grad_norm": 1.6338683004824879, + "learning_rate": 8.180411371347288e-06, + "loss": 0.4983, + "step": 3105 + }, + { + "epoch": 0.30228710462287106, + "grad_norm": 1.5225224020676915, + "learning_rate": 8.17919501847329e-06, + "loss": 0.5016, + "step": 3106 + }, + { + "epoch": 0.30238442822384426, + "grad_norm": 1.23190340238718, + "learning_rate": 8.177978349680727e-06, + "loss": 0.3644, + "step": 3107 + }, + { + "epoch": 0.3024817518248175, + "grad_norm": 1.4496645177592962, + "learning_rate": 8.176761365090503e-06, + "loss": 0.526, + "step": 3108 + }, + { + "epoch": 0.30257907542579077, + "grad_norm": 1.5209859048615393, + "learning_rate": 8.17554406482355e-06, + "loss": 0.3034, + "step": 3109 + }, + { + "epoch": 0.30267639902676396, + "grad_norm": 1.4404359772108442, + "learning_rate": 8.17432644900083e-06, + "loss": 0.4735, + "step": 3110 + }, + { + "epoch": 0.3027737226277372, + "grad_norm": 1.2693525922216498, + "learning_rate": 8.173108517743343e-06, + "loss": 0.4021, + "step": 3111 + }, + { + "epoch": 0.30287104622871047, + "grad_norm": 1.3995736051817393, + "learning_rate": 8.171890271172109e-06, + "loss": 0.3084, + "step": 3112 + }, + { + "epoch": 0.3029683698296837, + "grad_norm": 1.5690384436250255, + "learning_rate": 8.17067170940819e-06, + "loss": 0.4097, + "step": 3113 + }, + { + "epoch": 0.3030656934306569, + "grad_norm": 1.270566641334736, + "learning_rate": 8.169452832572676e-06, + "loss": 0.3813, + "step": 3114 + }, + { + "epoch": 0.3031630170316302, + "grad_norm": 1.1690990375599999, + "learning_rate": 8.168233640786682e-06, + "loss": 0.2898, + "step": 3115 + }, + { + "epoch": 0.30326034063260343, + "grad_norm": 1.5367454476066444, + "learning_rate": 8.167014134171367e-06, + "loss": 0.4167, + "step": 3116 + }, + { + "epoch": 0.30335766423357663, + "grad_norm": 1.113322849500334, + "learning_rate": 8.165794312847912e-06, + "loss": 0.3274, + "step": 3117 + }, + { + "epoch": 0.3034549878345499, + "grad_norm": 1.4711999953076527, + "learning_rate": 8.164574176937527e-06, + "loss": 0.368, + "step": 3118 + }, + { + "epoch": 0.30355231143552314, + "grad_norm": 1.4465621082621003, + "learning_rate": 8.163353726561462e-06, + "loss": 0.2719, + "step": 3119 + }, + { + "epoch": 0.30364963503649633, + "grad_norm": 1.5224694722189016, + "learning_rate": 8.162132961840994e-06, + "loss": 0.3296, + "step": 3120 + }, + { + "epoch": 0.3037469586374696, + "grad_norm": 1.3713377819635104, + "learning_rate": 8.160911882897429e-06, + "loss": 0.3064, + "step": 3121 + }, + { + "epoch": 0.30384428223844284, + "grad_norm": 1.6461819951429466, + "learning_rate": 8.159690489852108e-06, + "loss": 0.3646, + "step": 3122 + }, + { + "epoch": 0.30394160583941604, + "grad_norm": 1.4328493269552467, + "learning_rate": 8.1584687828264e-06, + "loss": 0.4363, + "step": 3123 + }, + { + "epoch": 0.3040389294403893, + "grad_norm": 1.2811535124384867, + "learning_rate": 8.157246761941708e-06, + "loss": 0.4582, + "step": 3124 + }, + { + "epoch": 0.30413625304136255, + "grad_norm": 1.2144846492785035, + "learning_rate": 8.156024427319464e-06, + "loss": 0.2413, + "step": 3125 + }, + { + "epoch": 0.30423357664233575, + "grad_norm": 1.4112474441167293, + "learning_rate": 8.154801779081135e-06, + "loss": 0.4762, + "step": 3126 + }, + { + "epoch": 0.304330900243309, + "grad_norm": 1.4641495751020401, + "learning_rate": 8.153578817348213e-06, + "loss": 0.4905, + "step": 3127 + }, + { + "epoch": 0.30442822384428225, + "grad_norm": 1.7041758523831827, + "learning_rate": 8.152355542242226e-06, + "loss": 0.5396, + "step": 3128 + }, + { + "epoch": 0.30452554744525545, + "grad_norm": 1.2349793608481423, + "learning_rate": 8.151131953884728e-06, + "loss": 0.3847, + "step": 3129 + }, + { + "epoch": 0.3046228710462287, + "grad_norm": 1.4859954822671841, + "learning_rate": 8.149908052397314e-06, + "loss": 0.5907, + "step": 3130 + }, + { + "epoch": 0.30472019464720196, + "grad_norm": 1.2693166698162108, + "learning_rate": 8.148683837901599e-06, + "loss": 0.2636, + "step": 3131 + }, + { + "epoch": 0.30481751824817516, + "grad_norm": 1.1084842598244526, + "learning_rate": 8.147459310519238e-06, + "loss": 0.3103, + "step": 3132 + }, + { + "epoch": 0.3049148418491484, + "grad_norm": 1.2151450124002459, + "learning_rate": 8.146234470371908e-06, + "loss": 0.2734, + "step": 3133 + }, + { + "epoch": 0.30501216545012166, + "grad_norm": 1.3625938628176788, + "learning_rate": 8.145009317581328e-06, + "loss": 0.3757, + "step": 3134 + }, + { + "epoch": 0.3051094890510949, + "grad_norm": 1.3367991206467007, + "learning_rate": 8.143783852269239e-06, + "loss": 0.3469, + "step": 3135 + }, + { + "epoch": 0.3052068126520681, + "grad_norm": 1.5485997458565015, + "learning_rate": 8.142558074557413e-06, + "loss": 0.6068, + "step": 3136 + }, + { + "epoch": 0.30530413625304137, + "grad_norm": 1.4327175362669387, + "learning_rate": 8.141331984567661e-06, + "loss": 0.4495, + "step": 3137 + }, + { + "epoch": 0.3054014598540146, + "grad_norm": 1.4648525390361329, + "learning_rate": 8.140105582421819e-06, + "loss": 0.4855, + "step": 3138 + }, + { + "epoch": 0.3054987834549878, + "grad_norm": 1.147269868566856, + "learning_rate": 8.138878868241755e-06, + "loss": 0.3671, + "step": 3139 + }, + { + "epoch": 0.3055961070559611, + "grad_norm": 1.4274559741070532, + "learning_rate": 8.13765184214937e-06, + "loss": 0.4246, + "step": 3140 + }, + { + "epoch": 0.30569343065693433, + "grad_norm": 1.3864373579762495, + "learning_rate": 8.13642450426659e-06, + "loss": 0.5252, + "step": 3141 + }, + { + "epoch": 0.3057907542579075, + "grad_norm": 1.3508219371380046, + "learning_rate": 8.135196854715382e-06, + "loss": 0.4022, + "step": 3142 + }, + { + "epoch": 0.3058880778588808, + "grad_norm": 1.3974501891292628, + "learning_rate": 8.133968893617734e-06, + "loss": 0.4903, + "step": 3143 + }, + { + "epoch": 0.30598540145985403, + "grad_norm": 1.379672607479744, + "learning_rate": 8.132740621095672e-06, + "loss": 0.4389, + "step": 3144 + }, + { + "epoch": 0.30608272506082723, + "grad_norm": 1.5338640282858476, + "learning_rate": 8.131512037271248e-06, + "loss": 0.5719, + "step": 3145 + }, + { + "epoch": 0.3061800486618005, + "grad_norm": 1.754315640729903, + "learning_rate": 8.130283142266549e-06, + "loss": 0.4684, + "step": 3146 + }, + { + "epoch": 0.30627737226277374, + "grad_norm": 1.3260485404955915, + "learning_rate": 8.129053936203688e-06, + "loss": 0.3967, + "step": 3147 + }, + { + "epoch": 0.30637469586374694, + "grad_norm": 1.3926987348333701, + "learning_rate": 8.127824419204818e-06, + "loss": 0.3916, + "step": 3148 + }, + { + "epoch": 0.3064720194647202, + "grad_norm": 1.4794110467900325, + "learning_rate": 8.126594591392108e-06, + "loss": 0.4127, + "step": 3149 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 1.5256531082933278, + "learning_rate": 8.125364452887775e-06, + "loss": 0.4219, + "step": 3150 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 1.2765687697220431, + "learning_rate": 8.124134003814054e-06, + "loss": 0.3482, + "step": 3151 + }, + { + "epoch": 0.3067639902676399, + "grad_norm": 1.345009147300955, + "learning_rate": 8.122903244293217e-06, + "loss": 0.2419, + "step": 3152 + }, + { + "epoch": 0.30686131386861315, + "grad_norm": 1.243655794496096, + "learning_rate": 8.121672174447566e-06, + "loss": 0.3132, + "step": 3153 + }, + { + "epoch": 0.30695863746958635, + "grad_norm": 1.4526276096090445, + "learning_rate": 8.120440794399432e-06, + "loss": 0.5369, + "step": 3154 + }, + { + "epoch": 0.3070559610705596, + "grad_norm": 1.4088986355103132, + "learning_rate": 8.119209104271177e-06, + "loss": 0.331, + "step": 3155 + }, + { + "epoch": 0.30715328467153286, + "grad_norm": 1.4915113265208189, + "learning_rate": 8.117977104185198e-06, + "loss": 0.6195, + "step": 3156 + }, + { + "epoch": 0.3072506082725061, + "grad_norm": 1.143472956889321, + "learning_rate": 8.116744794263916e-06, + "loss": 0.2632, + "step": 3157 + }, + { + "epoch": 0.3073479318734793, + "grad_norm": 1.3575606240914238, + "learning_rate": 8.11551217462979e-06, + "loss": 0.3927, + "step": 3158 + }, + { + "epoch": 0.30744525547445256, + "grad_norm": 1.2891794787417357, + "learning_rate": 8.114279245405301e-06, + "loss": 0.3766, + "step": 3159 + }, + { + "epoch": 0.3075425790754258, + "grad_norm": 1.1671813045475161, + "learning_rate": 8.113046006712973e-06, + "loss": 0.3527, + "step": 3160 + }, + { + "epoch": 0.307639902676399, + "grad_norm": 1.2346428689153102, + "learning_rate": 8.111812458675348e-06, + "loss": 0.456, + "step": 3161 + }, + { + "epoch": 0.30773722627737227, + "grad_norm": 1.543619139526521, + "learning_rate": 8.110578601415007e-06, + "loss": 0.419, + "step": 3162 + }, + { + "epoch": 0.3078345498783455, + "grad_norm": 1.5361457722305751, + "learning_rate": 8.109344435054557e-06, + "loss": 0.4477, + "step": 3163 + }, + { + "epoch": 0.3079318734793187, + "grad_norm": 1.0948111699644958, + "learning_rate": 8.108109959716641e-06, + "loss": 0.3469, + "step": 3164 + }, + { + "epoch": 0.30802919708029197, + "grad_norm": 1.4900262169803653, + "learning_rate": 8.106875175523928e-06, + "loss": 0.5066, + "step": 3165 + }, + { + "epoch": 0.3081265206812652, + "grad_norm": 1.2143378476964652, + "learning_rate": 8.105640082599118e-06, + "loss": 0.4016, + "step": 3166 + }, + { + "epoch": 0.3082238442822384, + "grad_norm": 1.227404068812886, + "learning_rate": 8.104404681064943e-06, + "loss": 0.3408, + "step": 3167 + }, + { + "epoch": 0.3083211678832117, + "grad_norm": 1.273486832675327, + "learning_rate": 8.10316897104417e-06, + "loss": 0.3819, + "step": 3168 + }, + { + "epoch": 0.30841849148418493, + "grad_norm": 1.390509439874599, + "learning_rate": 8.101932952659586e-06, + "loss": 0.5108, + "step": 3169 + }, + { + "epoch": 0.30851581508515813, + "grad_norm": 1.1910701089910116, + "learning_rate": 8.100696626034019e-06, + "loss": 0.3579, + "step": 3170 + }, + { + "epoch": 0.3086131386861314, + "grad_norm": 1.6328619260471173, + "learning_rate": 8.099459991290324e-06, + "loss": 0.666, + "step": 3171 + }, + { + "epoch": 0.30871046228710464, + "grad_norm": 1.4085180007277236, + "learning_rate": 8.09822304855138e-06, + "loss": 0.3684, + "step": 3172 + }, + { + "epoch": 0.30880778588807783, + "grad_norm": 1.5920530522626664, + "learning_rate": 8.096985797940111e-06, + "loss": 0.4499, + "step": 3173 + }, + { + "epoch": 0.3089051094890511, + "grad_norm": 1.2895313219583247, + "learning_rate": 8.09574823957946e-06, + "loss": 0.4939, + "step": 3174 + }, + { + "epoch": 0.30900243309002434, + "grad_norm": 1.5242111980147517, + "learning_rate": 8.094510373592403e-06, + "loss": 0.3223, + "step": 3175 + }, + { + "epoch": 0.30909975669099754, + "grad_norm": 1.3628460645839475, + "learning_rate": 8.093272200101946e-06, + "loss": 0.507, + "step": 3176 + }, + { + "epoch": 0.3091970802919708, + "grad_norm": 1.45716247785806, + "learning_rate": 8.092033719231134e-06, + "loss": 0.2011, + "step": 3177 + }, + { + "epoch": 0.30929440389294405, + "grad_norm": 1.0824220688323085, + "learning_rate": 8.090794931103026e-06, + "loss": 0.2127, + "step": 3178 + }, + { + "epoch": 0.3093917274939173, + "grad_norm": 1.3637285345889945, + "learning_rate": 8.089555835840728e-06, + "loss": 0.3567, + "step": 3179 + }, + { + "epoch": 0.3094890510948905, + "grad_norm": 1.352681485055594, + "learning_rate": 8.088316433567369e-06, + "loss": 0.4403, + "step": 3180 + }, + { + "epoch": 0.30958637469586375, + "grad_norm": 1.5330943463849844, + "learning_rate": 8.087076724406106e-06, + "loss": 0.3379, + "step": 3181 + }, + { + "epoch": 0.309683698296837, + "grad_norm": 1.4102464472701088, + "learning_rate": 8.08583670848013e-06, + "loss": 0.5173, + "step": 3182 + }, + { + "epoch": 0.3097810218978102, + "grad_norm": 1.3268465957799758, + "learning_rate": 8.084596385912666e-06, + "loss": 0.2684, + "step": 3183 + }, + { + "epoch": 0.30987834549878346, + "grad_norm": 1.0612856713580272, + "learning_rate": 8.083355756826962e-06, + "loss": 0.2057, + "step": 3184 + }, + { + "epoch": 0.3099756690997567, + "grad_norm": 1.2705392445129486, + "learning_rate": 8.082114821346302e-06, + "loss": 0.4234, + "step": 3185 + }, + { + "epoch": 0.3100729927007299, + "grad_norm": 1.502946661443704, + "learning_rate": 8.080873579593997e-06, + "loss": 0.5134, + "step": 3186 + }, + { + "epoch": 0.31017031630170316, + "grad_norm": 1.4788874331253061, + "learning_rate": 8.079632031693392e-06, + "loss": 0.6157, + "step": 3187 + }, + { + "epoch": 0.3102676399026764, + "grad_norm": 1.0642136610663042, + "learning_rate": 8.078390177767858e-06, + "loss": 0.2667, + "step": 3188 + }, + { + "epoch": 0.3103649635036496, + "grad_norm": 1.2694637339318475, + "learning_rate": 8.0771480179408e-06, + "loss": 0.4189, + "step": 3189 + }, + { + "epoch": 0.31046228710462287, + "grad_norm": 1.40127369638598, + "learning_rate": 8.075905552335652e-06, + "loss": 0.6007, + "step": 3190 + }, + { + "epoch": 0.3105596107055961, + "grad_norm": 1.4654952978073685, + "learning_rate": 8.07466278107588e-06, + "loss": 0.524, + "step": 3191 + }, + { + "epoch": 0.3106569343065693, + "grad_norm": 1.468706008576981, + "learning_rate": 8.073419704284977e-06, + "loss": 0.5511, + "step": 3192 + }, + { + "epoch": 0.3107542579075426, + "grad_norm": 1.3171427220237197, + "learning_rate": 8.072176322086468e-06, + "loss": 0.4903, + "step": 3193 + }, + { + "epoch": 0.31085158150851583, + "grad_norm": 1.5864389312753313, + "learning_rate": 8.07093263460391e-06, + "loss": 0.7036, + "step": 3194 + }, + { + "epoch": 0.310948905109489, + "grad_norm": 1.3892836370042843, + "learning_rate": 8.06968864196089e-06, + "loss": 0.4277, + "step": 3195 + }, + { + "epoch": 0.3110462287104623, + "grad_norm": 1.1676123850477602, + "learning_rate": 8.06844434428102e-06, + "loss": 0.2693, + "step": 3196 + }, + { + "epoch": 0.31114355231143553, + "grad_norm": 1.506360397408636, + "learning_rate": 8.067199741687951e-06, + "loss": 0.4425, + "step": 3197 + }, + { + "epoch": 0.31124087591240873, + "grad_norm": 1.9902872035508865, + "learning_rate": 8.065954834305359e-06, + "loss": 0.4464, + "step": 3198 + }, + { + "epoch": 0.311338199513382, + "grad_norm": 1.3738283887856941, + "learning_rate": 8.06470962225695e-06, + "loss": 0.2504, + "step": 3199 + }, + { + "epoch": 0.31143552311435524, + "grad_norm": 1.5173657046475728, + "learning_rate": 8.063464105666462e-06, + "loss": 0.4145, + "step": 3200 + }, + { + "epoch": 0.3115328467153285, + "grad_norm": 1.5348915862029098, + "learning_rate": 8.062218284657663e-06, + "loss": 0.4182, + "step": 3201 + }, + { + "epoch": 0.3116301703163017, + "grad_norm": 1.6415893626851852, + "learning_rate": 8.06097215935435e-06, + "loss": 0.5928, + "step": 3202 + }, + { + "epoch": 0.31172749391727494, + "grad_norm": 1.2746939048596826, + "learning_rate": 8.059725729880354e-06, + "loss": 0.2945, + "step": 3203 + }, + { + "epoch": 0.3118248175182482, + "grad_norm": 1.354856369246625, + "learning_rate": 8.05847899635953e-06, + "loss": 0.445, + "step": 3204 + }, + { + "epoch": 0.3119221411192214, + "grad_norm": 1.3006695307047205, + "learning_rate": 8.057231958915767e-06, + "loss": 0.3558, + "step": 3205 + }, + { + "epoch": 0.31201946472019465, + "grad_norm": 1.294215405183638, + "learning_rate": 8.05598461767299e-06, + "loss": 0.3764, + "step": 3206 + }, + { + "epoch": 0.3121167883211679, + "grad_norm": 1.4713454344723826, + "learning_rate": 8.054736972755138e-06, + "loss": 0.4945, + "step": 3207 + }, + { + "epoch": 0.3122141119221411, + "grad_norm": 1.1708979284493786, + "learning_rate": 8.053489024286198e-06, + "loss": 0.2419, + "step": 3208 + }, + { + "epoch": 0.31231143552311436, + "grad_norm": 1.2933153684751388, + "learning_rate": 8.052240772390176e-06, + "loss": 0.4624, + "step": 3209 + }, + { + "epoch": 0.3124087591240876, + "grad_norm": 1.2538957446837722, + "learning_rate": 8.050992217191114e-06, + "loss": 0.3305, + "step": 3210 + }, + { + "epoch": 0.3125060827250608, + "grad_norm": 1.5362890980077515, + "learning_rate": 8.049743358813078e-06, + "loss": 0.5151, + "step": 3211 + }, + { + "epoch": 0.31260340632603406, + "grad_norm": 1.3913400504692353, + "learning_rate": 8.04849419738017e-06, + "loss": 0.355, + "step": 3212 + }, + { + "epoch": 0.3127007299270073, + "grad_norm": 1.3089021233990763, + "learning_rate": 8.04724473301652e-06, + "loss": 0.262, + "step": 3213 + }, + { + "epoch": 0.3127980535279805, + "grad_norm": 1.0542807465796966, + "learning_rate": 8.045994965846288e-06, + "loss": 0.3133, + "step": 3214 + }, + { + "epoch": 0.31289537712895377, + "grad_norm": 1.4749126602435412, + "learning_rate": 8.044744895993666e-06, + "loss": 0.46, + "step": 3215 + }, + { + "epoch": 0.312992700729927, + "grad_norm": 1.3746833003058674, + "learning_rate": 8.043494523582871e-06, + "loss": 0.427, + "step": 3216 + }, + { + "epoch": 0.3130900243309002, + "grad_norm": 1.3205796823520726, + "learning_rate": 8.042243848738153e-06, + "loss": 0.3354, + "step": 3217 + }, + { + "epoch": 0.31318734793187347, + "grad_norm": 1.2865057356076828, + "learning_rate": 8.040992871583797e-06, + "loss": 0.3941, + "step": 3218 + }, + { + "epoch": 0.3132846715328467, + "grad_norm": 1.1248365389432082, + "learning_rate": 8.039741592244108e-06, + "loss": 0.2628, + "step": 3219 + }, + { + "epoch": 0.3133819951338199, + "grad_norm": 1.4652698761705358, + "learning_rate": 8.03849001084343e-06, + "loss": 0.3564, + "step": 3220 + }, + { + "epoch": 0.3134793187347932, + "grad_norm": 1.3953599364279132, + "learning_rate": 8.037238127506128e-06, + "loss": 0.4163, + "step": 3221 + }, + { + "epoch": 0.31357664233576643, + "grad_norm": 1.2419892638751415, + "learning_rate": 8.035985942356612e-06, + "loss": 0.354, + "step": 3222 + }, + { + "epoch": 0.3136739659367397, + "grad_norm": 1.715104485156596, + "learning_rate": 8.034733455519303e-06, + "loss": 0.2963, + "step": 3223 + }, + { + "epoch": 0.3137712895377129, + "grad_norm": 1.470040424076559, + "learning_rate": 8.033480667118667e-06, + "loss": 0.4648, + "step": 3224 + }, + { + "epoch": 0.31386861313868614, + "grad_norm": 1.4565317560800817, + "learning_rate": 8.032227577279191e-06, + "loss": 0.512, + "step": 3225 + }, + { + "epoch": 0.3139659367396594, + "grad_norm": 1.3742041452053566, + "learning_rate": 8.030974186125397e-06, + "loss": 0.3956, + "step": 3226 + }, + { + "epoch": 0.3140632603406326, + "grad_norm": 1.3788492769137004, + "learning_rate": 8.029720493781838e-06, + "loss": 0.4509, + "step": 3227 + }, + { + "epoch": 0.31416058394160584, + "grad_norm": 1.3858125546461868, + "learning_rate": 8.028466500373089e-06, + "loss": 0.2106, + "step": 3228 + }, + { + "epoch": 0.3142579075425791, + "grad_norm": 1.3840076863400683, + "learning_rate": 8.027212206023762e-06, + "loss": 0.3038, + "step": 3229 + }, + { + "epoch": 0.3143552311435523, + "grad_norm": 1.3152632009702614, + "learning_rate": 8.0259576108585e-06, + "loss": 0.4801, + "step": 3230 + }, + { + "epoch": 0.31445255474452555, + "grad_norm": 1.3788629368363385, + "learning_rate": 8.024702715001968e-06, + "loss": 0.4245, + "step": 3231 + }, + { + "epoch": 0.3145498783454988, + "grad_norm": 1.814337226771454, + "learning_rate": 8.023447518578868e-06, + "loss": 0.5632, + "step": 3232 + }, + { + "epoch": 0.314647201946472, + "grad_norm": 1.4232062511008752, + "learning_rate": 8.02219202171393e-06, + "loss": 0.3286, + "step": 3233 + }, + { + "epoch": 0.31474452554744525, + "grad_norm": 1.280903932308868, + "learning_rate": 8.020936224531912e-06, + "loss": 0.3626, + "step": 3234 + }, + { + "epoch": 0.3148418491484185, + "grad_norm": 1.0510702763807533, + "learning_rate": 8.019680127157607e-06, + "loss": 0.2524, + "step": 3235 + }, + { + "epoch": 0.3149391727493917, + "grad_norm": 1.7478480050109064, + "learning_rate": 8.018423729715832e-06, + "loss": 0.4348, + "step": 3236 + }, + { + "epoch": 0.31503649635036496, + "grad_norm": 1.5359033994100462, + "learning_rate": 8.017167032331434e-06, + "loss": 0.4124, + "step": 3237 + }, + { + "epoch": 0.3151338199513382, + "grad_norm": 1.506288459888311, + "learning_rate": 8.015910035129294e-06, + "loss": 0.3261, + "step": 3238 + }, + { + "epoch": 0.3152311435523114, + "grad_norm": 1.2745124165082422, + "learning_rate": 8.01465273823432e-06, + "loss": 0.464, + "step": 3239 + }, + { + "epoch": 0.31532846715328466, + "grad_norm": 1.338919807585357, + "learning_rate": 8.01339514177145e-06, + "loss": 0.4172, + "step": 3240 + }, + { + "epoch": 0.3154257907542579, + "grad_norm": 1.486780887173232, + "learning_rate": 8.012137245865654e-06, + "loss": 0.5408, + "step": 3241 + }, + { + "epoch": 0.31552311435523117, + "grad_norm": 1.3714950278620026, + "learning_rate": 8.010879050641927e-06, + "loss": 0.3436, + "step": 3242 + }, + { + "epoch": 0.31562043795620437, + "grad_norm": 1.631965892281063, + "learning_rate": 8.009620556225298e-06, + "loss": 0.4727, + "step": 3243 + }, + { + "epoch": 0.3157177615571776, + "grad_norm": 1.6149761911532763, + "learning_rate": 8.008361762740825e-06, + "loss": 0.4924, + "step": 3244 + }, + { + "epoch": 0.3158150851581509, + "grad_norm": 1.3425533377851226, + "learning_rate": 8.007102670313596e-06, + "loss": 0.3844, + "step": 3245 + }, + { + "epoch": 0.3159124087591241, + "grad_norm": 1.5650571116791179, + "learning_rate": 8.005843279068724e-06, + "loss": 0.5109, + "step": 3246 + }, + { + "epoch": 0.31600973236009733, + "grad_norm": 1.469317468630812, + "learning_rate": 8.004583589131359e-06, + "loss": 0.3981, + "step": 3247 + }, + { + "epoch": 0.3161070559610706, + "grad_norm": 1.3520956586482695, + "learning_rate": 8.003323600626675e-06, + "loss": 0.3628, + "step": 3248 + }, + { + "epoch": 0.3162043795620438, + "grad_norm": 1.306843877245721, + "learning_rate": 8.002063313679881e-06, + "loss": 0.3738, + "step": 3249 + }, + { + "epoch": 0.31630170316301703, + "grad_norm": 1.2062559137545288, + "learning_rate": 8.000802728416209e-06, + "loss": 0.3603, + "step": 3250 + }, + { + "epoch": 0.3163990267639903, + "grad_norm": 1.2270301850514787, + "learning_rate": 7.999541844960926e-06, + "loss": 0.3444, + "step": 3251 + }, + { + "epoch": 0.3164963503649635, + "grad_norm": 1.6301989651574331, + "learning_rate": 7.998280663439325e-06, + "loss": 0.5442, + "step": 3252 + }, + { + "epoch": 0.31659367396593674, + "grad_norm": 1.3633091002846736, + "learning_rate": 7.997019183976732e-06, + "loss": 0.4596, + "step": 3253 + }, + { + "epoch": 0.31669099756691, + "grad_norm": 1.0930243128300028, + "learning_rate": 7.9957574066985e-06, + "loss": 0.259, + "step": 3254 + }, + { + "epoch": 0.3167883211678832, + "grad_norm": 1.3445912322829077, + "learning_rate": 7.994495331730014e-06, + "loss": 0.438, + "step": 3255 + }, + { + "epoch": 0.31688564476885644, + "grad_norm": 1.3651374487790005, + "learning_rate": 7.993232959196687e-06, + "loss": 0.4589, + "step": 3256 + }, + { + "epoch": 0.3169829683698297, + "grad_norm": 1.3424961538987858, + "learning_rate": 7.99197028922396e-06, + "loss": 0.4367, + "step": 3257 + }, + { + "epoch": 0.3170802919708029, + "grad_norm": 1.5932344837064665, + "learning_rate": 7.990707321937308e-06, + "loss": 0.6921, + "step": 3258 + }, + { + "epoch": 0.31717761557177615, + "grad_norm": 1.475521709829228, + "learning_rate": 7.989444057462228e-06, + "loss": 0.4759, + "step": 3259 + }, + { + "epoch": 0.3172749391727494, + "grad_norm": 1.2971398875872913, + "learning_rate": 7.988180495924256e-06, + "loss": 0.4588, + "step": 3260 + }, + { + "epoch": 0.3173722627737226, + "grad_norm": 1.4756687426354647, + "learning_rate": 7.986916637448953e-06, + "loss": 0.4776, + "step": 3261 + }, + { + "epoch": 0.31746958637469586, + "grad_norm": 1.6271934377592354, + "learning_rate": 7.985652482161907e-06, + "loss": 0.4979, + "step": 3262 + }, + { + "epoch": 0.3175669099756691, + "grad_norm": 1.6597644298654206, + "learning_rate": 7.984388030188739e-06, + "loss": 0.6091, + "step": 3263 + }, + { + "epoch": 0.31766423357664236, + "grad_norm": 1.3383482658500123, + "learning_rate": 7.983123281655097e-06, + "loss": 0.4371, + "step": 3264 + }, + { + "epoch": 0.31776155717761556, + "grad_norm": 1.4591756386356707, + "learning_rate": 7.981858236686661e-06, + "loss": 0.4888, + "step": 3265 + }, + { + "epoch": 0.3178588807785888, + "grad_norm": 1.1677482677553854, + "learning_rate": 7.98059289540914e-06, + "loss": 0.348, + "step": 3266 + }, + { + "epoch": 0.31795620437956207, + "grad_norm": 1.280054884121347, + "learning_rate": 7.97932725794827e-06, + "loss": 0.3909, + "step": 3267 + }, + { + "epoch": 0.31805352798053527, + "grad_norm": 1.2818245878680554, + "learning_rate": 7.97806132442982e-06, + "loss": 0.3432, + "step": 3268 + }, + { + "epoch": 0.3181508515815085, + "grad_norm": 1.2511980990717368, + "learning_rate": 7.976795094979586e-06, + "loss": 0.398, + "step": 3269 + }, + { + "epoch": 0.3182481751824818, + "grad_norm": 1.1398243641659185, + "learning_rate": 7.975528569723391e-06, + "loss": 0.3561, + "step": 3270 + }, + { + "epoch": 0.31834549878345497, + "grad_norm": 1.4375913010503336, + "learning_rate": 7.974261748787096e-06, + "loss": 0.4341, + "step": 3271 + }, + { + "epoch": 0.3184428223844282, + "grad_norm": 1.5232808350435216, + "learning_rate": 7.972994632296583e-06, + "loss": 0.443, + "step": 3272 + }, + { + "epoch": 0.3185401459854015, + "grad_norm": 1.1586214953526035, + "learning_rate": 7.971727220377765e-06, + "loss": 0.3709, + "step": 3273 + }, + { + "epoch": 0.3186374695863747, + "grad_norm": 1.5274740880588324, + "learning_rate": 7.970459513156587e-06, + "loss": 0.3699, + "step": 3274 + }, + { + "epoch": 0.31873479318734793, + "grad_norm": 1.3981699767571285, + "learning_rate": 7.969191510759021e-06, + "loss": 0.3678, + "step": 3275 + }, + { + "epoch": 0.3188321167883212, + "grad_norm": 1.934723248663663, + "learning_rate": 7.96792321331107e-06, + "loss": 0.5528, + "step": 3276 + }, + { + "epoch": 0.3189294403892944, + "grad_norm": 1.3063523868424662, + "learning_rate": 7.966654620938765e-06, + "loss": 0.381, + "step": 3277 + }, + { + "epoch": 0.31902676399026764, + "grad_norm": 1.4316009890701873, + "learning_rate": 7.965385733768166e-06, + "loss": 0.3462, + "step": 3278 + }, + { + "epoch": 0.3191240875912409, + "grad_norm": 1.3925079285209303, + "learning_rate": 7.964116551925365e-06, + "loss": 0.3468, + "step": 3279 + }, + { + "epoch": 0.3192214111922141, + "grad_norm": 1.5929020888752208, + "learning_rate": 7.96284707553648e-06, + "loss": 0.5416, + "step": 3280 + }, + { + "epoch": 0.31931873479318734, + "grad_norm": 1.3866900373898063, + "learning_rate": 7.961577304727659e-06, + "loss": 0.3982, + "step": 3281 + }, + { + "epoch": 0.3194160583941606, + "grad_norm": 1.4962340605356037, + "learning_rate": 7.960307239625082e-06, + "loss": 0.4023, + "step": 3282 + }, + { + "epoch": 0.3195133819951338, + "grad_norm": 1.4057559523045156, + "learning_rate": 7.959036880354955e-06, + "loss": 0.495, + "step": 3283 + }, + { + "epoch": 0.31961070559610705, + "grad_norm": 1.2622065590243314, + "learning_rate": 7.957766227043514e-06, + "loss": 0.3581, + "step": 3284 + }, + { + "epoch": 0.3197080291970803, + "grad_norm": 1.5090453488845967, + "learning_rate": 7.956495279817026e-06, + "loss": 0.455, + "step": 3285 + }, + { + "epoch": 0.31980535279805355, + "grad_norm": 1.3133879337401893, + "learning_rate": 7.955224038801785e-06, + "loss": 0.4625, + "step": 3286 + }, + { + "epoch": 0.31990267639902675, + "grad_norm": 2.521541296377749, + "learning_rate": 7.953952504124114e-06, + "loss": 0.4415, + "step": 3287 + }, + { + "epoch": 0.32, + "grad_norm": 1.3567937262050411, + "learning_rate": 7.952680675910365e-06, + "loss": 0.3309, + "step": 3288 + }, + { + "epoch": 0.32009732360097326, + "grad_norm": 1.5421944908903493, + "learning_rate": 7.951408554286926e-06, + "loss": 0.4589, + "step": 3289 + }, + { + "epoch": 0.32019464720194646, + "grad_norm": 1.5998274173642424, + "learning_rate": 7.950136139380204e-06, + "loss": 0.5359, + "step": 3290 + }, + { + "epoch": 0.3202919708029197, + "grad_norm": 1.2725707304657317, + "learning_rate": 7.948863431316639e-06, + "loss": 0.3625, + "step": 3291 + }, + { + "epoch": 0.32038929440389297, + "grad_norm": 1.4290851095226622, + "learning_rate": 7.947590430222702e-06, + "loss": 0.4872, + "step": 3292 + }, + { + "epoch": 0.32048661800486616, + "grad_norm": 1.3420498316619087, + "learning_rate": 7.946317136224894e-06, + "loss": 0.2389, + "step": 3293 + }, + { + "epoch": 0.3205839416058394, + "grad_norm": 1.5002712163507215, + "learning_rate": 7.94504354944974e-06, + "loss": 0.5157, + "step": 3294 + }, + { + "epoch": 0.32068126520681267, + "grad_norm": 1.660535414536618, + "learning_rate": 7.9437696700238e-06, + "loss": 0.4267, + "step": 3295 + }, + { + "epoch": 0.32077858880778587, + "grad_norm": 1.2370297819791147, + "learning_rate": 7.942495498073657e-06, + "loss": 0.3355, + "step": 3296 + }, + { + "epoch": 0.3208759124087591, + "grad_norm": 1.1415276287275913, + "learning_rate": 7.941221033725928e-06, + "loss": 0.2944, + "step": 3297 + }, + { + "epoch": 0.3209732360097324, + "grad_norm": 1.5554788387015477, + "learning_rate": 7.939946277107258e-06, + "loss": 0.4871, + "step": 3298 + }, + { + "epoch": 0.3210705596107056, + "grad_norm": 1.2985241068062738, + "learning_rate": 7.938671228344319e-06, + "loss": 0.3143, + "step": 3299 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 1.301901578297674, + "learning_rate": 7.937395887563812e-06, + "loss": 0.3965, + "step": 3300 + }, + { + "epoch": 0.3212652068126521, + "grad_norm": 0.9529902878913864, + "learning_rate": 7.936120254892471e-06, + "loss": 0.3083, + "step": 3301 + }, + { + "epoch": 0.3213625304136253, + "grad_norm": 1.442015067028423, + "learning_rate": 7.934844330457056e-06, + "loss": 0.4318, + "step": 3302 + }, + { + "epoch": 0.32145985401459853, + "grad_norm": 0.8997835110854661, + "learning_rate": 7.933568114384358e-06, + "loss": 0.2885, + "step": 3303 + }, + { + "epoch": 0.3215571776155718, + "grad_norm": 1.5293826180265608, + "learning_rate": 7.932291606801192e-06, + "loss": 0.5437, + "step": 3304 + }, + { + "epoch": 0.321654501216545, + "grad_norm": 1.2264156375116992, + "learning_rate": 7.931014807834405e-06, + "loss": 0.4001, + "step": 3305 + }, + { + "epoch": 0.32175182481751824, + "grad_norm": 1.260350527748902, + "learning_rate": 7.929737717610878e-06, + "loss": 0.3847, + "step": 3306 + }, + { + "epoch": 0.3218491484184915, + "grad_norm": 1.1346472253273232, + "learning_rate": 7.92846033625751e-06, + "loss": 0.3766, + "step": 3307 + }, + { + "epoch": 0.32194647201946475, + "grad_norm": 1.1035142206503785, + "learning_rate": 7.927182663901241e-06, + "loss": 0.369, + "step": 3308 + }, + { + "epoch": 0.32204379562043794, + "grad_norm": 1.2980599562576733, + "learning_rate": 7.92590470066903e-06, + "loss": 0.3982, + "step": 3309 + }, + { + "epoch": 0.3221411192214112, + "grad_norm": 1.1742301163722888, + "learning_rate": 7.924626446687871e-06, + "loss": 0.3423, + "step": 3310 + }, + { + "epoch": 0.32223844282238445, + "grad_norm": 1.8345279558348917, + "learning_rate": 7.923347902084784e-06, + "loss": 0.3145, + "step": 3311 + }, + { + "epoch": 0.32233576642335765, + "grad_norm": 1.562283157560816, + "learning_rate": 7.92206906698682e-06, + "loss": 0.4308, + "step": 3312 + }, + { + "epoch": 0.3224330900243309, + "grad_norm": 1.568673716358362, + "learning_rate": 7.920789941521053e-06, + "loss": 0.7025, + "step": 3313 + }, + { + "epoch": 0.32253041362530416, + "grad_norm": 1.409203500738135, + "learning_rate": 7.9195105258146e-06, + "loss": 0.459, + "step": 3314 + }, + { + "epoch": 0.32262773722627736, + "grad_norm": 0.880538645206563, + "learning_rate": 7.918230819994589e-06, + "loss": 0.2786, + "step": 3315 + }, + { + "epoch": 0.3227250608272506, + "grad_norm": 1.1906838149715093, + "learning_rate": 7.916950824188188e-06, + "loss": 0.2686, + "step": 3316 + }, + { + "epoch": 0.32282238442822386, + "grad_norm": 1.5111358859487132, + "learning_rate": 7.91567053852259e-06, + "loss": 0.5147, + "step": 3317 + }, + { + "epoch": 0.32291970802919706, + "grad_norm": 1.1464259378074475, + "learning_rate": 7.914389963125018e-06, + "loss": 0.2685, + "step": 3318 + }, + { + "epoch": 0.3230170316301703, + "grad_norm": 1.581506486679798, + "learning_rate": 7.913109098122726e-06, + "loss": 0.5854, + "step": 3319 + }, + { + "epoch": 0.32311435523114357, + "grad_norm": 1.3242645255760028, + "learning_rate": 7.91182794364299e-06, + "loss": 0.2315, + "step": 3320 + }, + { + "epoch": 0.32321167883211677, + "grad_norm": 1.6094300344001513, + "learning_rate": 7.910546499813125e-06, + "loss": 0.4739, + "step": 3321 + }, + { + "epoch": 0.32330900243309, + "grad_norm": 1.3446882210600364, + "learning_rate": 7.909264766760462e-06, + "loss": 0.4145, + "step": 3322 + }, + { + "epoch": 0.3234063260340633, + "grad_norm": 1.282785844235514, + "learning_rate": 7.907982744612373e-06, + "loss": 0.4324, + "step": 3323 + }, + { + "epoch": 0.32350364963503647, + "grad_norm": 1.8199182173411141, + "learning_rate": 7.90670043349625e-06, + "loss": 0.2738, + "step": 3324 + }, + { + "epoch": 0.3236009732360097, + "grad_norm": 1.3779170997525898, + "learning_rate": 7.90541783353952e-06, + "loss": 0.4386, + "step": 3325 + }, + { + "epoch": 0.323698296836983, + "grad_norm": 1.4651475108226641, + "learning_rate": 7.904134944869631e-06, + "loss": 0.2272, + "step": 3326 + }, + { + "epoch": 0.3237956204379562, + "grad_norm": 1.4444649350514327, + "learning_rate": 7.902851767614069e-06, + "loss": 0.3631, + "step": 3327 + }, + { + "epoch": 0.32389294403892943, + "grad_norm": 1.6625856822026597, + "learning_rate": 7.901568301900343e-06, + "loss": 0.3649, + "step": 3328 + }, + { + "epoch": 0.3239902676399027, + "grad_norm": 1.4506728288423798, + "learning_rate": 7.900284547855992e-06, + "loss": 0.3231, + "step": 3329 + }, + { + "epoch": 0.32408759124087594, + "grad_norm": 1.5932450335999997, + "learning_rate": 7.899000505608583e-06, + "loss": 0.6145, + "step": 3330 + }, + { + "epoch": 0.32418491484184914, + "grad_norm": 1.466657731000094, + "learning_rate": 7.89771617528571e-06, + "loss": 0.4498, + "step": 3331 + }, + { + "epoch": 0.3242822384428224, + "grad_norm": 1.1444340687397683, + "learning_rate": 7.896431557015001e-06, + "loss": 0.3953, + "step": 3332 + }, + { + "epoch": 0.32437956204379564, + "grad_norm": 1.2696327521021862, + "learning_rate": 7.895146650924106e-06, + "loss": 0.3974, + "step": 3333 + }, + { + "epoch": 0.32447688564476884, + "grad_norm": 1.0672959190242737, + "learning_rate": 7.893861457140711e-06, + "loss": 0.3147, + "step": 3334 + }, + { + "epoch": 0.3245742092457421, + "grad_norm": 1.5878275615936528, + "learning_rate": 7.892575975792524e-06, + "loss": 0.5637, + "step": 3335 + }, + { + "epoch": 0.32467153284671535, + "grad_norm": 1.3553067860868797, + "learning_rate": 7.891290207007284e-06, + "loss": 0.3979, + "step": 3336 + }, + { + "epoch": 0.32476885644768855, + "grad_norm": 1.3585205023281657, + "learning_rate": 7.890004150912758e-06, + "loss": 0.5408, + "step": 3337 + }, + { + "epoch": 0.3248661800486618, + "grad_norm": 1.3335333713837063, + "learning_rate": 7.888717807636745e-06, + "loss": 0.5097, + "step": 3338 + }, + { + "epoch": 0.32496350364963505, + "grad_norm": 1.6084982696846433, + "learning_rate": 7.887431177307067e-06, + "loss": 0.6652, + "step": 3339 + }, + { + "epoch": 0.32506082725060825, + "grad_norm": 1.5254713518221517, + "learning_rate": 7.886144260051577e-06, + "loss": 0.5413, + "step": 3340 + }, + { + "epoch": 0.3251581508515815, + "grad_norm": 1.2319691547427678, + "learning_rate": 7.88485705599816e-06, + "loss": 0.3669, + "step": 3341 + }, + { + "epoch": 0.32525547445255476, + "grad_norm": 1.8060436355287317, + "learning_rate": 7.883569565274722e-06, + "loss": 0.332, + "step": 3342 + }, + { + "epoch": 0.32535279805352796, + "grad_norm": 1.2724687132520958, + "learning_rate": 7.882281788009207e-06, + "loss": 0.4156, + "step": 3343 + }, + { + "epoch": 0.3254501216545012, + "grad_norm": 1.2678653056689784, + "learning_rate": 7.880993724329578e-06, + "loss": 0.34, + "step": 3344 + }, + { + "epoch": 0.32554744525547447, + "grad_norm": 1.6551598614012555, + "learning_rate": 7.879705374363831e-06, + "loss": 0.4642, + "step": 3345 + }, + { + "epoch": 0.32564476885644766, + "grad_norm": 1.3206920191183078, + "learning_rate": 7.878416738239991e-06, + "loss": 0.3755, + "step": 3346 + }, + { + "epoch": 0.3257420924574209, + "grad_norm": 1.3858157374277495, + "learning_rate": 7.877127816086109e-06, + "loss": 0.3394, + "step": 3347 + }, + { + "epoch": 0.32583941605839417, + "grad_norm": 2.0774474222482286, + "learning_rate": 7.87583860803027e-06, + "loss": 0.4237, + "step": 3348 + }, + { + "epoch": 0.32593673965936737, + "grad_norm": 2.373250216339497, + "learning_rate": 7.87454911420058e-06, + "loss": 0.4854, + "step": 3349 + }, + { + "epoch": 0.3260340632603406, + "grad_norm": 1.2888779016735306, + "learning_rate": 7.873259334725177e-06, + "loss": 0.2953, + "step": 3350 + }, + { + "epoch": 0.3261313868613139, + "grad_norm": 1.4655511800886896, + "learning_rate": 7.87196926973223e-06, + "loss": 0.5252, + "step": 3351 + }, + { + "epoch": 0.32622871046228713, + "grad_norm": 1.3179348636560488, + "learning_rate": 7.870678919349929e-06, + "loss": 0.3587, + "step": 3352 + }, + { + "epoch": 0.32632603406326033, + "grad_norm": 1.5442264887171864, + "learning_rate": 7.869388283706501e-06, + "loss": 0.3808, + "step": 3353 + }, + { + "epoch": 0.3264233576642336, + "grad_norm": 0.909176934030135, + "learning_rate": 7.868097362930194e-06, + "loss": 0.1721, + "step": 3354 + }, + { + "epoch": 0.32652068126520684, + "grad_norm": 1.5955557971117078, + "learning_rate": 7.866806157149291e-06, + "loss": 0.5127, + "step": 3355 + }, + { + "epoch": 0.32661800486618003, + "grad_norm": 1.260336198155953, + "learning_rate": 7.865514666492096e-06, + "loss": 0.2699, + "step": 3356 + }, + { + "epoch": 0.3267153284671533, + "grad_norm": 1.5004384671553455, + "learning_rate": 7.864222891086948e-06, + "loss": 0.3168, + "step": 3357 + }, + { + "epoch": 0.32681265206812654, + "grad_norm": 1.261076205541224, + "learning_rate": 7.862930831062211e-06, + "loss": 0.3678, + "step": 3358 + }, + { + "epoch": 0.32690997566909974, + "grad_norm": 1.637405579152474, + "learning_rate": 7.861638486546279e-06, + "loss": 0.4613, + "step": 3359 + }, + { + "epoch": 0.327007299270073, + "grad_norm": 1.3584995739472485, + "learning_rate": 7.860345857667571e-06, + "loss": 0.3229, + "step": 3360 + }, + { + "epoch": 0.32710462287104625, + "grad_norm": 1.5428951197572305, + "learning_rate": 7.859052944554537e-06, + "loss": 0.4904, + "step": 3361 + }, + { + "epoch": 0.32720194647201944, + "grad_norm": 1.2674356859582525, + "learning_rate": 7.857759747335652e-06, + "loss": 0.2942, + "step": 3362 + }, + { + "epoch": 0.3272992700729927, + "grad_norm": 1.5802075836374327, + "learning_rate": 7.856466266139426e-06, + "loss": 0.2949, + "step": 3363 + }, + { + "epoch": 0.32739659367396595, + "grad_norm": 1.8209546966028776, + "learning_rate": 7.855172501094394e-06, + "loss": 0.5036, + "step": 3364 + }, + { + "epoch": 0.32749391727493915, + "grad_norm": 1.5829487718400805, + "learning_rate": 7.853878452329113e-06, + "loss": 0.3638, + "step": 3365 + }, + { + "epoch": 0.3275912408759124, + "grad_norm": 1.439883288183698, + "learning_rate": 7.852584119972178e-06, + "loss": 0.4529, + "step": 3366 + }, + { + "epoch": 0.32768856447688566, + "grad_norm": 1.188441154560626, + "learning_rate": 7.851289504152201e-06, + "loss": 0.1984, + "step": 3367 + }, + { + "epoch": 0.32778588807785886, + "grad_norm": 1.3841398947965158, + "learning_rate": 7.84999460499784e-06, + "loss": 0.3274, + "step": 3368 + }, + { + "epoch": 0.3278832116788321, + "grad_norm": 1.3784180119913427, + "learning_rate": 7.848699422637757e-06, + "loss": 0.5186, + "step": 3369 + }, + { + "epoch": 0.32798053527980536, + "grad_norm": 1.6167829071938324, + "learning_rate": 7.847403957200667e-06, + "loss": 0.5905, + "step": 3370 + }, + { + "epoch": 0.32807785888077856, + "grad_norm": 1.5035664756701093, + "learning_rate": 7.846108208815292e-06, + "loss": 0.3502, + "step": 3371 + }, + { + "epoch": 0.3281751824817518, + "grad_norm": 1.3836098651173667, + "learning_rate": 7.844812177610398e-06, + "loss": 0.426, + "step": 3372 + }, + { + "epoch": 0.32827250608272507, + "grad_norm": 1.33208856990685, + "learning_rate": 7.843515863714766e-06, + "loss": 0.38, + "step": 3373 + }, + { + "epoch": 0.3283698296836983, + "grad_norm": 1.3222208170433662, + "learning_rate": 7.842219267257216e-06, + "loss": 0.33, + "step": 3374 + }, + { + "epoch": 0.3284671532846715, + "grad_norm": 1.26421550866771, + "learning_rate": 7.84092238836659e-06, + "loss": 0.3682, + "step": 3375 + }, + { + "epoch": 0.3285644768856448, + "grad_norm": 1.3594451535417975, + "learning_rate": 7.839625227171762e-06, + "loss": 0.4504, + "step": 3376 + }, + { + "epoch": 0.328661800486618, + "grad_norm": 1.2038996790979526, + "learning_rate": 7.838327783801627e-06, + "loss": 0.3675, + "step": 3377 + }, + { + "epoch": 0.3287591240875912, + "grad_norm": 1.2523761100181583, + "learning_rate": 7.837030058385117e-06, + "loss": 0.2582, + "step": 3378 + }, + { + "epoch": 0.3288564476885645, + "grad_norm": 1.38623949822592, + "learning_rate": 7.835732051051188e-06, + "loss": 0.426, + "step": 3379 + }, + { + "epoch": 0.32895377128953773, + "grad_norm": 1.6752196039887233, + "learning_rate": 7.834433761928819e-06, + "loss": 0.5995, + "step": 3380 + }, + { + "epoch": 0.32905109489051093, + "grad_norm": 1.4769893614554368, + "learning_rate": 7.833135191147027e-06, + "loss": 0.4434, + "step": 3381 + }, + { + "epoch": 0.3291484184914842, + "grad_norm": 1.3473793961690093, + "learning_rate": 7.831836338834851e-06, + "loss": 0.4064, + "step": 3382 + }, + { + "epoch": 0.32924574209245744, + "grad_norm": 1.4564612861566626, + "learning_rate": 7.830537205121354e-06, + "loss": 0.5275, + "step": 3383 + }, + { + "epoch": 0.32934306569343064, + "grad_norm": 1.4662477809046923, + "learning_rate": 7.829237790135638e-06, + "loss": 0.3745, + "step": 3384 + }, + { + "epoch": 0.3294403892944039, + "grad_norm": 1.3883338656353856, + "learning_rate": 7.827938094006822e-06, + "loss": 0.4361, + "step": 3385 + }, + { + "epoch": 0.32953771289537714, + "grad_norm": 1.2360062745222065, + "learning_rate": 7.826638116864061e-06, + "loss": 0.2936, + "step": 3386 + }, + { + "epoch": 0.32963503649635034, + "grad_norm": 1.2636393287865908, + "learning_rate": 7.82533785883653e-06, + "loss": 0.3816, + "step": 3387 + }, + { + "epoch": 0.3297323600973236, + "grad_norm": 1.5704874728825693, + "learning_rate": 7.824037320053442e-06, + "loss": 0.4946, + "step": 3388 + }, + { + "epoch": 0.32982968369829685, + "grad_norm": 1.5450366878165769, + "learning_rate": 7.822736500644028e-06, + "loss": 0.5973, + "step": 3389 + }, + { + "epoch": 0.32992700729927005, + "grad_norm": 1.570140468606553, + "learning_rate": 7.821435400737555e-06, + "loss": 0.6187, + "step": 3390 + }, + { + "epoch": 0.3300243309002433, + "grad_norm": 1.404973531589098, + "learning_rate": 7.820134020463311e-06, + "loss": 0.4404, + "step": 3391 + }, + { + "epoch": 0.33012165450121655, + "grad_norm": 1.4221866811013593, + "learning_rate": 7.818832359950615e-06, + "loss": 0.4375, + "step": 3392 + }, + { + "epoch": 0.3302189781021898, + "grad_norm": 1.3514761483715907, + "learning_rate": 7.817530419328815e-06, + "loss": 0.4633, + "step": 3393 + }, + { + "epoch": 0.330316301703163, + "grad_norm": 1.4122938191319212, + "learning_rate": 7.816228198727287e-06, + "loss": 0.4735, + "step": 3394 + }, + { + "epoch": 0.33041362530413626, + "grad_norm": 1.2807472917541904, + "learning_rate": 7.814925698275432e-06, + "loss": 0.2993, + "step": 3395 + }, + { + "epoch": 0.3305109489051095, + "grad_norm": 1.2374164437493267, + "learning_rate": 7.813622918102679e-06, + "loss": 0.4486, + "step": 3396 + }, + { + "epoch": 0.3306082725060827, + "grad_norm": 1.4923726710921128, + "learning_rate": 7.812319858338486e-06, + "loss": 0.3976, + "step": 3397 + }, + { + "epoch": 0.33070559610705597, + "grad_norm": 1.4652422146853137, + "learning_rate": 7.811016519112342e-06, + "loss": 0.509, + "step": 3398 + }, + { + "epoch": 0.3308029197080292, + "grad_norm": 1.3523346564010856, + "learning_rate": 7.80971290055376e-06, + "loss": 0.4045, + "step": 3399 + }, + { + "epoch": 0.3309002433090024, + "grad_norm": 1.4034359644709637, + "learning_rate": 7.808409002792277e-06, + "loss": 0.5252, + "step": 3400 + }, + { + "epoch": 0.33099756690997567, + "grad_norm": 1.5977042267924388, + "learning_rate": 7.807104825957466e-06, + "loss": 0.5708, + "step": 3401 + }, + { + "epoch": 0.3310948905109489, + "grad_norm": 1.194169039851455, + "learning_rate": 7.805800370178925e-06, + "loss": 0.2592, + "step": 3402 + }, + { + "epoch": 0.3311922141119221, + "grad_norm": 1.3572077520529662, + "learning_rate": 7.804495635586274e-06, + "loss": 0.3838, + "step": 3403 + }, + { + "epoch": 0.3312895377128954, + "grad_norm": 1.6103699950857218, + "learning_rate": 7.80319062230917e-06, + "loss": 0.2847, + "step": 3404 + }, + { + "epoch": 0.33138686131386863, + "grad_norm": 1.1427751941761943, + "learning_rate": 7.80188533047729e-06, + "loss": 0.3235, + "step": 3405 + }, + { + "epoch": 0.33148418491484183, + "grad_norm": 1.4205910051862616, + "learning_rate": 7.800579760220343e-06, + "loss": 0.4415, + "step": 3406 + }, + { + "epoch": 0.3315815085158151, + "grad_norm": 1.239833112068907, + "learning_rate": 7.799273911668062e-06, + "loss": 0.296, + "step": 3407 + }, + { + "epoch": 0.33167883211678834, + "grad_norm": 1.382006652854662, + "learning_rate": 7.797967784950215e-06, + "loss": 0.5129, + "step": 3408 + }, + { + "epoch": 0.33177615571776153, + "grad_norm": 1.3910482812533478, + "learning_rate": 7.796661380196587e-06, + "loss": 0.4355, + "step": 3409 + }, + { + "epoch": 0.3318734793187348, + "grad_norm": 1.3166393673557537, + "learning_rate": 7.795354697537e-06, + "loss": 0.3357, + "step": 3410 + }, + { + "epoch": 0.33197080291970804, + "grad_norm": 1.3540344564992455, + "learning_rate": 7.794047737101298e-06, + "loss": 0.2772, + "step": 3411 + }, + { + "epoch": 0.33206812652068124, + "grad_norm": 1.5732997739305445, + "learning_rate": 7.792740499019354e-06, + "loss": 0.368, + "step": 3412 + }, + { + "epoch": 0.3321654501216545, + "grad_norm": 1.00398349093736, + "learning_rate": 7.791432983421071e-06, + "loss": 0.2794, + "step": 3413 + }, + { + "epoch": 0.33226277372262775, + "grad_norm": 1.5831231764140208, + "learning_rate": 7.790125190436378e-06, + "loss": 0.399, + "step": 3414 + }, + { + "epoch": 0.332360097323601, + "grad_norm": 1.301291609070449, + "learning_rate": 7.788817120195228e-06, + "loss": 0.4975, + "step": 3415 + }, + { + "epoch": 0.3324574209245742, + "grad_norm": 3.04667201221868, + "learning_rate": 7.787508772827606e-06, + "loss": 0.3034, + "step": 3416 + }, + { + "epoch": 0.33255474452554745, + "grad_norm": 1.3711038355442808, + "learning_rate": 7.786200148463525e-06, + "loss": 0.4023, + "step": 3417 + }, + { + "epoch": 0.3326520681265207, + "grad_norm": 1.4460757328108422, + "learning_rate": 7.784891247233025e-06, + "loss": 0.5218, + "step": 3418 + }, + { + "epoch": 0.3327493917274939, + "grad_norm": 1.5174415949182483, + "learning_rate": 7.783582069266167e-06, + "loss": 0.5401, + "step": 3419 + }, + { + "epoch": 0.33284671532846716, + "grad_norm": 1.170900270080405, + "learning_rate": 7.78227261469305e-06, + "loss": 0.3093, + "step": 3420 + }, + { + "epoch": 0.3329440389294404, + "grad_norm": 1.1117784496155982, + "learning_rate": 7.78096288364379e-06, + "loss": 0.2159, + "step": 3421 + }, + { + "epoch": 0.3330413625304136, + "grad_norm": 1.385907002729564, + "learning_rate": 7.779652876248541e-06, + "loss": 0.4513, + "step": 3422 + }, + { + "epoch": 0.33313868613138686, + "grad_norm": 1.091845134289533, + "learning_rate": 7.778342592637477e-06, + "loss": 0.249, + "step": 3423 + }, + { + "epoch": 0.3332360097323601, + "grad_norm": 1.1987125682853903, + "learning_rate": 7.7770320329408e-06, + "loss": 0.4583, + "step": 3424 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.150260309114711, + "learning_rate": 7.775721197288746e-06, + "loss": 0.4145, + "step": 3425 + }, + { + "epoch": 0.33343065693430657, + "grad_norm": 1.1244746146994131, + "learning_rate": 7.77441008581157e-06, + "loss": 0.2334, + "step": 3426 + }, + { + "epoch": 0.3335279805352798, + "grad_norm": 2.372002969337908, + "learning_rate": 7.773098698639558e-06, + "loss": 0.3346, + "step": 3427 + }, + { + "epoch": 0.333625304136253, + "grad_norm": 1.460305633169593, + "learning_rate": 7.771787035903023e-06, + "loss": 0.5202, + "step": 3428 + }, + { + "epoch": 0.3337226277372263, + "grad_norm": 1.4552706392676258, + "learning_rate": 7.77047509773231e-06, + "loss": 0.3249, + "step": 3429 + }, + { + "epoch": 0.3338199513381995, + "grad_norm": 26.423500051667432, + "learning_rate": 7.769162884257778e-06, + "loss": 0.3919, + "step": 3430 + }, + { + "epoch": 0.3339172749391727, + "grad_norm": 1.3364724766772538, + "learning_rate": 7.767850395609832e-06, + "loss": 0.4882, + "step": 3431 + }, + { + "epoch": 0.334014598540146, + "grad_norm": 1.162494089255124, + "learning_rate": 7.766537631918888e-06, + "loss": 0.4172, + "step": 3432 + }, + { + "epoch": 0.33411192214111923, + "grad_norm": 1.4584273535075323, + "learning_rate": 7.765224593315402e-06, + "loss": 0.5721, + "step": 3433 + }, + { + "epoch": 0.33420924574209243, + "grad_norm": 1.39936657097592, + "learning_rate": 7.763911279929848e-06, + "loss": 0.4454, + "step": 3434 + }, + { + "epoch": 0.3343065693430657, + "grad_norm": 1.5774498430067907, + "learning_rate": 7.76259769189273e-06, + "loss": 0.6756, + "step": 3435 + }, + { + "epoch": 0.33440389294403894, + "grad_norm": 1.4346477573335101, + "learning_rate": 7.761283829334583e-06, + "loss": 0.4939, + "step": 3436 + }, + { + "epoch": 0.3345012165450122, + "grad_norm": 1.2329772568374064, + "learning_rate": 7.759969692385963e-06, + "loss": 0.3576, + "step": 3437 + }, + { + "epoch": 0.3345985401459854, + "grad_norm": 86.06815068595351, + "learning_rate": 7.75865528117746e-06, + "loss": 0.7983, + "step": 3438 + }, + { + "epoch": 0.33469586374695864, + "grad_norm": 1.4916748444459116, + "learning_rate": 7.757340595839686e-06, + "loss": 0.3408, + "step": 3439 + }, + { + "epoch": 0.3347931873479319, + "grad_norm": 1.204864631425379, + "learning_rate": 7.756025636503281e-06, + "loss": 0.2893, + "step": 3440 + }, + { + "epoch": 0.3348905109489051, + "grad_norm": 1.1483309949418294, + "learning_rate": 7.754710403298915e-06, + "loss": 0.307, + "step": 3441 + }, + { + "epoch": 0.33498783454987835, + "grad_norm": 1.3801437746700074, + "learning_rate": 7.753394896357283e-06, + "loss": 0.5086, + "step": 3442 + }, + { + "epoch": 0.3350851581508516, + "grad_norm": 1.4670528589774587, + "learning_rate": 7.752079115809105e-06, + "loss": 0.5494, + "step": 3443 + }, + { + "epoch": 0.3351824817518248, + "grad_norm": 1.2268331435647832, + "learning_rate": 7.750763061785139e-06, + "loss": 0.3421, + "step": 3444 + }, + { + "epoch": 0.33527980535279805, + "grad_norm": 1.117498287907938, + "learning_rate": 7.749446734416153e-06, + "loss": 0.3583, + "step": 3445 + }, + { + "epoch": 0.3353771289537713, + "grad_norm": 1.6628933950216975, + "learning_rate": 7.748130133832956e-06, + "loss": 0.4265, + "step": 3446 + }, + { + "epoch": 0.3354744525547445, + "grad_norm": 1.4371941282513903, + "learning_rate": 7.746813260166379e-06, + "loss": 0.5499, + "step": 3447 + }, + { + "epoch": 0.33557177615571776, + "grad_norm": 1.4139636094726638, + "learning_rate": 7.74549611354728e-06, + "loss": 0.5113, + "step": 3448 + }, + { + "epoch": 0.335669099756691, + "grad_norm": 1.3607040572953095, + "learning_rate": 7.744178694106545e-06, + "loss": 0.3662, + "step": 3449 + }, + { + "epoch": 0.3357664233576642, + "grad_norm": 1.3497420382405303, + "learning_rate": 7.742861001975086e-06, + "loss": 0.37, + "step": 3450 + }, + { + "epoch": 0.33586374695863747, + "grad_norm": 1.1583812763163044, + "learning_rate": 7.741543037283844e-06, + "loss": 0.2328, + "step": 3451 + }, + { + "epoch": 0.3359610705596107, + "grad_norm": 1.233691835278808, + "learning_rate": 7.740224800163783e-06, + "loss": 0.3023, + "step": 3452 + }, + { + "epoch": 0.3360583941605839, + "grad_norm": 1.770041486932794, + "learning_rate": 7.738906290745902e-06, + "loss": 0.4559, + "step": 3453 + }, + { + "epoch": 0.33615571776155717, + "grad_norm": 1.6249038227669963, + "learning_rate": 7.737587509161218e-06, + "loss": 0.3305, + "step": 3454 + }, + { + "epoch": 0.3362530413625304, + "grad_norm": 1.7123859840919058, + "learning_rate": 7.73626845554078e-06, + "loss": 0.8223, + "step": 3455 + }, + { + "epoch": 0.3363503649635036, + "grad_norm": 1.465565821382168, + "learning_rate": 7.734949130015665e-06, + "loss": 0.3951, + "step": 3456 + }, + { + "epoch": 0.3364476885644769, + "grad_norm": 1.3358312642650005, + "learning_rate": 7.733629532716974e-06, + "loss": 0.3988, + "step": 3457 + }, + { + "epoch": 0.33654501216545013, + "grad_norm": 1.4614411980006665, + "learning_rate": 7.732309663775834e-06, + "loss": 0.4447, + "step": 3458 + }, + { + "epoch": 0.3366423357664234, + "grad_norm": 1.5007446348141111, + "learning_rate": 7.730989523323405e-06, + "loss": 0.5075, + "step": 3459 + }, + { + "epoch": 0.3367396593673966, + "grad_norm": 1.378374467272079, + "learning_rate": 7.72966911149087e-06, + "loss": 0.3713, + "step": 3460 + }, + { + "epoch": 0.33683698296836984, + "grad_norm": 1.264554548276351, + "learning_rate": 7.728348428409434e-06, + "loss": 0.4239, + "step": 3461 + }, + { + "epoch": 0.3369343065693431, + "grad_norm": 1.3654939025524866, + "learning_rate": 7.72702747421034e-06, + "loss": 0.3861, + "step": 3462 + }, + { + "epoch": 0.3370316301703163, + "grad_norm": 1.456076628879786, + "learning_rate": 7.72570624902485e-06, + "loss": 0.4548, + "step": 3463 + }, + { + "epoch": 0.33712895377128954, + "grad_norm": 1.5979907891076075, + "learning_rate": 7.724384752984253e-06, + "loss": 0.5395, + "step": 3464 + }, + { + "epoch": 0.3372262773722628, + "grad_norm": 1.479690145713753, + "learning_rate": 7.723062986219871e-06, + "loss": 0.4676, + "step": 3465 + }, + { + "epoch": 0.337323600973236, + "grad_norm": 1.5794398807050158, + "learning_rate": 7.721740948863044e-06, + "loss": 0.6383, + "step": 3466 + }, + { + "epoch": 0.33742092457420925, + "grad_norm": 1.5077566027780562, + "learning_rate": 7.720418641045147e-06, + "loss": 0.449, + "step": 3467 + }, + { + "epoch": 0.3375182481751825, + "grad_norm": 1.536687422230877, + "learning_rate": 7.719096062897578e-06, + "loss": 0.3885, + "step": 3468 + }, + { + "epoch": 0.3376155717761557, + "grad_norm": 1.4136867972495795, + "learning_rate": 7.717773214551762e-06, + "loss": 0.4262, + "step": 3469 + }, + { + "epoch": 0.33771289537712895, + "grad_norm": 1.7521028146405362, + "learning_rate": 7.71645009613915e-06, + "loss": 0.3553, + "step": 3470 + }, + { + "epoch": 0.3378102189781022, + "grad_norm": 1.28472614917785, + "learning_rate": 7.715126707791223e-06, + "loss": 0.5044, + "step": 3471 + }, + { + "epoch": 0.3379075425790754, + "grad_norm": 1.5285157685020219, + "learning_rate": 7.713803049639485e-06, + "loss": 0.3067, + "step": 3472 + }, + { + "epoch": 0.33800486618004866, + "grad_norm": 1.301311213128161, + "learning_rate": 7.712479121815473e-06, + "loss": 0.4036, + "step": 3473 + }, + { + "epoch": 0.3381021897810219, + "grad_norm": 1.4164505494518185, + "learning_rate": 7.711154924450741e-06, + "loss": 0.3674, + "step": 3474 + }, + { + "epoch": 0.3381995133819951, + "grad_norm": 1.2252498333021546, + "learning_rate": 7.709830457676876e-06, + "loss": 0.273, + "step": 3475 + }, + { + "epoch": 0.33829683698296836, + "grad_norm": 1.4131843411362779, + "learning_rate": 7.708505721625497e-06, + "loss": 0.565, + "step": 3476 + }, + { + "epoch": 0.3383941605839416, + "grad_norm": 1.6947767404353455, + "learning_rate": 7.707180716428237e-06, + "loss": 0.4248, + "step": 3477 + }, + { + "epoch": 0.3384914841849148, + "grad_norm": 1.2913551057356365, + "learning_rate": 7.705855442216766e-06, + "loss": 0.3537, + "step": 3478 + }, + { + "epoch": 0.33858880778588807, + "grad_norm": 1.150600446597589, + "learning_rate": 7.704529899122776e-06, + "loss": 0.3311, + "step": 3479 + }, + { + "epoch": 0.3386861313868613, + "grad_norm": 1.2021311829209522, + "learning_rate": 7.703204087277989e-06, + "loss": 0.4163, + "step": 3480 + }, + { + "epoch": 0.3387834549878346, + "grad_norm": 1.256321657329123, + "learning_rate": 7.70187800681415e-06, + "loss": 0.3609, + "step": 3481 + }, + { + "epoch": 0.3388807785888078, + "grad_norm": 1.3378074898611492, + "learning_rate": 7.70055165786303e-06, + "loss": 0.3365, + "step": 3482 + }, + { + "epoch": 0.338978102189781, + "grad_norm": 1.170985634605712, + "learning_rate": 7.699225040556435e-06, + "loss": 0.2524, + "step": 3483 + }, + { + "epoch": 0.3390754257907543, + "grad_norm": 1.3653491057947706, + "learning_rate": 7.697898155026188e-06, + "loss": 0.282, + "step": 3484 + }, + { + "epoch": 0.3391727493917275, + "grad_norm": 1.3228454670807173, + "learning_rate": 7.696571001404143e-06, + "loss": 0.4489, + "step": 3485 + }, + { + "epoch": 0.33927007299270073, + "grad_norm": 1.568456967406488, + "learning_rate": 7.695243579822179e-06, + "loss": 0.672, + "step": 3486 + }, + { + "epoch": 0.339367396593674, + "grad_norm": 1.5380107558510523, + "learning_rate": 7.693915890412205e-06, + "loss": 0.4099, + "step": 3487 + }, + { + "epoch": 0.3394647201946472, + "grad_norm": 1.522434941547624, + "learning_rate": 7.692587933306152e-06, + "loss": 0.3895, + "step": 3488 + }, + { + "epoch": 0.33956204379562044, + "grad_norm": 1.6424350744273293, + "learning_rate": 7.691259708635983e-06, + "loss": 0.4547, + "step": 3489 + }, + { + "epoch": 0.3396593673965937, + "grad_norm": 1.1285195925950828, + "learning_rate": 7.689931216533682e-06, + "loss": 0.3205, + "step": 3490 + }, + { + "epoch": 0.3397566909975669, + "grad_norm": 1.2211335178765037, + "learning_rate": 7.68860245713126e-06, + "loss": 0.2849, + "step": 3491 + }, + { + "epoch": 0.33985401459854014, + "grad_norm": 1.3440619705365895, + "learning_rate": 7.687273430560763e-06, + "loss": 0.3493, + "step": 3492 + }, + { + "epoch": 0.3399513381995134, + "grad_norm": 1.9270777796759784, + "learning_rate": 7.685944136954252e-06, + "loss": 0.3207, + "step": 3493 + }, + { + "epoch": 0.3400486618004866, + "grad_norm": 1.3294429746967642, + "learning_rate": 7.684614576443821e-06, + "loss": 0.3215, + "step": 3494 + }, + { + "epoch": 0.34014598540145985, + "grad_norm": 1.299183824061712, + "learning_rate": 7.68328474916159e-06, + "loss": 0.3565, + "step": 3495 + }, + { + "epoch": 0.3402433090024331, + "grad_norm": 1.5283573621457682, + "learning_rate": 7.681954655239703e-06, + "loss": 0.4789, + "step": 3496 + }, + { + "epoch": 0.3403406326034063, + "grad_norm": 1.4764556038728422, + "learning_rate": 7.680624294810335e-06, + "loss": 0.4079, + "step": 3497 + }, + { + "epoch": 0.34043795620437955, + "grad_norm": 1.5139159959394148, + "learning_rate": 7.679293668005683e-06, + "loss": 0.5341, + "step": 3498 + }, + { + "epoch": 0.3405352798053528, + "grad_norm": 1.5537302722123498, + "learning_rate": 7.677962774957971e-06, + "loss": 0.478, + "step": 3499 + }, + { + "epoch": 0.340632603406326, + "grad_norm": 1.5827415849591213, + "learning_rate": 7.676631615799453e-06, + "loss": 0.359, + "step": 3500 + }, + { + "epoch": 0.34072992700729926, + "grad_norm": 1.8389434879657838, + "learning_rate": 7.675300190662406e-06, + "loss": 0.3688, + "step": 3501 + }, + { + "epoch": 0.3408272506082725, + "grad_norm": 1.219949474382852, + "learning_rate": 7.673968499679134e-06, + "loss": 0.3099, + "step": 3502 + }, + { + "epoch": 0.34092457420924577, + "grad_norm": 1.5256977236182008, + "learning_rate": 7.67263654298197e-06, + "loss": 0.3838, + "step": 3503 + }, + { + "epoch": 0.34102189781021897, + "grad_norm": 1.4659409139833723, + "learning_rate": 7.671304320703269e-06, + "loss": 0.4845, + "step": 3504 + }, + { + "epoch": 0.3411192214111922, + "grad_norm": 1.667843387368496, + "learning_rate": 7.669971832975417e-06, + "loss": 0.5876, + "step": 3505 + }, + { + "epoch": 0.3412165450121655, + "grad_norm": 1.346414364244355, + "learning_rate": 7.668639079930821e-06, + "loss": 0.4337, + "step": 3506 + }, + { + "epoch": 0.34131386861313867, + "grad_norm": 2.022273962642171, + "learning_rate": 7.66730606170192e-06, + "loss": 0.472, + "step": 3507 + }, + { + "epoch": 0.3414111922141119, + "grad_norm": 1.4021904736753643, + "learning_rate": 7.665972778421175e-06, + "loss": 0.3331, + "step": 3508 + }, + { + "epoch": 0.3415085158150852, + "grad_norm": 1.3964469851310124, + "learning_rate": 7.664639230221081e-06, + "loss": 0.4151, + "step": 3509 + }, + { + "epoch": 0.3416058394160584, + "grad_norm": 1.437167087649688, + "learning_rate": 7.663305417234146e-06, + "loss": 0.3751, + "step": 3510 + }, + { + "epoch": 0.34170316301703163, + "grad_norm": 1.3813486918408102, + "learning_rate": 7.661971339592913e-06, + "loss": 0.2492, + "step": 3511 + }, + { + "epoch": 0.3418004866180049, + "grad_norm": 2.213050506553265, + "learning_rate": 7.660636997429953e-06, + "loss": 0.2442, + "step": 3512 + }, + { + "epoch": 0.3418978102189781, + "grad_norm": 1.4125007730667565, + "learning_rate": 7.659302390877858e-06, + "loss": 0.3901, + "step": 3513 + }, + { + "epoch": 0.34199513381995134, + "grad_norm": 1.5262547193449572, + "learning_rate": 7.657967520069253e-06, + "loss": 0.5142, + "step": 3514 + }, + { + "epoch": 0.3420924574209246, + "grad_norm": 1.419026298338398, + "learning_rate": 7.65663238513678e-06, + "loss": 0.4604, + "step": 3515 + }, + { + "epoch": 0.3421897810218978, + "grad_norm": 1.354197849012477, + "learning_rate": 7.655296986213114e-06, + "loss": 0.3741, + "step": 3516 + }, + { + "epoch": 0.34228710462287104, + "grad_norm": 1.3675444218814583, + "learning_rate": 7.653961323430954e-06, + "loss": 0.4636, + "step": 3517 + }, + { + "epoch": 0.3423844282238443, + "grad_norm": 1.6737094690203054, + "learning_rate": 7.652625396923027e-06, + "loss": 0.5368, + "step": 3518 + }, + { + "epoch": 0.3424817518248175, + "grad_norm": 1.5346015529843957, + "learning_rate": 7.651289206822084e-06, + "loss": 0.5808, + "step": 3519 + }, + { + "epoch": 0.34257907542579075, + "grad_norm": 1.7677803050375525, + "learning_rate": 7.649952753260901e-06, + "loss": 0.4331, + "step": 3520 + }, + { + "epoch": 0.342676399026764, + "grad_norm": 1.5130877149006923, + "learning_rate": 7.648616036372288e-06, + "loss": 0.4343, + "step": 3521 + }, + { + "epoch": 0.34277372262773725, + "grad_norm": 1.6505485143132894, + "learning_rate": 7.647279056289068e-06, + "loss": 0.4256, + "step": 3522 + }, + { + "epoch": 0.34287104622871045, + "grad_norm": 1.1213144756308453, + "learning_rate": 7.6459418131441e-06, + "loss": 0.248, + "step": 3523 + }, + { + "epoch": 0.3429683698296837, + "grad_norm": 1.4978011226878554, + "learning_rate": 7.64460430707027e-06, + "loss": 0.4457, + "step": 3524 + }, + { + "epoch": 0.34306569343065696, + "grad_norm": 1.724927585574558, + "learning_rate": 7.643266538200485e-06, + "loss": 0.5746, + "step": 3525 + }, + { + "epoch": 0.34316301703163016, + "grad_norm": 1.4218186587395187, + "learning_rate": 7.641928506667677e-06, + "loss": 0.4628, + "step": 3526 + }, + { + "epoch": 0.3432603406326034, + "grad_norm": 1.1979163238967183, + "learning_rate": 7.640590212604813e-06, + "loss": 0.3276, + "step": 3527 + }, + { + "epoch": 0.34335766423357666, + "grad_norm": 1.5459257353579257, + "learning_rate": 7.639251656144873e-06, + "loss": 0.5543, + "step": 3528 + }, + { + "epoch": 0.34345498783454986, + "grad_norm": 2.168708523840157, + "learning_rate": 7.637912837420876e-06, + "loss": 0.4451, + "step": 3529 + }, + { + "epoch": 0.3435523114355231, + "grad_norm": 1.3295572972455665, + "learning_rate": 7.63657375656586e-06, + "loss": 0.3659, + "step": 3530 + }, + { + "epoch": 0.34364963503649637, + "grad_norm": 1.3940981237720802, + "learning_rate": 7.635234413712886e-06, + "loss": 0.3305, + "step": 3531 + }, + { + "epoch": 0.34374695863746957, + "grad_norm": 1.668696796269181, + "learning_rate": 7.63389480899505e-06, + "loss": 0.262, + "step": 3532 + }, + { + "epoch": 0.3438442822384428, + "grad_norm": 1.3235762846506705, + "learning_rate": 7.632554942545468e-06, + "loss": 0.354, + "step": 3533 + }, + { + "epoch": 0.3439416058394161, + "grad_norm": 1.774595647074867, + "learning_rate": 7.631214814497283e-06, + "loss": 0.4181, + "step": 3534 + }, + { + "epoch": 0.3440389294403893, + "grad_norm": 1.3699960689070612, + "learning_rate": 7.629874424983664e-06, + "loss": 0.4893, + "step": 3535 + }, + { + "epoch": 0.3441362530413625, + "grad_norm": 1.0054517790481798, + "learning_rate": 7.628533774137809e-06, + "loss": 0.2678, + "step": 3536 + }, + { + "epoch": 0.3442335766423358, + "grad_norm": 1.2585804176689974, + "learning_rate": 7.627192862092936e-06, + "loss": 0.3145, + "step": 3537 + }, + { + "epoch": 0.344330900243309, + "grad_norm": 1.4486458769434574, + "learning_rate": 7.625851688982293e-06, + "loss": 0.5018, + "step": 3538 + }, + { + "epoch": 0.34442822384428223, + "grad_norm": 1.6761010493950546, + "learning_rate": 7.624510254939155e-06, + "loss": 0.5786, + "step": 3539 + }, + { + "epoch": 0.3445255474452555, + "grad_norm": 1.1326870552848924, + "learning_rate": 7.623168560096819e-06, + "loss": 0.2714, + "step": 3540 + }, + { + "epoch": 0.3446228710462287, + "grad_norm": 1.380514271735255, + "learning_rate": 7.62182660458861e-06, + "loss": 0.4435, + "step": 3541 + }, + { + "epoch": 0.34472019464720194, + "grad_norm": 1.3847938610044406, + "learning_rate": 7.620484388547881e-06, + "loss": 0.456, + "step": 3542 + }, + { + "epoch": 0.3448175182481752, + "grad_norm": 1.4774572497281164, + "learning_rate": 7.619141912108008e-06, + "loss": 0.4016, + "step": 3543 + }, + { + "epoch": 0.34491484184914845, + "grad_norm": 1.4155087320104913, + "learning_rate": 7.617799175402392e-06, + "loss": 0.4672, + "step": 3544 + }, + { + "epoch": 0.34501216545012164, + "grad_norm": 1.2347049030843364, + "learning_rate": 7.616456178564463e-06, + "loss": 0.4701, + "step": 3545 + }, + { + "epoch": 0.3451094890510949, + "grad_norm": 1.3617965352115597, + "learning_rate": 7.615112921727677e-06, + "loss": 0.4411, + "step": 3546 + }, + { + "epoch": 0.34520681265206815, + "grad_norm": 1.5080403368283026, + "learning_rate": 7.613769405025511e-06, + "loss": 0.446, + "step": 3547 + }, + { + "epoch": 0.34530413625304135, + "grad_norm": 1.3587460871748047, + "learning_rate": 7.612425628591473e-06, + "loss": 0.4618, + "step": 3548 + }, + { + "epoch": 0.3454014598540146, + "grad_norm": 1.5594894397765757, + "learning_rate": 7.611081592559095e-06, + "loss": 0.6454, + "step": 3549 + }, + { + "epoch": 0.34549878345498786, + "grad_norm": 1.5472776871393785, + "learning_rate": 7.609737297061934e-06, + "loss": 0.4209, + "step": 3550 + }, + { + "epoch": 0.34559610705596105, + "grad_norm": 1.1387925779252865, + "learning_rate": 7.608392742233573e-06, + "loss": 0.2542, + "step": 3551 + }, + { + "epoch": 0.3456934306569343, + "grad_norm": 2.1308610533262753, + "learning_rate": 7.6070479282076226e-06, + "loss": 0.4232, + "step": 3552 + }, + { + "epoch": 0.34579075425790756, + "grad_norm": 1.655772230226882, + "learning_rate": 7.605702855117717e-06, + "loss": 0.316, + "step": 3553 + }, + { + "epoch": 0.34588807785888076, + "grad_norm": 1.2773133253854652, + "learning_rate": 7.604357523097518e-06, + "loss": 0.3933, + "step": 3554 + }, + { + "epoch": 0.345985401459854, + "grad_norm": 1.5157476945746606, + "learning_rate": 7.6030119322807105e-06, + "loss": 0.4895, + "step": 3555 + }, + { + "epoch": 0.34608272506082727, + "grad_norm": 1.5384836415390328, + "learning_rate": 7.601666082801007e-06, + "loss": 0.4571, + "step": 3556 + }, + { + "epoch": 0.34618004866180047, + "grad_norm": 1.3494696317056387, + "learning_rate": 7.600319974792145e-06, + "loss": 0.3585, + "step": 3557 + }, + { + "epoch": 0.3462773722627737, + "grad_norm": 1.5639692646479821, + "learning_rate": 7.59897360838789e-06, + "loss": 0.4913, + "step": 3558 + }, + { + "epoch": 0.346374695863747, + "grad_norm": 1.5771201683321747, + "learning_rate": 7.59762698372203e-06, + "loss": 0.644, + "step": 3559 + }, + { + "epoch": 0.34647201946472017, + "grad_norm": 1.445755844165086, + "learning_rate": 7.596280100928379e-06, + "loss": 0.4662, + "step": 3560 + }, + { + "epoch": 0.3465693430656934, + "grad_norm": 1.0169653044814775, + "learning_rate": 7.59493296014078e-06, + "loss": 0.1873, + "step": 3561 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.3573861316393436, + "learning_rate": 7.593585561493098e-06, + "loss": 0.3621, + "step": 3562 + }, + { + "epoch": 0.3467639902676399, + "grad_norm": 1.3679634725925294, + "learning_rate": 7.592237905119224e-06, + "loss": 0.3714, + "step": 3563 + }, + { + "epoch": 0.34686131386861313, + "grad_norm": 1.6153262572349345, + "learning_rate": 7.590889991153076e-06, + "loss": 0.1934, + "step": 3564 + }, + { + "epoch": 0.3469586374695864, + "grad_norm": 1.3501706757209104, + "learning_rate": 7.589541819728597e-06, + "loss": 0.3771, + "step": 3565 + }, + { + "epoch": 0.34705596107055964, + "grad_norm": 1.4359834383485004, + "learning_rate": 7.588193390979756e-06, + "loss": 0.4021, + "step": 3566 + }, + { + "epoch": 0.34715328467153284, + "grad_norm": 1.6178918309073458, + "learning_rate": 7.5868447050405456e-06, + "loss": 0.5326, + "step": 3567 + }, + { + "epoch": 0.3472506082725061, + "grad_norm": 1.2161304982080974, + "learning_rate": 7.585495762044989e-06, + "loss": 0.3215, + "step": 3568 + }, + { + "epoch": 0.34734793187347934, + "grad_norm": 1.2514780842366786, + "learning_rate": 7.584146562127128e-06, + "loss": 0.2619, + "step": 3569 + }, + { + "epoch": 0.34744525547445254, + "grad_norm": 1.292347877079837, + "learning_rate": 7.5827971054210334e-06, + "loss": 0.3722, + "step": 3570 + }, + { + "epoch": 0.3475425790754258, + "grad_norm": 1.428500991174703, + "learning_rate": 7.581447392060806e-06, + "loss": 0.4681, + "step": 3571 + }, + { + "epoch": 0.34763990267639905, + "grad_norm": 1.6187169310973553, + "learning_rate": 7.5800974221805635e-06, + "loss": 0.3123, + "step": 3572 + }, + { + "epoch": 0.34773722627737225, + "grad_norm": 1.2828962403261588, + "learning_rate": 7.5787471959144535e-06, + "loss": 0.3426, + "step": 3573 + }, + { + "epoch": 0.3478345498783455, + "grad_norm": 1.3424794600068044, + "learning_rate": 7.577396713396649e-06, + "loss": 0.2749, + "step": 3574 + }, + { + "epoch": 0.34793187347931875, + "grad_norm": 1.311429995391481, + "learning_rate": 7.576045974761352e-06, + "loss": 0.1912, + "step": 3575 + }, + { + "epoch": 0.34802919708029195, + "grad_norm": 1.9475934853598964, + "learning_rate": 7.57469498014278e-06, + "loss": 0.3568, + "step": 3576 + }, + { + "epoch": 0.3481265206812652, + "grad_norm": 1.5712590946732445, + "learning_rate": 7.573343729675187e-06, + "loss": 0.4361, + "step": 3577 + }, + { + "epoch": 0.34822384428223846, + "grad_norm": 1.3460143771601136, + "learning_rate": 7.5719922234928435e-06, + "loss": 0.3258, + "step": 3578 + }, + { + "epoch": 0.34832116788321166, + "grad_norm": 1.3457032135851295, + "learning_rate": 7.5706404617300544e-06, + "loss": 0.2679, + "step": 3579 + }, + { + "epoch": 0.3484184914841849, + "grad_norm": 1.602637400100207, + "learning_rate": 7.569288444521141e-06, + "loss": 0.4349, + "step": 3580 + }, + { + "epoch": 0.34851581508515816, + "grad_norm": 1.193898879748489, + "learning_rate": 7.567936172000456e-06, + "loss": 0.3823, + "step": 3581 + }, + { + "epoch": 0.34861313868613136, + "grad_norm": 1.7213934943419156, + "learning_rate": 7.5665836443023764e-06, + "loss": 0.4477, + "step": 3582 + }, + { + "epoch": 0.3487104622871046, + "grad_norm": 2.0896889317289262, + "learning_rate": 7.5652308615613025e-06, + "loss": 0.548, + "step": 3583 + }, + { + "epoch": 0.34880778588807787, + "grad_norm": 1.2133584709672542, + "learning_rate": 7.563877823911661e-06, + "loss": 0.3342, + "step": 3584 + }, + { + "epoch": 0.34890510948905107, + "grad_norm": 1.3518803834565474, + "learning_rate": 7.562524531487902e-06, + "loss": 0.4428, + "step": 3585 + }, + { + "epoch": 0.3490024330900243, + "grad_norm": 1.5646028482749865, + "learning_rate": 7.561170984424509e-06, + "loss": 0.4805, + "step": 3586 + }, + { + "epoch": 0.3490997566909976, + "grad_norm": 1.725109749219042, + "learning_rate": 7.5598171828559775e-06, + "loss": 0.3953, + "step": 3587 + }, + { + "epoch": 0.34919708029197083, + "grad_norm": 1.6751385503374157, + "learning_rate": 7.558463126916842e-06, + "loss": 0.3466, + "step": 3588 + }, + { + "epoch": 0.349294403892944, + "grad_norm": 1.577501457080682, + "learning_rate": 7.557108816741651e-06, + "loss": 0.387, + "step": 3589 + }, + { + "epoch": 0.3493917274939173, + "grad_norm": 1.5319961753926956, + "learning_rate": 7.5557542524649866e-06, + "loss": 0.2916, + "step": 3590 + }, + { + "epoch": 0.34948905109489053, + "grad_norm": 2.5209551905243845, + "learning_rate": 7.554399434221449e-06, + "loss": 0.2941, + "step": 3591 + }, + { + "epoch": 0.34958637469586373, + "grad_norm": 1.1980216546975295, + "learning_rate": 7.553044362145672e-06, + "loss": 0.3867, + "step": 3592 + }, + { + "epoch": 0.349683698296837, + "grad_norm": 1.425370917650529, + "learning_rate": 7.551689036372306e-06, + "loss": 0.4788, + "step": 3593 + }, + { + "epoch": 0.34978102189781024, + "grad_norm": 1.5504040775883203, + "learning_rate": 7.550333457036032e-06, + "loss": 0.5355, + "step": 3594 + }, + { + "epoch": 0.34987834549878344, + "grad_norm": 1.253372887044746, + "learning_rate": 7.5489776242715564e-06, + "loss": 0.4783, + "step": 3595 + }, + { + "epoch": 0.3499756690997567, + "grad_norm": 1.4985868632126829, + "learning_rate": 7.547621538213607e-06, + "loss": 0.4225, + "step": 3596 + }, + { + "epoch": 0.35007299270072995, + "grad_norm": 1.4435970539146459, + "learning_rate": 7.5462651989969385e-06, + "loss": 0.3745, + "step": 3597 + }, + { + "epoch": 0.35017031630170314, + "grad_norm": 1.3981266208958172, + "learning_rate": 7.5449086067563314e-06, + "loss": 0.3456, + "step": 3598 + }, + { + "epoch": 0.3502676399026764, + "grad_norm": 1.8315759023798575, + "learning_rate": 7.543551761626594e-06, + "loss": 0.5542, + "step": 3599 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 1.0610077420382762, + "learning_rate": 7.542194663742553e-06, + "loss": 0.3009, + "step": 3600 + }, + { + "epoch": 0.35046228710462285, + "grad_norm": 1.4599599576710758, + "learning_rate": 7.5408373132390674e-06, + "loss": 0.3322, + "step": 3601 + }, + { + "epoch": 0.3505596107055961, + "grad_norm": 1.357290752856556, + "learning_rate": 7.539479710251014e-06, + "loss": 0.4219, + "step": 3602 + }, + { + "epoch": 0.35065693430656936, + "grad_norm": 1.3466674715612543, + "learning_rate": 7.538121854913303e-06, + "loss": 0.4688, + "step": 3603 + }, + { + "epoch": 0.35075425790754255, + "grad_norm": 1.9207221814789595, + "learning_rate": 7.536763747360863e-06, + "loss": 0.5269, + "step": 3604 + }, + { + "epoch": 0.3508515815085158, + "grad_norm": 1.4616085693980927, + "learning_rate": 7.535405387728649e-06, + "loss": 0.5216, + "step": 3605 + }, + { + "epoch": 0.35094890510948906, + "grad_norm": 1.638466859707607, + "learning_rate": 7.534046776151645e-06, + "loss": 0.4155, + "step": 3606 + }, + { + "epoch": 0.35104622871046226, + "grad_norm": 1.8610570015640786, + "learning_rate": 7.532687912764853e-06, + "loss": 0.4385, + "step": 3607 + }, + { + "epoch": 0.3511435523114355, + "grad_norm": 1.390595082164788, + "learning_rate": 7.531328797703308e-06, + "loss": 0.4791, + "step": 3608 + }, + { + "epoch": 0.35124087591240877, + "grad_norm": 1.4107542592778146, + "learning_rate": 7.529969431102063e-06, + "loss": 0.5517, + "step": 3609 + }, + { + "epoch": 0.351338199513382, + "grad_norm": 1.2873358057005926, + "learning_rate": 7.528609813096203e-06, + "loss": 0.2964, + "step": 3610 + }, + { + "epoch": 0.3514355231143552, + "grad_norm": 1.585587787123838, + "learning_rate": 7.527249943820831e-06, + "loss": 0.5375, + "step": 3611 + }, + { + "epoch": 0.3515328467153285, + "grad_norm": 1.307663133994867, + "learning_rate": 7.525889823411076e-06, + "loss": 0.2655, + "step": 3612 + }, + { + "epoch": 0.3516301703163017, + "grad_norm": 1.3238605926395333, + "learning_rate": 7.524529452002099e-06, + "loss": 0.3678, + "step": 3613 + }, + { + "epoch": 0.3517274939172749, + "grad_norm": 1.1603226975769314, + "learning_rate": 7.523168829729078e-06, + "loss": 0.294, + "step": 3614 + }, + { + "epoch": 0.3518248175182482, + "grad_norm": 0.9756816706305802, + "learning_rate": 7.52180795672722e-06, + "loss": 0.1748, + "step": 3615 + }, + { + "epoch": 0.35192214111922143, + "grad_norm": 1.6381757346588681, + "learning_rate": 7.520446833131756e-06, + "loss": 0.3671, + "step": 3616 + }, + { + "epoch": 0.35201946472019463, + "grad_norm": 1.6012410655431273, + "learning_rate": 7.51908545907794e-06, + "loss": 0.7845, + "step": 3617 + }, + { + "epoch": 0.3521167883211679, + "grad_norm": 1.513216444842459, + "learning_rate": 7.517723834701054e-06, + "loss": 0.5675, + "step": 3618 + }, + { + "epoch": 0.35221411192214114, + "grad_norm": 1.5876840830126697, + "learning_rate": 7.516361960136403e-06, + "loss": 0.4117, + "step": 3619 + }, + { + "epoch": 0.35231143552311434, + "grad_norm": 1.2568334714306615, + "learning_rate": 7.514999835519318e-06, + "loss": 0.3623, + "step": 3620 + }, + { + "epoch": 0.3524087591240876, + "grad_norm": 1.430201120305146, + "learning_rate": 7.513637460985153e-06, + "loss": 0.4618, + "step": 3621 + }, + { + "epoch": 0.35250608272506084, + "grad_norm": 1.6954496126059004, + "learning_rate": 7.512274836669288e-06, + "loss": 0.4018, + "step": 3622 + }, + { + "epoch": 0.35260340632603404, + "grad_norm": 1.3717482871101743, + "learning_rate": 7.510911962707128e-06, + "loss": 0.5364, + "step": 3623 + }, + { + "epoch": 0.3527007299270073, + "grad_norm": 1.3632873709904672, + "learning_rate": 7.509548839234102e-06, + "loss": 0.3786, + "step": 3624 + }, + { + "epoch": 0.35279805352798055, + "grad_norm": 1.5766225782578598, + "learning_rate": 7.508185466385667e-06, + "loss": 0.6176, + "step": 3625 + }, + { + "epoch": 0.35289537712895375, + "grad_norm": 1.6300483194455977, + "learning_rate": 7.506821844297301e-06, + "loss": 0.4239, + "step": 3626 + }, + { + "epoch": 0.352992700729927, + "grad_norm": 1.244455438425305, + "learning_rate": 7.505457973104506e-06, + "loss": 0.3627, + "step": 3627 + }, + { + "epoch": 0.35309002433090025, + "grad_norm": 1.441382927434537, + "learning_rate": 7.504093852942815e-06, + "loss": 0.3853, + "step": 3628 + }, + { + "epoch": 0.35318734793187345, + "grad_norm": 1.3700542367515711, + "learning_rate": 7.502729483947776e-06, + "loss": 0.5312, + "step": 3629 + }, + { + "epoch": 0.3532846715328467, + "grad_norm": 1.3101033627157612, + "learning_rate": 7.50136486625497e-06, + "loss": 0.3997, + "step": 3630 + }, + { + "epoch": 0.35338199513381996, + "grad_norm": 1.4700150418971674, + "learning_rate": 7.500000000000001e-06, + "loss": 0.4536, + "step": 3631 + }, + { + "epoch": 0.3534793187347932, + "grad_norm": 1.4399856969334963, + "learning_rate": 7.4986348853184944e-06, + "loss": 0.4301, + "step": 3632 + }, + { + "epoch": 0.3535766423357664, + "grad_norm": 1.5002693093456838, + "learning_rate": 7.497269522346105e-06, + "loss": 0.5339, + "step": 3633 + }, + { + "epoch": 0.35367396593673966, + "grad_norm": 1.4332148495576913, + "learning_rate": 7.4959039112185065e-06, + "loss": 0.4218, + "step": 3634 + }, + { + "epoch": 0.3537712895377129, + "grad_norm": 1.430533403803271, + "learning_rate": 7.494538052071403e-06, + "loss": 0.4658, + "step": 3635 + }, + { + "epoch": 0.3538686131386861, + "grad_norm": 1.7972458166545215, + "learning_rate": 7.4931719450405185e-06, + "loss": 0.2642, + "step": 3636 + }, + { + "epoch": 0.35396593673965937, + "grad_norm": 1.138544720049106, + "learning_rate": 7.491805590261607e-06, + "loss": 0.3429, + "step": 3637 + }, + { + "epoch": 0.3540632603406326, + "grad_norm": 1.9786831181417208, + "learning_rate": 7.490438987870443e-06, + "loss": 0.4378, + "step": 3638 + }, + { + "epoch": 0.3541605839416058, + "grad_norm": 1.136048581491554, + "learning_rate": 7.489072138002825e-06, + "loss": 0.2668, + "step": 3639 + }, + { + "epoch": 0.3542579075425791, + "grad_norm": 1.3441611259868125, + "learning_rate": 7.4877050407945796e-06, + "loss": 0.468, + "step": 3640 + }, + { + "epoch": 0.35435523114355233, + "grad_norm": 1.1560682205410908, + "learning_rate": 7.486337696381554e-06, + "loss": 0.3363, + "step": 3641 + }, + { + "epoch": 0.3544525547445255, + "grad_norm": 1.0393622403442615, + "learning_rate": 7.484970104899624e-06, + "loss": 0.2803, + "step": 3642 + }, + { + "epoch": 0.3545498783454988, + "grad_norm": 1.514160943546232, + "learning_rate": 7.483602266484686e-06, + "loss": 0.4441, + "step": 3643 + }, + { + "epoch": 0.35464720194647203, + "grad_norm": 1.580897624441714, + "learning_rate": 7.482234181272666e-06, + "loss": 0.3502, + "step": 3644 + }, + { + "epoch": 0.35474452554744523, + "grad_norm": 1.2716040020289627, + "learning_rate": 7.480865849399508e-06, + "loss": 0.3551, + "step": 3645 + }, + { + "epoch": 0.3548418491484185, + "grad_norm": 1.6397132883430414, + "learning_rate": 7.4794972710011885e-06, + "loss": 0.54, + "step": 3646 + }, + { + "epoch": 0.35493917274939174, + "grad_norm": 1.4383012777464312, + "learning_rate": 7.478128446213699e-06, + "loss": 0.3954, + "step": 3647 + }, + { + "epoch": 0.35503649635036494, + "grad_norm": 1.4020794377643642, + "learning_rate": 7.476759375173063e-06, + "loss": 0.2869, + "step": 3648 + }, + { + "epoch": 0.3551338199513382, + "grad_norm": 1.420695895411819, + "learning_rate": 7.475390058015326e-06, + "loss": 0.4162, + "step": 3649 + }, + { + "epoch": 0.35523114355231145, + "grad_norm": 1.3856407596171976, + "learning_rate": 7.474020494876556e-06, + "loss": 0.403, + "step": 3650 + }, + { + "epoch": 0.35532846715328464, + "grad_norm": 1.6355333792797258, + "learning_rate": 7.472650685892851e-06, + "loss": 0.6147, + "step": 3651 + }, + { + "epoch": 0.3554257907542579, + "grad_norm": 1.3284128213805295, + "learning_rate": 7.471280631200325e-06, + "loss": 0.2128, + "step": 3652 + }, + { + "epoch": 0.35552311435523115, + "grad_norm": 1.357840581851552, + "learning_rate": 7.469910330935126e-06, + "loss": 0.3483, + "step": 3653 + }, + { + "epoch": 0.3556204379562044, + "grad_norm": 1.2770810104938104, + "learning_rate": 7.468539785233417e-06, + "loss": 0.2812, + "step": 3654 + }, + { + "epoch": 0.3557177615571776, + "grad_norm": 1.7995385373189825, + "learning_rate": 7.467168994231394e-06, + "loss": 0.2944, + "step": 3655 + }, + { + "epoch": 0.35581508515815086, + "grad_norm": 2.0088307214551846, + "learning_rate": 7.465797958065272e-06, + "loss": 0.3204, + "step": 3656 + }, + { + "epoch": 0.3559124087591241, + "grad_norm": 1.3573976363618303, + "learning_rate": 7.46442667687129e-06, + "loss": 0.3666, + "step": 3657 + }, + { + "epoch": 0.3560097323600973, + "grad_norm": 1.4772002442457595, + "learning_rate": 7.463055150785715e-06, + "loss": 0.3756, + "step": 3658 + }, + { + "epoch": 0.35610705596107056, + "grad_norm": 1.7169887882459811, + "learning_rate": 7.461683379944835e-06, + "loss": 0.6085, + "step": 3659 + }, + { + "epoch": 0.3562043795620438, + "grad_norm": 1.753449718301559, + "learning_rate": 7.460311364484964e-06, + "loss": 0.691, + "step": 3660 + }, + { + "epoch": 0.356301703163017, + "grad_norm": 1.9142906208727815, + "learning_rate": 7.458939104542442e-06, + "loss": 0.5569, + "step": 3661 + }, + { + "epoch": 0.35639902676399027, + "grad_norm": 1.564692980111203, + "learning_rate": 7.457566600253631e-06, + "loss": 0.417, + "step": 3662 + }, + { + "epoch": 0.3564963503649635, + "grad_norm": 1.4241868392709103, + "learning_rate": 7.4561938517549136e-06, + "loss": 0.3702, + "step": 3663 + }, + { + "epoch": 0.3565936739659367, + "grad_norm": 1.0774182753758312, + "learning_rate": 7.4548208591827056e-06, + "loss": 0.325, + "step": 3664 + }, + { + "epoch": 0.35669099756691, + "grad_norm": 1.3486569434899873, + "learning_rate": 7.4534476226734384e-06, + "loss": 0.3906, + "step": 3665 + }, + { + "epoch": 0.3567883211678832, + "grad_norm": 1.2303476425395732, + "learning_rate": 7.452074142363573e-06, + "loss": 0.3073, + "step": 3666 + }, + { + "epoch": 0.3568856447688564, + "grad_norm": 1.40711240815019, + "learning_rate": 7.450700418389594e-06, + "loss": 0.386, + "step": 3667 + }, + { + "epoch": 0.3569829683698297, + "grad_norm": 1.4572695621598242, + "learning_rate": 7.449326450888007e-06, + "loss": 0.5228, + "step": 3668 + }, + { + "epoch": 0.35708029197080293, + "grad_norm": 1.4210722617044638, + "learning_rate": 7.4479522399953465e-06, + "loss": 0.4409, + "step": 3669 + }, + { + "epoch": 0.35717761557177613, + "grad_norm": 1.508447701068191, + "learning_rate": 7.446577785848166e-06, + "loss": 0.4571, + "step": 3670 + }, + { + "epoch": 0.3572749391727494, + "grad_norm": 1.3197481653540741, + "learning_rate": 7.445203088583047e-06, + "loss": 0.3886, + "step": 3671 + }, + { + "epoch": 0.35737226277372264, + "grad_norm": 1.7619697747209029, + "learning_rate": 7.443828148336594e-06, + "loss": 0.5652, + "step": 3672 + }, + { + "epoch": 0.3574695863746959, + "grad_norm": 1.2741586714286437, + "learning_rate": 7.442452965245437e-06, + "loss": 0.2068, + "step": 3673 + }, + { + "epoch": 0.3575669099756691, + "grad_norm": 1.6171437133168665, + "learning_rate": 7.4410775394462285e-06, + "loss": 0.4785, + "step": 3674 + }, + { + "epoch": 0.35766423357664234, + "grad_norm": 1.3575662879144605, + "learning_rate": 7.4397018710756415e-06, + "loss": 0.3851, + "step": 3675 + }, + { + "epoch": 0.3577615571776156, + "grad_norm": 1.5452516302727513, + "learning_rate": 7.438325960270382e-06, + "loss": 0.4154, + "step": 3676 + }, + { + "epoch": 0.3578588807785888, + "grad_norm": 1.4057841059024094, + "learning_rate": 7.436949807167172e-06, + "loss": 0.4309, + "step": 3677 + }, + { + "epoch": 0.35795620437956205, + "grad_norm": 1.3402694159440718, + "learning_rate": 7.435573411902763e-06, + "loss": 0.3905, + "step": 3678 + }, + { + "epoch": 0.3580535279805353, + "grad_norm": 1.899972993257223, + "learning_rate": 7.434196774613926e-06, + "loss": 0.324, + "step": 3679 + }, + { + "epoch": 0.3581508515815085, + "grad_norm": 1.517104694845036, + "learning_rate": 7.432819895437461e-06, + "loss": 0.5038, + "step": 3680 + }, + { + "epoch": 0.35824817518248175, + "grad_norm": 1.5026505417391594, + "learning_rate": 7.431442774510186e-06, + "loss": 0.5613, + "step": 3681 + }, + { + "epoch": 0.358345498783455, + "grad_norm": 1.6998698072507739, + "learning_rate": 7.4300654119689475e-06, + "loss": 0.4362, + "step": 3682 + }, + { + "epoch": 0.3584428223844282, + "grad_norm": 1.5499969728501817, + "learning_rate": 7.4286878079506175e-06, + "loss": 0.5288, + "step": 3683 + }, + { + "epoch": 0.35854014598540146, + "grad_norm": 1.7237862062144214, + "learning_rate": 7.4273099625920866e-06, + "loss": 0.2981, + "step": 3684 + }, + { + "epoch": 0.3586374695863747, + "grad_norm": 1.5399796130329546, + "learning_rate": 7.4259318760302725e-06, + "loss": 0.3486, + "step": 3685 + }, + { + "epoch": 0.3587347931873479, + "grad_norm": 1.734733819358604, + "learning_rate": 7.424553548402116e-06, + "loss": 0.4681, + "step": 3686 + }, + { + "epoch": 0.35883211678832116, + "grad_norm": 1.4858189837407711, + "learning_rate": 7.423174979844583e-06, + "loss": 0.4624, + "step": 3687 + }, + { + "epoch": 0.3589294403892944, + "grad_norm": 2.0317094723346556, + "learning_rate": 7.421796170494664e-06, + "loss": 0.421, + "step": 3688 + }, + { + "epoch": 0.3590267639902676, + "grad_norm": 1.6561811900690029, + "learning_rate": 7.42041712048937e-06, + "loss": 0.4593, + "step": 3689 + }, + { + "epoch": 0.35912408759124087, + "grad_norm": 1.3298486632098465, + "learning_rate": 7.41903782996574e-06, + "loss": 0.3328, + "step": 3690 + }, + { + "epoch": 0.3592214111922141, + "grad_norm": 1.6636689486346872, + "learning_rate": 7.417658299060834e-06, + "loss": 0.4999, + "step": 3691 + }, + { + "epoch": 0.3593187347931873, + "grad_norm": 1.566657143760038, + "learning_rate": 7.4162785279117354e-06, + "loss": 0.6945, + "step": 3692 + }, + { + "epoch": 0.3594160583941606, + "grad_norm": 1.1976745803136968, + "learning_rate": 7.414898516655555e-06, + "loss": 0.3368, + "step": 3693 + }, + { + "epoch": 0.35951338199513383, + "grad_norm": 1.5599198973704254, + "learning_rate": 7.413518265429427e-06, + "loss": 0.3875, + "step": 3694 + }, + { + "epoch": 0.3596107055961071, + "grad_norm": 1.45764719852823, + "learning_rate": 7.412137774370502e-06, + "loss": 0.4665, + "step": 3695 + }, + { + "epoch": 0.3597080291970803, + "grad_norm": 1.2276949665906944, + "learning_rate": 7.410757043615966e-06, + "loss": 0.285, + "step": 3696 + }, + { + "epoch": 0.35980535279805353, + "grad_norm": 1.5359072801599356, + "learning_rate": 7.40937607330302e-06, + "loss": 0.376, + "step": 3697 + }, + { + "epoch": 0.3599026763990268, + "grad_norm": 1.209800432472448, + "learning_rate": 7.4079948635688925e-06, + "loss": 0.3298, + "step": 3698 + }, + { + "epoch": 0.36, + "grad_norm": 1.4892209107876908, + "learning_rate": 7.406613414550835e-06, + "loss": 0.3474, + "step": 3699 + }, + { + "epoch": 0.36009732360097324, + "grad_norm": 1.4539822746623663, + "learning_rate": 7.405231726386124e-06, + "loss": 0.3756, + "step": 3700 + }, + { + "epoch": 0.3601946472019465, + "grad_norm": 1.4980607212426436, + "learning_rate": 7.403849799212057e-06, + "loss": 0.4841, + "step": 3701 + }, + { + "epoch": 0.3602919708029197, + "grad_norm": 1.3698188527628243, + "learning_rate": 7.40246763316596e-06, + "loss": 0.3881, + "step": 3702 + }, + { + "epoch": 0.36038929440389295, + "grad_norm": 1.3253201655674622, + "learning_rate": 7.401085228385177e-06, + "loss": 0.3933, + "step": 3703 + }, + { + "epoch": 0.3604866180048662, + "grad_norm": 1.4396239636102728, + "learning_rate": 7.399702585007077e-06, + "loss": 0.408, + "step": 3704 + }, + { + "epoch": 0.3605839416058394, + "grad_norm": 1.5453004626353954, + "learning_rate": 7.398319703169058e-06, + "loss": 0.5276, + "step": 3705 + }, + { + "epoch": 0.36068126520681265, + "grad_norm": 1.403106821872738, + "learning_rate": 7.396936583008535e-06, + "loss": 0.4362, + "step": 3706 + }, + { + "epoch": 0.3607785888077859, + "grad_norm": 1.6766339602355043, + "learning_rate": 7.395553224662952e-06, + "loss": 0.5511, + "step": 3707 + }, + { + "epoch": 0.3608759124087591, + "grad_norm": 1.6187272412677542, + "learning_rate": 7.394169628269771e-06, + "loss": 0.455, + "step": 3708 + }, + { + "epoch": 0.36097323600973236, + "grad_norm": 1.2227561836718068, + "learning_rate": 7.392785793966483e-06, + "loss": 0.3885, + "step": 3709 + }, + { + "epoch": 0.3610705596107056, + "grad_norm": 1.3724156281155415, + "learning_rate": 7.391401721890599e-06, + "loss": 0.2664, + "step": 3710 + }, + { + "epoch": 0.3611678832116788, + "grad_norm": 1.2230528654367878, + "learning_rate": 7.390017412179658e-06, + "loss": 0.3302, + "step": 3711 + }, + { + "epoch": 0.36126520681265206, + "grad_norm": 1.1928082331280525, + "learning_rate": 7.388632864971217e-06, + "loss": 0.2629, + "step": 3712 + }, + { + "epoch": 0.3613625304136253, + "grad_norm": 1.6037454753999223, + "learning_rate": 7.38724808040286e-06, + "loss": 0.3905, + "step": 3713 + }, + { + "epoch": 0.3614598540145985, + "grad_norm": 1.3621648463732532, + "learning_rate": 7.3858630586121926e-06, + "loss": 0.403, + "step": 3714 + }, + { + "epoch": 0.36155717761557177, + "grad_norm": 1.829925954185779, + "learning_rate": 7.384477799736848e-06, + "loss": 0.5973, + "step": 3715 + }, + { + "epoch": 0.361654501216545, + "grad_norm": 1.4268953334113146, + "learning_rate": 7.383092303914479e-06, + "loss": 0.4476, + "step": 3716 + }, + { + "epoch": 0.3617518248175183, + "grad_norm": 1.4218010516377573, + "learning_rate": 7.381706571282762e-06, + "loss": 0.3333, + "step": 3717 + }, + { + "epoch": 0.3618491484184915, + "grad_norm": 1.4149794161212645, + "learning_rate": 7.3803206019794004e-06, + "loss": 0.4466, + "step": 3718 + }, + { + "epoch": 0.3619464720194647, + "grad_norm": 1.3528540289292441, + "learning_rate": 7.378934396142116e-06, + "loss": 0.4781, + "step": 3719 + }, + { + "epoch": 0.362043795620438, + "grad_norm": 1.7444608763847531, + "learning_rate": 7.3775479539086595e-06, + "loss": 0.3911, + "step": 3720 + }, + { + "epoch": 0.3621411192214112, + "grad_norm": 1.3328673124502177, + "learning_rate": 7.376161275416802e-06, + "loss": 0.4312, + "step": 3721 + }, + { + "epoch": 0.36223844282238443, + "grad_norm": 1.5999365764206797, + "learning_rate": 7.374774360804337e-06, + "loss": 0.4827, + "step": 3722 + }, + { + "epoch": 0.3623357664233577, + "grad_norm": 1.3911965727851072, + "learning_rate": 7.3733872102090846e-06, + "loss": 0.3357, + "step": 3723 + }, + { + "epoch": 0.3624330900243309, + "grad_norm": 2.108972016446877, + "learning_rate": 7.371999823768885e-06, + "loss": 0.534, + "step": 3724 + }, + { + "epoch": 0.36253041362530414, + "grad_norm": 1.3184107339533155, + "learning_rate": 7.370612201621606e-06, + "loss": 0.3319, + "step": 3725 + }, + { + "epoch": 0.3626277372262774, + "grad_norm": 1.5480195252175315, + "learning_rate": 7.369224343905135e-06, + "loss": 0.4569, + "step": 3726 + }, + { + "epoch": 0.3627250608272506, + "grad_norm": 1.5137208598424667, + "learning_rate": 7.3678362507573855e-06, + "loss": 0.4966, + "step": 3727 + }, + { + "epoch": 0.36282238442822384, + "grad_norm": 1.9044576254266983, + "learning_rate": 7.366447922316292e-06, + "loss": 0.3348, + "step": 3728 + }, + { + "epoch": 0.3629197080291971, + "grad_norm": 1.5245853892303196, + "learning_rate": 7.365059358719814e-06, + "loss": 0.4253, + "step": 3729 + }, + { + "epoch": 0.3630170316301703, + "grad_norm": 1.122262909962616, + "learning_rate": 7.3636705601059344e-06, + "loss": 0.3222, + "step": 3730 + }, + { + "epoch": 0.36311435523114355, + "grad_norm": 1.8386225119764608, + "learning_rate": 7.362281526612657e-06, + "loss": 0.2037, + "step": 3731 + }, + { + "epoch": 0.3632116788321168, + "grad_norm": 1.4527104104207418, + "learning_rate": 7.360892258378014e-06, + "loss": 0.4542, + "step": 3732 + }, + { + "epoch": 0.36330900243309, + "grad_norm": 1.2351545455698876, + "learning_rate": 7.359502755540054e-06, + "loss": 0.3215, + "step": 3733 + }, + { + "epoch": 0.36340632603406325, + "grad_norm": 1.2618923481187536, + "learning_rate": 7.358113018236856e-06, + "loss": 0.3527, + "step": 3734 + }, + { + "epoch": 0.3635036496350365, + "grad_norm": 1.659524308148738, + "learning_rate": 7.356723046606519e-06, + "loss": 0.4318, + "step": 3735 + }, + { + "epoch": 0.3636009732360097, + "grad_norm": 1.565399683654683, + "learning_rate": 7.355332840787164e-06, + "loss": 0.5299, + "step": 3736 + }, + { + "epoch": 0.36369829683698296, + "grad_norm": 1.6178264737718506, + "learning_rate": 7.353942400916936e-06, + "loss": 0.397, + "step": 3737 + }, + { + "epoch": 0.3637956204379562, + "grad_norm": 1.6534303379570627, + "learning_rate": 7.352551727134005e-06, + "loss": 0.6081, + "step": 3738 + }, + { + "epoch": 0.36389294403892947, + "grad_norm": 1.247872353830009, + "learning_rate": 7.351160819576564e-06, + "loss": 0.3425, + "step": 3739 + }, + { + "epoch": 0.36399026763990266, + "grad_norm": 1.5020180478592695, + "learning_rate": 7.349769678382826e-06, + "loss": 0.5016, + "step": 3740 + }, + { + "epoch": 0.3640875912408759, + "grad_norm": 1.5279869032792845, + "learning_rate": 7.34837830369103e-06, + "loss": 0.3526, + "step": 3741 + }, + { + "epoch": 0.36418491484184917, + "grad_norm": 1.3094265238017966, + "learning_rate": 7.346986695639439e-06, + "loss": 0.3748, + "step": 3742 + }, + { + "epoch": 0.36428223844282237, + "grad_norm": 1.4537907377719694, + "learning_rate": 7.34559485436634e-06, + "loss": 0.1896, + "step": 3743 + }, + { + "epoch": 0.3643795620437956, + "grad_norm": 1.329137618047409, + "learning_rate": 7.344202780010036e-06, + "loss": 0.3121, + "step": 3744 + }, + { + "epoch": 0.3644768856447689, + "grad_norm": 1.6436280150893918, + "learning_rate": 7.342810472708861e-06, + "loss": 0.4822, + "step": 3745 + }, + { + "epoch": 0.3645742092457421, + "grad_norm": 1.3725794380420981, + "learning_rate": 7.341417932601169e-06, + "loss": 0.3409, + "step": 3746 + }, + { + "epoch": 0.36467153284671533, + "grad_norm": 1.5052359115379705, + "learning_rate": 7.34002515982534e-06, + "loss": 0.2377, + "step": 3747 + }, + { + "epoch": 0.3647688564476886, + "grad_norm": 1.3552921410971608, + "learning_rate": 7.3386321545197715e-06, + "loss": 0.3572, + "step": 3748 + }, + { + "epoch": 0.3648661800486618, + "grad_norm": 1.5018874056046176, + "learning_rate": 7.337238916822888e-06, + "loss": 0.4986, + "step": 3749 + }, + { + "epoch": 0.36496350364963503, + "grad_norm": 1.8358104905369528, + "learning_rate": 7.335845446873137e-06, + "loss": 0.3914, + "step": 3750 + }, + { + "epoch": 0.3650608272506083, + "grad_norm": 1.1168351051108918, + "learning_rate": 7.334451744808988e-06, + "loss": 0.2733, + "step": 3751 + }, + { + "epoch": 0.3651581508515815, + "grad_norm": 1.3776079933771972, + "learning_rate": 7.333057810768934e-06, + "loss": 0.3787, + "step": 3752 + }, + { + "epoch": 0.36525547445255474, + "grad_norm": 1.341701987297211, + "learning_rate": 7.331663644891492e-06, + "loss": 0.3567, + "step": 3753 + }, + { + "epoch": 0.365352798053528, + "grad_norm": 1.5110248875717456, + "learning_rate": 7.3302692473152e-06, + "loss": 0.345, + "step": 3754 + }, + { + "epoch": 0.3654501216545012, + "grad_norm": 1.3479707654958055, + "learning_rate": 7.328874618178621e-06, + "loss": 0.3874, + "step": 3755 + }, + { + "epoch": 0.36554744525547445, + "grad_norm": 1.4702256277367285, + "learning_rate": 7.32747975762034e-06, + "loss": 0.2701, + "step": 3756 + }, + { + "epoch": 0.3656447688564477, + "grad_norm": 1.2848329456259648, + "learning_rate": 7.326084665778965e-06, + "loss": 0.428, + "step": 3757 + }, + { + "epoch": 0.3657420924574209, + "grad_norm": 1.3129833784540297, + "learning_rate": 7.324689342793125e-06, + "loss": 0.3708, + "step": 3758 + }, + { + "epoch": 0.36583941605839415, + "grad_norm": 1.585613649874231, + "learning_rate": 7.323293788801478e-06, + "loss": 0.5306, + "step": 3759 + }, + { + "epoch": 0.3659367396593674, + "grad_norm": 1.829241550219066, + "learning_rate": 7.3218980039427e-06, + "loss": 0.4737, + "step": 3760 + }, + { + "epoch": 0.36603406326034066, + "grad_norm": 1.395899680918888, + "learning_rate": 7.320501988355488e-06, + "loss": 0.4636, + "step": 3761 + }, + { + "epoch": 0.36613138686131386, + "grad_norm": 1.5046483178350247, + "learning_rate": 7.319105742178568e-06, + "loss": 0.4155, + "step": 3762 + }, + { + "epoch": 0.3662287104622871, + "grad_norm": 1.280400622675231, + "learning_rate": 7.317709265550685e-06, + "loss": 0.3885, + "step": 3763 + }, + { + "epoch": 0.36632603406326036, + "grad_norm": 1.217266157593417, + "learning_rate": 7.316312558610608e-06, + "loss": 0.3962, + "step": 3764 + }, + { + "epoch": 0.36642335766423356, + "grad_norm": 1.263633008120406, + "learning_rate": 7.314915621497129e-06, + "loss": 0.478, + "step": 3765 + }, + { + "epoch": 0.3665206812652068, + "grad_norm": 1.3563025380870863, + "learning_rate": 7.31351845434906e-06, + "loss": 0.4117, + "step": 3766 + }, + { + "epoch": 0.36661800486618007, + "grad_norm": 1.2737220025097429, + "learning_rate": 7.312121057305241e-06, + "loss": 0.424, + "step": 3767 + }, + { + "epoch": 0.36671532846715327, + "grad_norm": 1.291092078272358, + "learning_rate": 7.3107234305045324e-06, + "loss": 0.334, + "step": 3768 + }, + { + "epoch": 0.3668126520681265, + "grad_norm": 1.3126844548992553, + "learning_rate": 7.309325574085815e-06, + "loss": 0.4322, + "step": 3769 + }, + { + "epoch": 0.3669099756690998, + "grad_norm": 1.1245662064795194, + "learning_rate": 7.307927488187997e-06, + "loss": 0.2236, + "step": 3770 + }, + { + "epoch": 0.367007299270073, + "grad_norm": 1.5102285684556136, + "learning_rate": 7.306529172950006e-06, + "loss": 0.5623, + "step": 3771 + }, + { + "epoch": 0.3671046228710462, + "grad_norm": 1.685763702060817, + "learning_rate": 7.305130628510792e-06, + "loss": 0.6844, + "step": 3772 + }, + { + "epoch": 0.3672019464720195, + "grad_norm": 1.2974033428491907, + "learning_rate": 7.30373185500933e-06, + "loss": 0.423, + "step": 3773 + }, + { + "epoch": 0.3672992700729927, + "grad_norm": 1.5850183990084508, + "learning_rate": 7.302332852584619e-06, + "loss": 0.5261, + "step": 3774 + }, + { + "epoch": 0.36739659367396593, + "grad_norm": 1.2629993659748047, + "learning_rate": 7.3009336213756775e-06, + "loss": 0.4166, + "step": 3775 + }, + { + "epoch": 0.3674939172749392, + "grad_norm": 1.5305199731858852, + "learning_rate": 7.299534161521548e-06, + "loss": 0.3868, + "step": 3776 + }, + { + "epoch": 0.3675912408759124, + "grad_norm": 1.3498481506141689, + "learning_rate": 7.298134473161293e-06, + "loss": 0.4383, + "step": 3777 + }, + { + "epoch": 0.36768856447688564, + "grad_norm": 1.1775129878791764, + "learning_rate": 7.296734556434006e-06, + "loss": 0.3227, + "step": 3778 + }, + { + "epoch": 0.3677858880778589, + "grad_norm": 1.2659702536536892, + "learning_rate": 7.295334411478793e-06, + "loss": 0.3261, + "step": 3779 + }, + { + "epoch": 0.3678832116788321, + "grad_norm": 1.29047857012686, + "learning_rate": 7.293934038434789e-06, + "loss": 0.3675, + "step": 3780 + }, + { + "epoch": 0.36798053527980534, + "grad_norm": 1.4852568566742426, + "learning_rate": 7.292533437441149e-06, + "loss": 0.5919, + "step": 3781 + }, + { + "epoch": 0.3680778588807786, + "grad_norm": 1.6696818575227366, + "learning_rate": 7.291132608637053e-06, + "loss": 0.574, + "step": 3782 + }, + { + "epoch": 0.36817518248175185, + "grad_norm": 1.1373482236107322, + "learning_rate": 7.289731552161701e-06, + "loss": 0.2637, + "step": 3783 + }, + { + "epoch": 0.36827250608272505, + "grad_norm": 1.3365823182266015, + "learning_rate": 7.288330268154318e-06, + "loss": 0.3472, + "step": 3784 + }, + { + "epoch": 0.3683698296836983, + "grad_norm": 1.372948416466753, + "learning_rate": 7.286928756754148e-06, + "loss": 0.328, + "step": 3785 + }, + { + "epoch": 0.36846715328467156, + "grad_norm": 1.4321229130137978, + "learning_rate": 7.285527018100464e-06, + "loss": 0.4256, + "step": 3786 + }, + { + "epoch": 0.36856447688564475, + "grad_norm": 1.5800422881297964, + "learning_rate": 7.284125052332554e-06, + "loss": 0.6671, + "step": 3787 + }, + { + "epoch": 0.368661800486618, + "grad_norm": 1.2921074248553535, + "learning_rate": 7.282722859589734e-06, + "loss": 0.2557, + "step": 3788 + }, + { + "epoch": 0.36875912408759126, + "grad_norm": 1.5092560968103714, + "learning_rate": 7.281320440011339e-06, + "loss": 0.483, + "step": 3789 + }, + { + "epoch": 0.36885644768856446, + "grad_norm": 1.6123103806541292, + "learning_rate": 7.279917793736732e-06, + "loss": 0.6551, + "step": 3790 + }, + { + "epoch": 0.3689537712895377, + "grad_norm": 1.4808246114399204, + "learning_rate": 7.278514920905291e-06, + "loss": 0.2513, + "step": 3791 + }, + { + "epoch": 0.36905109489051097, + "grad_norm": 1.5663493234083814, + "learning_rate": 7.277111821656423e-06, + "loss": 0.571, + "step": 3792 + }, + { + "epoch": 0.36914841849148416, + "grad_norm": 1.363238577188513, + "learning_rate": 7.275708496129552e-06, + "loss": 0.4348, + "step": 3793 + }, + { + "epoch": 0.3692457420924574, + "grad_norm": 1.4565243900455658, + "learning_rate": 7.27430494446413e-06, + "loss": 0.4819, + "step": 3794 + }, + { + "epoch": 0.36934306569343067, + "grad_norm": 1.384454947890668, + "learning_rate": 7.272901166799628e-06, + "loss": 0.3765, + "step": 3795 + }, + { + "epoch": 0.36944038929440387, + "grad_norm": 1.4124479402383887, + "learning_rate": 7.27149716327554e-06, + "loss": 0.4979, + "step": 3796 + }, + { + "epoch": 0.3695377128953771, + "grad_norm": 1.5746058137286074, + "learning_rate": 7.270092934031383e-06, + "loss": 0.6907, + "step": 3797 + }, + { + "epoch": 0.3696350364963504, + "grad_norm": 1.149224148134044, + "learning_rate": 7.268688479206694e-06, + "loss": 0.2767, + "step": 3798 + }, + { + "epoch": 0.3697323600973236, + "grad_norm": 1.3675830377895282, + "learning_rate": 7.267283798941038e-06, + "loss": 0.4495, + "step": 3799 + }, + { + "epoch": 0.36982968369829683, + "grad_norm": 1.4280450276827679, + "learning_rate": 7.265878893373996e-06, + "loss": 0.3899, + "step": 3800 + }, + { + "epoch": 0.3699270072992701, + "grad_norm": 1.49782627276864, + "learning_rate": 7.264473762645178e-06, + "loss": 0.4774, + "step": 3801 + }, + { + "epoch": 0.37002433090024334, + "grad_norm": 1.3952118310784345, + "learning_rate": 7.263068406894209e-06, + "loss": 0.5262, + "step": 3802 + }, + { + "epoch": 0.37012165450121653, + "grad_norm": 1.05962510209006, + "learning_rate": 7.261662826260741e-06, + "loss": 0.285, + "step": 3803 + }, + { + "epoch": 0.3702189781021898, + "grad_norm": 1.1322437831904242, + "learning_rate": 7.260257020884448e-06, + "loss": 0.3328, + "step": 3804 + }, + { + "epoch": 0.37031630170316304, + "grad_norm": 1.578419195268038, + "learning_rate": 7.2588509909050254e-06, + "loss": 0.6624, + "step": 3805 + }, + { + "epoch": 0.37041362530413624, + "grad_norm": 1.6753162370187171, + "learning_rate": 7.257444736462193e-06, + "loss": 0.6607, + "step": 3806 + }, + { + "epoch": 0.3705109489051095, + "grad_norm": 1.2155863954352035, + "learning_rate": 7.2560382576956875e-06, + "loss": 0.385, + "step": 3807 + }, + { + "epoch": 0.37060827250608275, + "grad_norm": 1.3086656237549659, + "learning_rate": 7.254631554745275e-06, + "loss": 0.3251, + "step": 3808 + }, + { + "epoch": 0.37070559610705595, + "grad_norm": 1.6932808088932305, + "learning_rate": 7.253224627750738e-06, + "loss": 0.5078, + "step": 3809 + }, + { + "epoch": 0.3708029197080292, + "grad_norm": 1.140327832951829, + "learning_rate": 7.251817476851886e-06, + "loss": 0.3278, + "step": 3810 + }, + { + "epoch": 0.37090024330900245, + "grad_norm": 1.2897084785014141, + "learning_rate": 7.2504101021885475e-06, + "loss": 0.3223, + "step": 3811 + }, + { + "epoch": 0.37099756690997565, + "grad_norm": 1.4467933285693058, + "learning_rate": 7.249002503900573e-06, + "loss": 0.3934, + "step": 3812 + }, + { + "epoch": 0.3710948905109489, + "grad_norm": 1.208314429606629, + "learning_rate": 7.2475946821278374e-06, + "loss": 0.4511, + "step": 3813 + }, + { + "epoch": 0.37119221411192216, + "grad_norm": 1.3518646431718864, + "learning_rate": 7.2461866370102354e-06, + "loss": 0.4939, + "step": 3814 + }, + { + "epoch": 0.37128953771289536, + "grad_norm": 1.360244692009117, + "learning_rate": 7.244778368687688e-06, + "loss": 0.3937, + "step": 3815 + }, + { + "epoch": 0.3713868613138686, + "grad_norm": 1.3528197069652437, + "learning_rate": 7.243369877300135e-06, + "loss": 0.3492, + "step": 3816 + }, + { + "epoch": 0.37148418491484186, + "grad_norm": 1.2751734783858382, + "learning_rate": 7.2419611629875386e-06, + "loss": 0.4052, + "step": 3817 + }, + { + "epoch": 0.37158150851581506, + "grad_norm": 1.2580260101504126, + "learning_rate": 7.240552225889882e-06, + "loss": 0.3386, + "step": 3818 + }, + { + "epoch": 0.3716788321167883, + "grad_norm": 1.3674139216806946, + "learning_rate": 7.239143066147174e-06, + "loss": 0.2891, + "step": 3819 + }, + { + "epoch": 0.37177615571776157, + "grad_norm": 1.0530572795884554, + "learning_rate": 7.237733683899444e-06, + "loss": 0.2657, + "step": 3820 + }, + { + "epoch": 0.37187347931873477, + "grad_norm": 2.205291656616187, + "learning_rate": 7.236324079286742e-06, + "loss": 0.2303, + "step": 3821 + }, + { + "epoch": 0.371970802919708, + "grad_norm": 1.1572222874558429, + "learning_rate": 7.234914252449141e-06, + "loss": 0.2307, + "step": 3822 + }, + { + "epoch": 0.3720681265206813, + "grad_norm": 1.3550158356870847, + "learning_rate": 7.233504203526738e-06, + "loss": 0.4026, + "step": 3823 + }, + { + "epoch": 0.37216545012165453, + "grad_norm": 1.1916885791889118, + "learning_rate": 7.232093932659648e-06, + "loss": 0.384, + "step": 3824 + }, + { + "epoch": 0.3722627737226277, + "grad_norm": 1.3181641384661131, + "learning_rate": 7.230683439988013e-06, + "loss": 0.3978, + "step": 3825 + }, + { + "epoch": 0.372360097323601, + "grad_norm": 1.4449003714564748, + "learning_rate": 7.229272725651995e-06, + "loss": 0.4663, + "step": 3826 + }, + { + "epoch": 0.37245742092457423, + "grad_norm": 1.110719215634453, + "learning_rate": 7.2278617897917734e-06, + "loss": 0.3073, + "step": 3827 + }, + { + "epoch": 0.37255474452554743, + "grad_norm": 1.3075317628405025, + "learning_rate": 7.226450632547558e-06, + "loss": 0.341, + "step": 3828 + }, + { + "epoch": 0.3726520681265207, + "grad_norm": 1.490372677606144, + "learning_rate": 7.225039254059574e-06, + "loss": 0.483, + "step": 3829 + }, + { + "epoch": 0.37274939172749394, + "grad_norm": 1.3137386471795909, + "learning_rate": 7.223627654468072e-06, + "loss": 0.3739, + "step": 3830 + }, + { + "epoch": 0.37284671532846714, + "grad_norm": 1.242608245033651, + "learning_rate": 7.2222158339133245e-06, + "loss": 0.2848, + "step": 3831 + }, + { + "epoch": 0.3729440389294404, + "grad_norm": 1.4129798917912184, + "learning_rate": 7.220803792535621e-06, + "loss": 0.3657, + "step": 3832 + }, + { + "epoch": 0.37304136253041364, + "grad_norm": 1.7023996330847022, + "learning_rate": 7.2193915304752815e-06, + "loss": 0.601, + "step": 3833 + }, + { + "epoch": 0.37313868613138684, + "grad_norm": 1.587053329225552, + "learning_rate": 7.2179790478726405e-06, + "loss": 0.3858, + "step": 3834 + }, + { + "epoch": 0.3732360097323601, + "grad_norm": 1.50243862921725, + "learning_rate": 7.216566344868059e-06, + "loss": 0.3792, + "step": 3835 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.2996422990779706, + "learning_rate": 7.215153421601917e-06, + "loss": 0.3695, + "step": 3836 + }, + { + "epoch": 0.37343065693430655, + "grad_norm": 1.2661927915794233, + "learning_rate": 7.2137402782146185e-06, + "loss": 0.2922, + "step": 3837 + }, + { + "epoch": 0.3735279805352798, + "grad_norm": 1.342404468128034, + "learning_rate": 7.212326914846587e-06, + "loss": 0.3832, + "step": 3838 + }, + { + "epoch": 0.37362530413625306, + "grad_norm": 1.3085065670293223, + "learning_rate": 7.2109133316382716e-06, + "loss": 0.3435, + "step": 3839 + }, + { + "epoch": 0.37372262773722625, + "grad_norm": 1.4136536570943345, + "learning_rate": 7.209499528730138e-06, + "loss": 0.3996, + "step": 3840 + }, + { + "epoch": 0.3738199513381995, + "grad_norm": 1.3042560709248439, + "learning_rate": 7.208085506262679e-06, + "loss": 0.3547, + "step": 3841 + }, + { + "epoch": 0.37391727493917276, + "grad_norm": 1.1956805990066928, + "learning_rate": 7.206671264376406e-06, + "loss": 0.3349, + "step": 3842 + }, + { + "epoch": 0.37401459854014596, + "grad_norm": 1.3725427866174449, + "learning_rate": 7.205256803211852e-06, + "loss": 0.3136, + "step": 3843 + }, + { + "epoch": 0.3741119221411192, + "grad_norm": 1.1819946263218575, + "learning_rate": 7.203842122909576e-06, + "loss": 0.3854, + "step": 3844 + }, + { + "epoch": 0.37420924574209247, + "grad_norm": 1.1749292879944346, + "learning_rate": 7.202427223610153e-06, + "loss": 0.2567, + "step": 3845 + }, + { + "epoch": 0.3743065693430657, + "grad_norm": 1.6177760725542194, + "learning_rate": 7.201012105454181e-06, + "loss": 0.6999, + "step": 3846 + }, + { + "epoch": 0.3744038929440389, + "grad_norm": 1.5266511327183672, + "learning_rate": 7.199596768582284e-06, + "loss": 0.5089, + "step": 3847 + }, + { + "epoch": 0.37450121654501217, + "grad_norm": 1.351768037018499, + "learning_rate": 7.198181213135107e-06, + "loss": 0.3024, + "step": 3848 + }, + { + "epoch": 0.3745985401459854, + "grad_norm": 2.0647138793608883, + "learning_rate": 7.19676543925331e-06, + "loss": 0.4637, + "step": 3849 + }, + { + "epoch": 0.3746958637469586, + "grad_norm": 1.0901772593808003, + "learning_rate": 7.19534944707758e-06, + "loss": 0.3197, + "step": 3850 + }, + { + "epoch": 0.3747931873479319, + "grad_norm": 1.5563466788905995, + "learning_rate": 7.193933236748627e-06, + "loss": 0.4692, + "step": 3851 + }, + { + "epoch": 0.37489051094890513, + "grad_norm": 1.6383898814090063, + "learning_rate": 7.192516808407179e-06, + "loss": 0.4814, + "step": 3852 + }, + { + "epoch": 0.37498783454987833, + "grad_norm": 1.234160440846945, + "learning_rate": 7.191100162193989e-06, + "loss": 0.4099, + "step": 3853 + }, + { + "epoch": 0.3750851581508516, + "grad_norm": 1.5270305810050784, + "learning_rate": 7.189683298249829e-06, + "loss": 0.553, + "step": 3854 + }, + { + "epoch": 0.37518248175182484, + "grad_norm": 1.4979140559754238, + "learning_rate": 7.1882662167154935e-06, + "loss": 0.2972, + "step": 3855 + }, + { + "epoch": 0.37527980535279803, + "grad_norm": 1.4793577831895819, + "learning_rate": 7.186848917731799e-06, + "loss": 0.517, + "step": 3856 + }, + { + "epoch": 0.3753771289537713, + "grad_norm": 1.1834990208552847, + "learning_rate": 7.1854314014395836e-06, + "loss": 0.2198, + "step": 3857 + }, + { + "epoch": 0.37547445255474454, + "grad_norm": 1.4192969456874913, + "learning_rate": 7.184013667979707e-06, + "loss": 0.4349, + "step": 3858 + }, + { + "epoch": 0.37557177615571774, + "grad_norm": 1.411758906981302, + "learning_rate": 7.1825957174930495e-06, + "loss": 0.6343, + "step": 3859 + }, + { + "epoch": 0.375669099756691, + "grad_norm": 1.4732647759178685, + "learning_rate": 7.181177550120514e-06, + "loss": 0.4049, + "step": 3860 + }, + { + "epoch": 0.37576642335766425, + "grad_norm": 1.3463082468649135, + "learning_rate": 7.1797591660030245e-06, + "loss": 0.4196, + "step": 3861 + }, + { + "epoch": 0.37586374695863745, + "grad_norm": 1.2161261851692835, + "learning_rate": 7.178340565281527e-06, + "loss": 0.3326, + "step": 3862 + }, + { + "epoch": 0.3759610705596107, + "grad_norm": 1.3015426377776436, + "learning_rate": 7.176921748096987e-06, + "loss": 0.3816, + "step": 3863 + }, + { + "epoch": 0.37605839416058395, + "grad_norm": 1.2745689561042768, + "learning_rate": 7.175502714590398e-06, + "loss": 0.365, + "step": 3864 + }, + { + "epoch": 0.37615571776155715, + "grad_norm": 1.3894630463948126, + "learning_rate": 7.174083464902765e-06, + "loss": 0.5022, + "step": 3865 + }, + { + "epoch": 0.3762530413625304, + "grad_norm": 1.6621696201258571, + "learning_rate": 7.172663999175123e-06, + "loss": 0.6661, + "step": 3866 + }, + { + "epoch": 0.37635036496350366, + "grad_norm": 1.6093508524147906, + "learning_rate": 7.171244317548522e-06, + "loss": 0.2752, + "step": 3867 + }, + { + "epoch": 0.3764476885644769, + "grad_norm": 1.2470356123121078, + "learning_rate": 7.16982442016404e-06, + "loss": 0.2922, + "step": 3868 + }, + { + "epoch": 0.3765450121654501, + "grad_norm": 1.4271789388815392, + "learning_rate": 7.168404307162773e-06, + "loss": 0.3537, + "step": 3869 + }, + { + "epoch": 0.37664233576642336, + "grad_norm": 1.2732277445845157, + "learning_rate": 7.166983978685835e-06, + "loss": 0.2861, + "step": 3870 + }, + { + "epoch": 0.3767396593673966, + "grad_norm": 1.5040576412741455, + "learning_rate": 7.165563434874367e-06, + "loss": 0.5636, + "step": 3871 + }, + { + "epoch": 0.3768369829683698, + "grad_norm": 1.305952270015343, + "learning_rate": 7.164142675869531e-06, + "loss": 0.3189, + "step": 3872 + }, + { + "epoch": 0.37693430656934307, + "grad_norm": 1.297847888444762, + "learning_rate": 7.162721701812506e-06, + "loss": 0.2904, + "step": 3873 + }, + { + "epoch": 0.3770316301703163, + "grad_norm": 1.4659816543867576, + "learning_rate": 7.161300512844496e-06, + "loss": 0.2955, + "step": 3874 + }, + { + "epoch": 0.3771289537712895, + "grad_norm": 1.2035904516565876, + "learning_rate": 7.159879109106726e-06, + "loss": 0.2557, + "step": 3875 + }, + { + "epoch": 0.3772262773722628, + "grad_norm": 1.598650858223209, + "learning_rate": 7.158457490740442e-06, + "loss": 0.7335, + "step": 3876 + }, + { + "epoch": 0.37732360097323603, + "grad_norm": 1.2305628180498833, + "learning_rate": 7.157035657886911e-06, + "loss": 0.2654, + "step": 3877 + }, + { + "epoch": 0.3774209245742092, + "grad_norm": 1.709426282151801, + "learning_rate": 7.1556136106874195e-06, + "loss": 0.4445, + "step": 3878 + }, + { + "epoch": 0.3775182481751825, + "grad_norm": 1.5920181588666038, + "learning_rate": 7.154191349283278e-06, + "loss": 0.4233, + "step": 3879 + }, + { + "epoch": 0.37761557177615573, + "grad_norm": 1.4519282509492475, + "learning_rate": 7.152768873815819e-06, + "loss": 0.4399, + "step": 3880 + }, + { + "epoch": 0.37771289537712893, + "grad_norm": 1.118109262781078, + "learning_rate": 7.151346184426394e-06, + "loss": 0.2138, + "step": 3881 + }, + { + "epoch": 0.3778102189781022, + "grad_norm": 1.233018443573734, + "learning_rate": 7.1499232812563765e-06, + "loss": 0.266, + "step": 3882 + }, + { + "epoch": 0.37790754257907544, + "grad_norm": 1.4646778063558124, + "learning_rate": 7.148500164447159e-06, + "loss": 0.4118, + "step": 3883 + }, + { + "epoch": 0.37800486618004864, + "grad_norm": 1.1142467929519195, + "learning_rate": 7.147076834140163e-06, + "loss": 0.3422, + "step": 3884 + }, + { + "epoch": 0.3781021897810219, + "grad_norm": 1.2249131016174075, + "learning_rate": 7.145653290476821e-06, + "loss": 0.3973, + "step": 3885 + }, + { + "epoch": 0.37819951338199514, + "grad_norm": 1.3075094712827902, + "learning_rate": 7.144229533598593e-06, + "loss": 0.4029, + "step": 3886 + }, + { + "epoch": 0.37829683698296834, + "grad_norm": 1.2831548162044333, + "learning_rate": 7.142805563646957e-06, + "loss": 0.3863, + "step": 3887 + }, + { + "epoch": 0.3783941605839416, + "grad_norm": 0.8015397364189331, + "learning_rate": 7.1413813807634144e-06, + "loss": 0.1929, + "step": 3888 + }, + { + "epoch": 0.37849148418491485, + "grad_norm": 2.1960941972871186, + "learning_rate": 7.1399569850894886e-06, + "loss": 0.4767, + "step": 3889 + }, + { + "epoch": 0.3785888077858881, + "grad_norm": 1.4481987255363193, + "learning_rate": 7.138532376766722e-06, + "loss": 0.4454, + "step": 3890 + }, + { + "epoch": 0.3786861313868613, + "grad_norm": 0.951708820286833, + "learning_rate": 7.13710755593668e-06, + "loss": 0.186, + "step": 3891 + }, + { + "epoch": 0.37878345498783456, + "grad_norm": 1.6714364305481455, + "learning_rate": 7.1356825227409455e-06, + "loss": 0.6873, + "step": 3892 + }, + { + "epoch": 0.3788807785888078, + "grad_norm": 2.6374385953146238, + "learning_rate": 7.134257277321126e-06, + "loss": 0.3606, + "step": 3893 + }, + { + "epoch": 0.378978102189781, + "grad_norm": 1.326824044728447, + "learning_rate": 7.13283181981885e-06, + "loss": 0.2763, + "step": 3894 + }, + { + "epoch": 0.37907542579075426, + "grad_norm": 1.3996167884606898, + "learning_rate": 7.131406150375764e-06, + "loss": 0.5331, + "step": 3895 + }, + { + "epoch": 0.3791727493917275, + "grad_norm": 1.2228137027647799, + "learning_rate": 7.129980269133539e-06, + "loss": 0.27, + "step": 3896 + }, + { + "epoch": 0.3792700729927007, + "grad_norm": 1.4657212540958298, + "learning_rate": 7.128554176233865e-06, + "loss": 0.3137, + "step": 3897 + }, + { + "epoch": 0.37936739659367397, + "grad_norm": 1.9918813432796558, + "learning_rate": 7.127127871818455e-06, + "loss": 0.3948, + "step": 3898 + }, + { + "epoch": 0.3794647201946472, + "grad_norm": 1.8850475673593161, + "learning_rate": 7.12570135602904e-06, + "loss": 0.775, + "step": 3899 + }, + { + "epoch": 0.3795620437956204, + "grad_norm": 1.6679949077189995, + "learning_rate": 7.124274629007375e-06, + "loss": 0.6015, + "step": 3900 + }, + { + "epoch": 0.37965936739659367, + "grad_norm": 1.2107596482139231, + "learning_rate": 7.122847690895235e-06, + "loss": 0.2573, + "step": 3901 + }, + { + "epoch": 0.3797566909975669, + "grad_norm": 1.352463967342193, + "learning_rate": 7.1214205418344155e-06, + "loss": 0.428, + "step": 3902 + }, + { + "epoch": 0.3798540145985401, + "grad_norm": 1.6246032230437741, + "learning_rate": 7.1199931819667316e-06, + "loss": 0.5207, + "step": 3903 + }, + { + "epoch": 0.3799513381995134, + "grad_norm": 1.4207183828680439, + "learning_rate": 7.118565611434023e-06, + "loss": 0.3472, + "step": 3904 + }, + { + "epoch": 0.38004866180048663, + "grad_norm": 1.1713460109772222, + "learning_rate": 7.117137830378147e-06, + "loss": 0.3043, + "step": 3905 + }, + { + "epoch": 0.38014598540145983, + "grad_norm": 1.4955277530157316, + "learning_rate": 7.115709838940983e-06, + "loss": 0.5889, + "step": 3906 + }, + { + "epoch": 0.3802433090024331, + "grad_norm": 1.4020682146561894, + "learning_rate": 7.114281637264433e-06, + "loss": 0.5088, + "step": 3907 + }, + { + "epoch": 0.38034063260340634, + "grad_norm": 1.547124360918337, + "learning_rate": 7.112853225490417e-06, + "loss": 0.3316, + "step": 3908 + }, + { + "epoch": 0.38043795620437953, + "grad_norm": 1.2645380509557882, + "learning_rate": 7.111424603760877e-06, + "loss": 0.4013, + "step": 3909 + }, + { + "epoch": 0.3805352798053528, + "grad_norm": 1.7019140855865837, + "learning_rate": 7.109995772217776e-06, + "loss": 0.6348, + "step": 3910 + }, + { + "epoch": 0.38063260340632604, + "grad_norm": 1.304725235261976, + "learning_rate": 7.108566731003099e-06, + "loss": 0.4496, + "step": 3911 + }, + { + "epoch": 0.3807299270072993, + "grad_norm": 1.324816183619312, + "learning_rate": 7.1071374802588496e-06, + "loss": 0.3335, + "step": 3912 + }, + { + "epoch": 0.3808272506082725, + "grad_norm": 1.3880021724436575, + "learning_rate": 7.1057080201270535e-06, + "loss": 0.46, + "step": 3913 + }, + { + "epoch": 0.38092457420924575, + "grad_norm": 1.5324573817091165, + "learning_rate": 7.104278350749757e-06, + "loss": 0.4837, + "step": 3914 + }, + { + "epoch": 0.381021897810219, + "grad_norm": 1.415341468021005, + "learning_rate": 7.1028484722690275e-06, + "loss": 0.2941, + "step": 3915 + }, + { + "epoch": 0.3811192214111922, + "grad_norm": 1.2327883216411208, + "learning_rate": 7.101418384826953e-06, + "loss": 0.4483, + "step": 3916 + }, + { + "epoch": 0.38121654501216545, + "grad_norm": 1.4959339823461342, + "learning_rate": 7.099988088565642e-06, + "loss": 0.4297, + "step": 3917 + }, + { + "epoch": 0.3813138686131387, + "grad_norm": 1.5384484629808992, + "learning_rate": 7.098557583627224e-06, + "loss": 0.4326, + "step": 3918 + }, + { + "epoch": 0.3814111922141119, + "grad_norm": 1.6679471659994731, + "learning_rate": 7.097126870153849e-06, + "loss": 0.5069, + "step": 3919 + }, + { + "epoch": 0.38150851581508516, + "grad_norm": 1.45117080708974, + "learning_rate": 7.095695948287686e-06, + "loss": 0.3661, + "step": 3920 + }, + { + "epoch": 0.3816058394160584, + "grad_norm": 1.4481947743840338, + "learning_rate": 7.094264818170931e-06, + "loss": 0.3961, + "step": 3921 + }, + { + "epoch": 0.3817031630170316, + "grad_norm": 1.348596217427556, + "learning_rate": 7.092833479945793e-06, + "loss": 0.4029, + "step": 3922 + }, + { + "epoch": 0.38180048661800486, + "grad_norm": 1.3981471692690721, + "learning_rate": 7.091401933754507e-06, + "loss": 0.3814, + "step": 3923 + }, + { + "epoch": 0.3818978102189781, + "grad_norm": 1.8099756090617796, + "learning_rate": 7.089970179739323e-06, + "loss": 0.3998, + "step": 3924 + }, + { + "epoch": 0.3819951338199513, + "grad_norm": 1.5103246288100645, + "learning_rate": 7.088538218042519e-06, + "loss": 0.5697, + "step": 3925 + }, + { + "epoch": 0.38209245742092457, + "grad_norm": 1.1477205283158238, + "learning_rate": 7.087106048806388e-06, + "loss": 0.3023, + "step": 3926 + }, + { + "epoch": 0.3821897810218978, + "grad_norm": 1.2535758370464407, + "learning_rate": 7.085673672173247e-06, + "loss": 0.3408, + "step": 3927 + }, + { + "epoch": 0.382287104622871, + "grad_norm": 1.1952324136071744, + "learning_rate": 7.0842410882854305e-06, + "loss": 0.3339, + "step": 3928 + }, + { + "epoch": 0.3823844282238443, + "grad_norm": 1.2651491565748119, + "learning_rate": 7.082808297285296e-06, + "loss": 0.3817, + "step": 3929 + }, + { + "epoch": 0.38248175182481753, + "grad_norm": 1.5429938591954184, + "learning_rate": 7.081375299315221e-06, + "loss": 0.5274, + "step": 3930 + }, + { + "epoch": 0.3825790754257907, + "grad_norm": 1.3843509715042235, + "learning_rate": 7.0799420945176026e-06, + "loss": 0.4354, + "step": 3931 + }, + { + "epoch": 0.382676399026764, + "grad_norm": 1.2382754737887756, + "learning_rate": 7.078508683034862e-06, + "loss": 0.4135, + "step": 3932 + }, + { + "epoch": 0.38277372262773723, + "grad_norm": 1.659344068335914, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.357, + "step": 3933 + }, + { + "epoch": 0.3828710462287105, + "grad_norm": 1.2935879886930641, + "learning_rate": 7.0756412405837795e-06, + "loss": 0.3475, + "step": 3934 + }, + { + "epoch": 0.3829683698296837, + "grad_norm": 1.279920120428988, + "learning_rate": 7.07420720990038e-06, + "loss": 0.2811, + "step": 3935 + }, + { + "epoch": 0.38306569343065694, + "grad_norm": 1.5715982667528452, + "learning_rate": 7.072772973101735e-06, + "loss": 0.2954, + "step": 3936 + }, + { + "epoch": 0.3831630170316302, + "grad_norm": 1.4996379574324572, + "learning_rate": 7.071338530330365e-06, + "loss": 0.3162, + "step": 3937 + }, + { + "epoch": 0.3832603406326034, + "grad_norm": 1.323656350948574, + "learning_rate": 7.069903881728815e-06, + "loss": 0.4149, + "step": 3938 + }, + { + "epoch": 0.38335766423357664, + "grad_norm": 1.5522008591091048, + "learning_rate": 7.068469027439642e-06, + "loss": 0.5585, + "step": 3939 + }, + { + "epoch": 0.3834549878345499, + "grad_norm": 1.6876876161618577, + "learning_rate": 7.06703396760543e-06, + "loss": 0.3946, + "step": 3940 + }, + { + "epoch": 0.3835523114355231, + "grad_norm": 1.5052911896213828, + "learning_rate": 7.065598702368782e-06, + "loss": 0.5529, + "step": 3941 + }, + { + "epoch": 0.38364963503649635, + "grad_norm": 1.7507247786327806, + "learning_rate": 7.0641632318723205e-06, + "loss": 0.6298, + "step": 3942 + }, + { + "epoch": 0.3837469586374696, + "grad_norm": 1.295827327522188, + "learning_rate": 7.062727556258693e-06, + "loss": 0.3517, + "step": 3943 + }, + { + "epoch": 0.3838442822384428, + "grad_norm": 1.037020059823786, + "learning_rate": 7.061291675670557e-06, + "loss": 0.2667, + "step": 3944 + }, + { + "epoch": 0.38394160583941606, + "grad_norm": 1.4712371869692, + "learning_rate": 7.059855590250604e-06, + "loss": 0.4601, + "step": 3945 + }, + { + "epoch": 0.3840389294403893, + "grad_norm": 1.4848893729981114, + "learning_rate": 7.058419300141531e-06, + "loss": 0.458, + "step": 3946 + }, + { + "epoch": 0.3841362530413625, + "grad_norm": 1.3674844473838341, + "learning_rate": 7.056982805486069e-06, + "loss": 0.4278, + "step": 3947 + }, + { + "epoch": 0.38423357664233576, + "grad_norm": 1.6438745197663538, + "learning_rate": 7.055546106426961e-06, + "loss": 0.6002, + "step": 3948 + }, + { + "epoch": 0.384330900243309, + "grad_norm": 1.2362894591542046, + "learning_rate": 7.054109203106974e-06, + "loss": 0.3796, + "step": 3949 + }, + { + "epoch": 0.3844282238442822, + "grad_norm": 1.6016145279039653, + "learning_rate": 7.052672095668891e-06, + "loss": 0.4956, + "step": 3950 + }, + { + "epoch": 0.38452554744525547, + "grad_norm": 1.578296991928203, + "learning_rate": 7.0512347842555205e-06, + "loss": 0.3424, + "step": 3951 + }, + { + "epoch": 0.3846228710462287, + "grad_norm": 1.1907848300937582, + "learning_rate": 7.049797269009689e-06, + "loss": 0.361, + "step": 3952 + }, + { + "epoch": 0.384720194647202, + "grad_norm": 1.4905027295394417, + "learning_rate": 7.048359550074244e-06, + "loss": 0.4279, + "step": 3953 + }, + { + "epoch": 0.38481751824817517, + "grad_norm": 1.4651231016184674, + "learning_rate": 7.046921627592051e-06, + "loss": 0.4622, + "step": 3954 + }, + { + "epoch": 0.3849148418491484, + "grad_norm": 1.3954586503570294, + "learning_rate": 7.045483501705997e-06, + "loss": 0.2737, + "step": 3955 + }, + { + "epoch": 0.3850121654501217, + "grad_norm": 1.4949563585883912, + "learning_rate": 7.044045172558991e-06, + "loss": 0.3092, + "step": 3956 + }, + { + "epoch": 0.3851094890510949, + "grad_norm": 1.3655729515230581, + "learning_rate": 7.042606640293958e-06, + "loss": 0.4943, + "step": 3957 + }, + { + "epoch": 0.38520681265206813, + "grad_norm": 1.350946010163059, + "learning_rate": 7.04116790505385e-06, + "loss": 0.5314, + "step": 3958 + }, + { + "epoch": 0.3853041362530414, + "grad_norm": 1.3729520632050074, + "learning_rate": 7.039728966981632e-06, + "loss": 0.4056, + "step": 3959 + }, + { + "epoch": 0.3854014598540146, + "grad_norm": 1.4800883184320073, + "learning_rate": 7.038289826220292e-06, + "loss": 0.4511, + "step": 3960 + }, + { + "epoch": 0.38549878345498784, + "grad_norm": 1.4839457644194014, + "learning_rate": 7.036850482912841e-06, + "loss": 0.5634, + "step": 3961 + }, + { + "epoch": 0.3855961070559611, + "grad_norm": 1.4722550146581384, + "learning_rate": 7.035410937202303e-06, + "loss": 0.4885, + "step": 3962 + }, + { + "epoch": 0.3856934306569343, + "grad_norm": 1.385565264579537, + "learning_rate": 7.033971189231731e-06, + "loss": 0.4708, + "step": 3963 + }, + { + "epoch": 0.38579075425790754, + "grad_norm": 1.4522732114770711, + "learning_rate": 7.032531239144192e-06, + "loss": 0.5378, + "step": 3964 + }, + { + "epoch": 0.3858880778588808, + "grad_norm": 1.14748867569532, + "learning_rate": 7.031091087082773e-06, + "loss": 0.3676, + "step": 3965 + }, + { + "epoch": 0.385985401459854, + "grad_norm": 1.3321032015253773, + "learning_rate": 7.029650733190585e-06, + "loss": 0.4896, + "step": 3966 + }, + { + "epoch": 0.38608272506082725, + "grad_norm": 1.2135132498408774, + "learning_rate": 7.028210177610755e-06, + "loss": 0.3284, + "step": 3967 + }, + { + "epoch": 0.3861800486618005, + "grad_norm": 1.5809800411366401, + "learning_rate": 7.026769420486435e-06, + "loss": 0.5651, + "step": 3968 + }, + { + "epoch": 0.3862773722627737, + "grad_norm": 1.2932935231921396, + "learning_rate": 7.025328461960791e-06, + "loss": 0.3895, + "step": 3969 + }, + { + "epoch": 0.38637469586374695, + "grad_norm": 1.1609928698062038, + "learning_rate": 7.023887302177013e-06, + "loss": 0.3102, + "step": 3970 + }, + { + "epoch": 0.3864720194647202, + "grad_norm": 1.3129146011919008, + "learning_rate": 7.022445941278308e-06, + "loss": 0.3229, + "step": 3971 + }, + { + "epoch": 0.3865693430656934, + "grad_norm": 1.5073708000931252, + "learning_rate": 7.02100437940791e-06, + "loss": 0.4405, + "step": 3972 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 1.525639512470862, + "learning_rate": 7.019562616709061e-06, + "loss": 0.4644, + "step": 3973 + }, + { + "epoch": 0.3867639902676399, + "grad_norm": 1.6237386062609962, + "learning_rate": 7.018120653325037e-06, + "loss": 0.6601, + "step": 3974 + }, + { + "epoch": 0.38686131386861317, + "grad_norm": 1.345911062533185, + "learning_rate": 7.016678489399121e-06, + "loss": 0.5065, + "step": 3975 + }, + { + "epoch": 0.38695863746958636, + "grad_norm": 1.514524079658759, + "learning_rate": 7.015236125074626e-06, + "loss": 0.2811, + "step": 3976 + }, + { + "epoch": 0.3870559610705596, + "grad_norm": 1.4135796159763112, + "learning_rate": 7.013793560494877e-06, + "loss": 0.4894, + "step": 3977 + }, + { + "epoch": 0.38715328467153287, + "grad_norm": 1.4104178653563566, + "learning_rate": 7.012350795803223e-06, + "loss": 0.5016, + "step": 3978 + }, + { + "epoch": 0.38725060827250607, + "grad_norm": 1.3045531335042206, + "learning_rate": 7.010907831143035e-06, + "loss": 0.2089, + "step": 3979 + }, + { + "epoch": 0.3873479318734793, + "grad_norm": 1.4042038760740407, + "learning_rate": 7.009464666657701e-06, + "loss": 0.3769, + "step": 3980 + }, + { + "epoch": 0.3874452554744526, + "grad_norm": 1.479965245122337, + "learning_rate": 7.008021302490626e-06, + "loss": 0.4625, + "step": 3981 + }, + { + "epoch": 0.3875425790754258, + "grad_norm": 1.4175283205139955, + "learning_rate": 7.0065777387852405e-06, + "loss": 0.3618, + "step": 3982 + }, + { + "epoch": 0.38763990267639903, + "grad_norm": 1.5162882975673595, + "learning_rate": 7.005133975684992e-06, + "loss": 0.4722, + "step": 3983 + }, + { + "epoch": 0.3877372262773723, + "grad_norm": 1.4933222106252568, + "learning_rate": 7.003690013333348e-06, + "loss": 0.2983, + "step": 3984 + }, + { + "epoch": 0.3878345498783455, + "grad_norm": 1.3557382804846623, + "learning_rate": 7.002245851873794e-06, + "loss": 0.5318, + "step": 3985 + }, + { + "epoch": 0.38793187347931873, + "grad_norm": 1.167640461283426, + "learning_rate": 7.000801491449843e-06, + "loss": 0.2841, + "step": 3986 + }, + { + "epoch": 0.388029197080292, + "grad_norm": 1.4266522816945901, + "learning_rate": 6.9993569322050145e-06, + "loss": 0.492, + "step": 3987 + }, + { + "epoch": 0.3881265206812652, + "grad_norm": 1.4821564005694208, + "learning_rate": 6.997912174282859e-06, + "loss": 0.4676, + "step": 3988 + }, + { + "epoch": 0.38822384428223844, + "grad_norm": 1.2497816848843788, + "learning_rate": 6.996467217826944e-06, + "loss": 0.3818, + "step": 3989 + }, + { + "epoch": 0.3883211678832117, + "grad_norm": 1.3419552727592317, + "learning_rate": 6.995022062980854e-06, + "loss": 0.3393, + "step": 3990 + }, + { + "epoch": 0.3884184914841849, + "grad_norm": 1.8154719749479167, + "learning_rate": 6.993576709888196e-06, + "loss": 0.4484, + "step": 3991 + }, + { + "epoch": 0.38851581508515814, + "grad_norm": 1.351000409855417, + "learning_rate": 6.992131158692594e-06, + "loss": 0.4945, + "step": 3992 + }, + { + "epoch": 0.3886131386861314, + "grad_norm": 1.6363889229872557, + "learning_rate": 6.9906854095376946e-06, + "loss": 0.5338, + "step": 3993 + }, + { + "epoch": 0.3887104622871046, + "grad_norm": 1.7802129035237775, + "learning_rate": 6.989239462567162e-06, + "loss": 0.3984, + "step": 3994 + }, + { + "epoch": 0.38880778588807785, + "grad_norm": 1.5793157183409696, + "learning_rate": 6.987793317924683e-06, + "loss": 0.3973, + "step": 3995 + }, + { + "epoch": 0.3889051094890511, + "grad_norm": 1.5416565542490828, + "learning_rate": 6.986346975753958e-06, + "loss": 0.5045, + "step": 3996 + }, + { + "epoch": 0.38900243309002436, + "grad_norm": 0.8915263517146853, + "learning_rate": 6.984900436198715e-06, + "loss": 0.1697, + "step": 3997 + }, + { + "epoch": 0.38909975669099756, + "grad_norm": 1.5546582089832586, + "learning_rate": 6.983453699402695e-06, + "loss": 0.4891, + "step": 3998 + }, + { + "epoch": 0.3891970802919708, + "grad_norm": 1.5087179519043437, + "learning_rate": 6.9820067655096615e-06, + "loss": 0.526, + "step": 3999 + }, + { + "epoch": 0.38929440389294406, + "grad_norm": 1.5145233712620612, + "learning_rate": 6.980559634663397e-06, + "loss": 0.4787, + "step": 4000 + }, + { + "epoch": 0.38939172749391726, + "grad_norm": 1.418350630690952, + "learning_rate": 6.979112307007705e-06, + "loss": 0.5409, + "step": 4001 + }, + { + "epoch": 0.3894890510948905, + "grad_norm": 1.6433890174783798, + "learning_rate": 6.977664782686406e-06, + "loss": 0.4894, + "step": 4002 + }, + { + "epoch": 0.38958637469586377, + "grad_norm": 1.1028189021006047, + "learning_rate": 6.976217061843343e-06, + "loss": 0.2523, + "step": 4003 + }, + { + "epoch": 0.38968369829683697, + "grad_norm": 1.1910178630203534, + "learning_rate": 6.974769144622374e-06, + "loss": 0.261, + "step": 4004 + }, + { + "epoch": 0.3897810218978102, + "grad_norm": 1.416321272912632, + "learning_rate": 6.9733210311673826e-06, + "loss": 0.3422, + "step": 4005 + }, + { + "epoch": 0.3898783454987835, + "grad_norm": 1.1678284015343599, + "learning_rate": 6.971872721622268e-06, + "loss": 0.2577, + "step": 4006 + }, + { + "epoch": 0.38997566909975667, + "grad_norm": 1.6751662328419257, + "learning_rate": 6.970424216130949e-06, + "loss": 0.4419, + "step": 4007 + }, + { + "epoch": 0.3900729927007299, + "grad_norm": 1.3122627861372418, + "learning_rate": 6.968975514837364e-06, + "loss": 0.3431, + "step": 4008 + }, + { + "epoch": 0.3901703163017032, + "grad_norm": 1.4908254908438412, + "learning_rate": 6.967526617885471e-06, + "loss": 0.5071, + "step": 4009 + }, + { + "epoch": 0.3902676399026764, + "grad_norm": 1.273008168870797, + "learning_rate": 6.966077525419249e-06, + "loss": 0.3637, + "step": 4010 + }, + { + "epoch": 0.39036496350364963, + "grad_norm": 1.8505049634947817, + "learning_rate": 6.964628237582696e-06, + "loss": 0.6389, + "step": 4011 + }, + { + "epoch": 0.3904622871046229, + "grad_norm": 1.350943274678673, + "learning_rate": 6.963178754519826e-06, + "loss": 0.3458, + "step": 4012 + }, + { + "epoch": 0.3905596107055961, + "grad_norm": 1.6230761069776574, + "learning_rate": 6.961729076374679e-06, + "loss": 0.5931, + "step": 4013 + }, + { + "epoch": 0.39065693430656934, + "grad_norm": 1.4687261376573049, + "learning_rate": 6.960279203291305e-06, + "loss": 0.5103, + "step": 4014 + }, + { + "epoch": 0.3907542579075426, + "grad_norm": 2.279060462648419, + "learning_rate": 6.958829135413783e-06, + "loss": 0.3421, + "step": 4015 + }, + { + "epoch": 0.3908515815085158, + "grad_norm": 1.4453991838183962, + "learning_rate": 6.957378872886205e-06, + "loss": 0.4648, + "step": 4016 + }, + { + "epoch": 0.39094890510948904, + "grad_norm": 1.2752498064098947, + "learning_rate": 6.955928415852686e-06, + "loss": 0.2475, + "step": 4017 + }, + { + "epoch": 0.3910462287104623, + "grad_norm": 1.3162378944934356, + "learning_rate": 6.954477764457359e-06, + "loss": 0.5026, + "step": 4018 + }, + { + "epoch": 0.39114355231143555, + "grad_norm": 1.4455647016550808, + "learning_rate": 6.953026918844375e-06, + "loss": 0.3693, + "step": 4019 + }, + { + "epoch": 0.39124087591240875, + "grad_norm": 1.7197570624779743, + "learning_rate": 6.951575879157904e-06, + "loss": 0.5467, + "step": 4020 + }, + { + "epoch": 0.391338199513382, + "grad_norm": 1.3683763838921565, + "learning_rate": 6.950124645542139e-06, + "loss": 0.3125, + "step": 4021 + }, + { + "epoch": 0.39143552311435525, + "grad_norm": 1.3684386713615748, + "learning_rate": 6.948673218141291e-06, + "loss": 0.4659, + "step": 4022 + }, + { + "epoch": 0.39153284671532845, + "grad_norm": 2.7578384354968497, + "learning_rate": 6.947221597099585e-06, + "loss": 0.4887, + "step": 4023 + }, + { + "epoch": 0.3916301703163017, + "grad_norm": 1.331068524451763, + "learning_rate": 6.945769782561273e-06, + "loss": 0.3888, + "step": 4024 + }, + { + "epoch": 0.39172749391727496, + "grad_norm": 1.7490396589403405, + "learning_rate": 6.944317774670622e-06, + "loss": 0.3748, + "step": 4025 + }, + { + "epoch": 0.39182481751824816, + "grad_norm": 1.4248992281969828, + "learning_rate": 6.942865573571919e-06, + "loss": 0.3915, + "step": 4026 + }, + { + "epoch": 0.3919221411192214, + "grad_norm": 1.485964355867862, + "learning_rate": 6.941413179409468e-06, + "loss": 0.3346, + "step": 4027 + }, + { + "epoch": 0.39201946472019467, + "grad_norm": 1.5466928085770633, + "learning_rate": 6.939960592327599e-06, + "loss": 0.5374, + "step": 4028 + }, + { + "epoch": 0.39211678832116786, + "grad_norm": 1.5883887378893773, + "learning_rate": 6.938507812470652e-06, + "loss": 0.5226, + "step": 4029 + }, + { + "epoch": 0.3922141119221411, + "grad_norm": 2.786093191780367, + "learning_rate": 6.937054839982993e-06, + "loss": 0.2959, + "step": 4030 + }, + { + "epoch": 0.39231143552311437, + "grad_norm": 1.3276365840641022, + "learning_rate": 6.935601675009003e-06, + "loss": 0.4711, + "step": 4031 + }, + { + "epoch": 0.39240875912408757, + "grad_norm": 1.289716288913328, + "learning_rate": 6.934148317693083e-06, + "loss": 0.2954, + "step": 4032 + }, + { + "epoch": 0.3925060827250608, + "grad_norm": 1.44676589060076, + "learning_rate": 6.932694768179659e-06, + "loss": 0.326, + "step": 4033 + }, + { + "epoch": 0.3926034063260341, + "grad_norm": 1.6207833969674215, + "learning_rate": 6.9312410266131665e-06, + "loss": 0.4315, + "step": 4034 + }, + { + "epoch": 0.3927007299270073, + "grad_norm": 1.5436789121746861, + "learning_rate": 6.929787093138067e-06, + "loss": 0.249, + "step": 4035 + }, + { + "epoch": 0.39279805352798053, + "grad_norm": 1.3498642235046783, + "learning_rate": 6.9283329678988375e-06, + "loss": 0.4106, + "step": 4036 + }, + { + "epoch": 0.3928953771289538, + "grad_norm": 1.3134123038417191, + "learning_rate": 6.926878651039975e-06, + "loss": 0.3761, + "step": 4037 + }, + { + "epoch": 0.392992700729927, + "grad_norm": 1.4258818290455284, + "learning_rate": 6.925424142705997e-06, + "loss": 0.3464, + "step": 4038 + }, + { + "epoch": 0.39309002433090023, + "grad_norm": 1.4508029884759852, + "learning_rate": 6.92396944304144e-06, + "loss": 0.3037, + "step": 4039 + }, + { + "epoch": 0.3931873479318735, + "grad_norm": 2.156543435295907, + "learning_rate": 6.922514552190856e-06, + "loss": 0.6332, + "step": 4040 + }, + { + "epoch": 0.39328467153284674, + "grad_norm": 1.6813909365209494, + "learning_rate": 6.921059470298819e-06, + "loss": 0.7023, + "step": 4041 + }, + { + "epoch": 0.39338199513381994, + "grad_norm": 1.3183413808089122, + "learning_rate": 6.91960419750992e-06, + "loss": 0.4137, + "step": 4042 + }, + { + "epoch": 0.3934793187347932, + "grad_norm": 1.626376595990447, + "learning_rate": 6.918148733968774e-06, + "loss": 0.5167, + "step": 4043 + }, + { + "epoch": 0.39357664233576645, + "grad_norm": 1.3584619722516016, + "learning_rate": 6.916693079820009e-06, + "loss": 0.4243, + "step": 4044 + }, + { + "epoch": 0.39367396593673964, + "grad_norm": 2.692105804461361, + "learning_rate": 6.915237235208274e-06, + "loss": 0.2159, + "step": 4045 + }, + { + "epoch": 0.3937712895377129, + "grad_norm": 1.4379984779190689, + "learning_rate": 6.913781200278239e-06, + "loss": 0.3612, + "step": 4046 + }, + { + "epoch": 0.39386861313868615, + "grad_norm": 1.2761654800240798, + "learning_rate": 6.9123249751745866e-06, + "loss": 0.3696, + "step": 4047 + }, + { + "epoch": 0.39396593673965935, + "grad_norm": 1.0634612055178503, + "learning_rate": 6.91086856004203e-06, + "loss": 0.3053, + "step": 4048 + }, + { + "epoch": 0.3940632603406326, + "grad_norm": 1.3282406027353941, + "learning_rate": 6.90941195502529e-06, + "loss": 0.3465, + "step": 4049 + }, + { + "epoch": 0.39416058394160586, + "grad_norm": 1.1979553824540066, + "learning_rate": 6.907955160269107e-06, + "loss": 0.3624, + "step": 4050 + }, + { + "epoch": 0.39425790754257906, + "grad_norm": 1.479713025623162, + "learning_rate": 6.90649817591825e-06, + "loss": 0.5207, + "step": 4051 + }, + { + "epoch": 0.3943552311435523, + "grad_norm": 1.2910853380088252, + "learning_rate": 6.905041002117494e-06, + "loss": 0.3805, + "step": 4052 + }, + { + "epoch": 0.39445255474452556, + "grad_norm": 1.322518756928142, + "learning_rate": 6.903583639011647e-06, + "loss": 0.3741, + "step": 4053 + }, + { + "epoch": 0.39454987834549876, + "grad_norm": 1.388861062512862, + "learning_rate": 6.902126086745521e-06, + "loss": 0.3978, + "step": 4054 + }, + { + "epoch": 0.394647201946472, + "grad_norm": 1.431056050315178, + "learning_rate": 6.900668345463958e-06, + "loss": 0.4779, + "step": 4055 + }, + { + "epoch": 0.39474452554744527, + "grad_norm": 1.3089722954111018, + "learning_rate": 6.8992104153118124e-06, + "loss": 0.2481, + "step": 4056 + }, + { + "epoch": 0.39484184914841847, + "grad_norm": 1.571119717801153, + "learning_rate": 6.8977522964339596e-06, + "loss": 0.422, + "step": 4057 + }, + { + "epoch": 0.3949391727493917, + "grad_norm": 1.6272627408819231, + "learning_rate": 6.896293988975297e-06, + "loss": 0.5442, + "step": 4058 + }, + { + "epoch": 0.395036496350365, + "grad_norm": 1.5163652637569427, + "learning_rate": 6.894835493080733e-06, + "loss": 0.4726, + "step": 4059 + }, + { + "epoch": 0.39513381995133817, + "grad_norm": 2.0784976022874897, + "learning_rate": 6.8933768088952025e-06, + "loss": 0.4263, + "step": 4060 + }, + { + "epoch": 0.3952311435523114, + "grad_norm": 1.7443301449253348, + "learning_rate": 6.8919179365636546e-06, + "loss": 0.3929, + "step": 4061 + }, + { + "epoch": 0.3953284671532847, + "grad_norm": 1.7924502086241416, + "learning_rate": 6.8904588762310586e-06, + "loss": 0.5579, + "step": 4062 + }, + { + "epoch": 0.39542579075425793, + "grad_norm": 1.0193876552673304, + "learning_rate": 6.888999628042401e-06, + "loss": 0.2416, + "step": 4063 + }, + { + "epoch": 0.39552311435523113, + "grad_norm": 1.341170474322962, + "learning_rate": 6.887540192142691e-06, + "loss": 0.3074, + "step": 4064 + }, + { + "epoch": 0.3956204379562044, + "grad_norm": 1.8133739469107961, + "learning_rate": 6.88608056867695e-06, + "loss": 0.6511, + "step": 4065 + }, + { + "epoch": 0.39571776155717764, + "grad_norm": 1.6797513727862947, + "learning_rate": 6.884620757790226e-06, + "loss": 0.5998, + "step": 4066 + }, + { + "epoch": 0.39581508515815084, + "grad_norm": 1.3474983454536655, + "learning_rate": 6.883160759627577e-06, + "loss": 0.4278, + "step": 4067 + }, + { + "epoch": 0.3959124087591241, + "grad_norm": 1.2209023274964024, + "learning_rate": 6.881700574334087e-06, + "loss": 0.2868, + "step": 4068 + }, + { + "epoch": 0.39600973236009734, + "grad_norm": 1.5229304325648796, + "learning_rate": 6.880240202054854e-06, + "loss": 0.4164, + "step": 4069 + }, + { + "epoch": 0.39610705596107054, + "grad_norm": 1.3625612735094477, + "learning_rate": 6.878779642934996e-06, + "loss": 0.3048, + "step": 4070 + }, + { + "epoch": 0.3962043795620438, + "grad_norm": 1.4190083205534945, + "learning_rate": 6.8773188971196515e-06, + "loss": 0.385, + "step": 4071 + }, + { + "epoch": 0.39630170316301705, + "grad_norm": 1.3525245197492086, + "learning_rate": 6.875857964753973e-06, + "loss": 0.3608, + "step": 4072 + }, + { + "epoch": 0.39639902676399025, + "grad_norm": 1.4522778082083179, + "learning_rate": 6.874396845983134e-06, + "loss": 0.5594, + "step": 4073 + }, + { + "epoch": 0.3964963503649635, + "grad_norm": 1.4455429305856882, + "learning_rate": 6.87293554095233e-06, + "loss": 0.4933, + "step": 4074 + }, + { + "epoch": 0.39659367396593675, + "grad_norm": 1.4604158337092326, + "learning_rate": 6.871474049806771e-06, + "loss": 0.5305, + "step": 4075 + }, + { + "epoch": 0.39669099756690995, + "grad_norm": 1.3489977365676973, + "learning_rate": 6.870012372691685e-06, + "loss": 0.4778, + "step": 4076 + }, + { + "epoch": 0.3967883211678832, + "grad_norm": 1.356695757623522, + "learning_rate": 6.86855050975232e-06, + "loss": 0.3774, + "step": 4077 + }, + { + "epoch": 0.39688564476885646, + "grad_norm": 1.1716870983796823, + "learning_rate": 6.867088461133941e-06, + "loss": 0.3492, + "step": 4078 + }, + { + "epoch": 0.39698296836982966, + "grad_norm": 1.478028392255781, + "learning_rate": 6.865626226981834e-06, + "loss": 0.4141, + "step": 4079 + }, + { + "epoch": 0.3970802919708029, + "grad_norm": 1.239501688643302, + "learning_rate": 6.864163807441304e-06, + "loss": 0.3388, + "step": 4080 + }, + { + "epoch": 0.39717761557177617, + "grad_norm": 1.5351372249288278, + "learning_rate": 6.86270120265767e-06, + "loss": 0.5215, + "step": 4081 + }, + { + "epoch": 0.3972749391727494, + "grad_norm": 1.382749480627249, + "learning_rate": 6.861238412776272e-06, + "loss": 0.5118, + "step": 4082 + }, + { + "epoch": 0.3973722627737226, + "grad_norm": 1.3595228443646563, + "learning_rate": 6.8597754379424695e-06, + "loss": 0.3972, + "step": 4083 + }, + { + "epoch": 0.39746958637469587, + "grad_norm": 1.9244235129217742, + "learning_rate": 6.858312278301638e-06, + "loss": 0.4423, + "step": 4084 + }, + { + "epoch": 0.3975669099756691, + "grad_norm": 1.4893887621731592, + "learning_rate": 6.856848933999174e-06, + "loss": 0.4281, + "step": 4085 + }, + { + "epoch": 0.3976642335766423, + "grad_norm": 1.159936646507266, + "learning_rate": 6.85538540518049e-06, + "loss": 0.2303, + "step": 4086 + }, + { + "epoch": 0.3977615571776156, + "grad_norm": 1.5864417866949991, + "learning_rate": 6.853921691991018e-06, + "loss": 0.301, + "step": 4087 + }, + { + "epoch": 0.39785888077858883, + "grad_norm": 1.2139649969393456, + "learning_rate": 6.852457794576207e-06, + "loss": 0.3066, + "step": 4088 + }, + { + "epoch": 0.39795620437956203, + "grad_norm": 1.4984653887972654, + "learning_rate": 6.850993713081527e-06, + "loss": 0.4157, + "step": 4089 + }, + { + "epoch": 0.3980535279805353, + "grad_norm": 1.3362834103353325, + "learning_rate": 6.8495294476524636e-06, + "loss": 0.2316, + "step": 4090 + }, + { + "epoch": 0.39815085158150854, + "grad_norm": 1.4160149515084606, + "learning_rate": 6.848064998434523e-06, + "loss": 0.4297, + "step": 4091 + }, + { + "epoch": 0.39824817518248173, + "grad_norm": 1.4266675728649458, + "learning_rate": 6.846600365573226e-06, + "loss": 0.3893, + "step": 4092 + }, + { + "epoch": 0.398345498783455, + "grad_norm": 1.4688794505110578, + "learning_rate": 6.845135549214117e-06, + "loss": 0.2136, + "step": 4093 + }, + { + "epoch": 0.39844282238442824, + "grad_norm": 1.3580044820287391, + "learning_rate": 6.843670549502755e-06, + "loss": 0.335, + "step": 4094 + }, + { + "epoch": 0.39854014598540144, + "grad_norm": 1.4677889500126287, + "learning_rate": 6.842205366584716e-06, + "loss": 0.1751, + "step": 4095 + }, + { + "epoch": 0.3986374695863747, + "grad_norm": 1.3678338838909818, + "learning_rate": 6.840740000605598e-06, + "loss": 0.4195, + "step": 4096 + }, + { + "epoch": 0.39873479318734795, + "grad_norm": 1.611968238148494, + "learning_rate": 6.8392744517110135e-06, + "loss": 0.4716, + "step": 4097 + }, + { + "epoch": 0.39883211678832114, + "grad_norm": 1.2798081635725236, + "learning_rate": 6.837808720046598e-06, + "loss": 0.2324, + "step": 4098 + }, + { + "epoch": 0.3989294403892944, + "grad_norm": 1.2301645538697144, + "learning_rate": 6.836342805758e-06, + "loss": 0.3178, + "step": 4099 + }, + { + "epoch": 0.39902676399026765, + "grad_norm": 1.3243492759809474, + "learning_rate": 6.834876708990887e-06, + "loss": 0.3202, + "step": 4100 + }, + { + "epoch": 0.39912408759124085, + "grad_norm": 1.6002987225145617, + "learning_rate": 6.833410429890948e-06, + "loss": 0.3685, + "step": 4101 + }, + { + "epoch": 0.3992214111922141, + "grad_norm": 1.3678479152711107, + "learning_rate": 6.8319439686038905e-06, + "loss": 0.4836, + "step": 4102 + }, + { + "epoch": 0.39931873479318736, + "grad_norm": 1.4215810283225752, + "learning_rate": 6.830477325275432e-06, + "loss": 0.433, + "step": 4103 + }, + { + "epoch": 0.3994160583941606, + "grad_norm": 1.2240968936921577, + "learning_rate": 6.829010500051319e-06, + "loss": 0.3181, + "step": 4104 + }, + { + "epoch": 0.3995133819951338, + "grad_norm": 1.3093225983032648, + "learning_rate": 6.8275434930773065e-06, + "loss": 0.3464, + "step": 4105 + }, + { + "epoch": 0.39961070559610706, + "grad_norm": 1.3776375009966235, + "learning_rate": 6.826076304499174e-06, + "loss": 0.4843, + "step": 4106 + }, + { + "epoch": 0.3997080291970803, + "grad_norm": 1.2816280179506763, + "learning_rate": 6.8246089344627174e-06, + "loss": 0.3877, + "step": 4107 + }, + { + "epoch": 0.3998053527980535, + "grad_norm": 1.739257469798359, + "learning_rate": 6.823141383113748e-06, + "loss": 0.5034, + "step": 4108 + }, + { + "epoch": 0.39990267639902677, + "grad_norm": 1.433615286766659, + "learning_rate": 6.8216736505981e-06, + "loss": 0.456, + "step": 4109 + }, + { + "epoch": 0.4, + "grad_norm": 1.352765027942841, + "learning_rate": 6.820205737061621e-06, + "loss": 0.4045, + "step": 4110 + }, + { + "epoch": 0.4000973236009732, + "grad_norm": 1.5427269854846495, + "learning_rate": 6.8187376426501795e-06, + "loss": 0.5184, + "step": 4111 + }, + { + "epoch": 0.4001946472019465, + "grad_norm": 1.3420595581039498, + "learning_rate": 6.81726936750966e-06, + "loss": 0.4387, + "step": 4112 + }, + { + "epoch": 0.4002919708029197, + "grad_norm": 1.7226489838946462, + "learning_rate": 6.815800911785968e-06, + "loss": 0.6075, + "step": 4113 + }, + { + "epoch": 0.4003892944038929, + "grad_norm": 1.4330949045341035, + "learning_rate": 6.814332275625024e-06, + "loss": 0.4566, + "step": 4114 + }, + { + "epoch": 0.4004866180048662, + "grad_norm": 1.4735337129897166, + "learning_rate": 6.812863459172765e-06, + "loss": 0.3747, + "step": 4115 + }, + { + "epoch": 0.40058394160583943, + "grad_norm": 1.1425351387480063, + "learning_rate": 6.811394462575149e-06, + "loss": 0.2628, + "step": 4116 + }, + { + "epoch": 0.40068126520681263, + "grad_norm": 1.43999339592267, + "learning_rate": 6.809925285978152e-06, + "loss": 0.5026, + "step": 4117 + }, + { + "epoch": 0.4007785888077859, + "grad_norm": 1.4172325219654425, + "learning_rate": 6.808455929527768e-06, + "loss": 0.4025, + "step": 4118 + }, + { + "epoch": 0.40087591240875914, + "grad_norm": 1.7910927659189848, + "learning_rate": 6.806986393370006e-06, + "loss": 0.6098, + "step": 4119 + }, + { + "epoch": 0.40097323600973234, + "grad_norm": 1.2075775786568461, + "learning_rate": 6.805516677650896e-06, + "loss": 0.3196, + "step": 4120 + }, + { + "epoch": 0.4010705596107056, + "grad_norm": 1.211944505922398, + "learning_rate": 6.804046782516483e-06, + "loss": 0.3276, + "step": 4121 + }, + { + "epoch": 0.40116788321167884, + "grad_norm": 1.4396684296663345, + "learning_rate": 6.802576708112834e-06, + "loss": 0.3646, + "step": 4122 + }, + { + "epoch": 0.40126520681265204, + "grad_norm": 1.3856408026331406, + "learning_rate": 6.801106454586028e-06, + "loss": 0.4237, + "step": 4123 + }, + { + "epoch": 0.4013625304136253, + "grad_norm": 1.3806721993456297, + "learning_rate": 6.799636022082168e-06, + "loss": 0.3891, + "step": 4124 + }, + { + "epoch": 0.40145985401459855, + "grad_norm": 2.5197331783786487, + "learning_rate": 6.79816541074737e-06, + "loss": 0.373, + "step": 4125 + }, + { + "epoch": 0.4015571776155718, + "grad_norm": 1.619394611459261, + "learning_rate": 6.796694620727768e-06, + "loss": 0.4296, + "step": 4126 + }, + { + "epoch": 0.401654501216545, + "grad_norm": 1.462883942898207, + "learning_rate": 6.795223652169519e-06, + "loss": 0.3997, + "step": 4127 + }, + { + "epoch": 0.40175182481751825, + "grad_norm": 1.5479315030125844, + "learning_rate": 6.793752505218791e-06, + "loss": 0.3695, + "step": 4128 + }, + { + "epoch": 0.4018491484184915, + "grad_norm": 1.6944874808399704, + "learning_rate": 6.792281180021776e-06, + "loss": 0.5353, + "step": 4129 + }, + { + "epoch": 0.4019464720194647, + "grad_norm": 1.4648916821584979, + "learning_rate": 6.790809676724677e-06, + "loss": 0.5375, + "step": 4130 + }, + { + "epoch": 0.40204379562043796, + "grad_norm": 1.9179370512353164, + "learning_rate": 6.7893379954737195e-06, + "loss": 0.5927, + "step": 4131 + }, + { + "epoch": 0.4021411192214112, + "grad_norm": 1.7096259956264923, + "learning_rate": 6.787866136415148e-06, + "loss": 0.2986, + "step": 4132 + }, + { + "epoch": 0.4022384428223844, + "grad_norm": 1.5884427732942605, + "learning_rate": 6.786394099695217e-06, + "loss": 0.3867, + "step": 4133 + }, + { + "epoch": 0.40233576642335767, + "grad_norm": 1.2109930210306181, + "learning_rate": 6.784921885460207e-06, + "loss": 0.2347, + "step": 4134 + }, + { + "epoch": 0.4024330900243309, + "grad_norm": 1.675584901630843, + "learning_rate": 6.783449493856412e-06, + "loss": 0.3507, + "step": 4135 + }, + { + "epoch": 0.4025304136253041, + "grad_norm": 0.9536842206516954, + "learning_rate": 6.781976925030145e-06, + "loss": 0.1722, + "step": 4136 + }, + { + "epoch": 0.40262773722627737, + "grad_norm": 1.3099520338173072, + "learning_rate": 6.780504179127735e-06, + "loss": 0.2402, + "step": 4137 + }, + { + "epoch": 0.4027250608272506, + "grad_norm": 1.3666225901140812, + "learning_rate": 6.779031256295532e-06, + "loss": 0.4447, + "step": 4138 + }, + { + "epoch": 0.4028223844282238, + "grad_norm": 1.3539060879909202, + "learning_rate": 6.777558156679898e-06, + "loss": 0.2859, + "step": 4139 + }, + { + "epoch": 0.4029197080291971, + "grad_norm": 1.3557388080611188, + "learning_rate": 6.7760848804272184e-06, + "loss": 0.4089, + "step": 4140 + }, + { + "epoch": 0.40301703163017033, + "grad_norm": 2.077118594049834, + "learning_rate": 6.774611427683891e-06, + "loss": 0.4151, + "step": 4141 + }, + { + "epoch": 0.40311435523114353, + "grad_norm": 1.5204722964451547, + "learning_rate": 6.773137798596336e-06, + "loss": 0.385, + "step": 4142 + }, + { + "epoch": 0.4032116788321168, + "grad_norm": 1.2910878309871092, + "learning_rate": 6.77166399331099e-06, + "loss": 0.3261, + "step": 4143 + }, + { + "epoch": 0.40330900243309004, + "grad_norm": 1.5435785947461726, + "learning_rate": 6.770190011974302e-06, + "loss": 0.487, + "step": 4144 + }, + { + "epoch": 0.40340632603406323, + "grad_norm": 1.4881633401455383, + "learning_rate": 6.768715854732745e-06, + "loss": 0.4287, + "step": 4145 + }, + { + "epoch": 0.4035036496350365, + "grad_norm": 1.7748194616136683, + "learning_rate": 6.767241521732806e-06, + "loss": 0.4205, + "step": 4146 + }, + { + "epoch": 0.40360097323600974, + "grad_norm": 2.512912685985909, + "learning_rate": 6.76576701312099e-06, + "loss": 0.466, + "step": 4147 + }, + { + "epoch": 0.403698296836983, + "grad_norm": 1.143470246335317, + "learning_rate": 6.7642923290438215e-06, + "loss": 0.2462, + "step": 4148 + }, + { + "epoch": 0.4037956204379562, + "grad_norm": 1.8432461324879108, + "learning_rate": 6.76281746964784e-06, + "loss": 0.3258, + "step": 4149 + }, + { + "epoch": 0.40389294403892945, + "grad_norm": 1.4157626221983473, + "learning_rate": 6.761342435079604e-06, + "loss": 0.3768, + "step": 4150 + }, + { + "epoch": 0.4039902676399027, + "grad_norm": 1.47964929936211, + "learning_rate": 6.7598672254856864e-06, + "loss": 0.3548, + "step": 4151 + }, + { + "epoch": 0.4040875912408759, + "grad_norm": 1.3968724705205968, + "learning_rate": 6.75839184101268e-06, + "loss": 0.4182, + "step": 4152 + }, + { + "epoch": 0.40418491484184915, + "grad_norm": 1.529068307372447, + "learning_rate": 6.7569162818071975e-06, + "loss": 0.3009, + "step": 4153 + }, + { + "epoch": 0.4042822384428224, + "grad_norm": 1.6397759556670952, + "learning_rate": 6.755440548015864e-06, + "loss": 0.4027, + "step": 4154 + }, + { + "epoch": 0.4043795620437956, + "grad_norm": 1.7545455752932866, + "learning_rate": 6.753964639785322e-06, + "loss": 0.4641, + "step": 4155 + }, + { + "epoch": 0.40447688564476886, + "grad_norm": 1.4464207704525447, + "learning_rate": 6.752488557262239e-06, + "loss": 0.4019, + "step": 4156 + }, + { + "epoch": 0.4045742092457421, + "grad_norm": 1.241485877391119, + "learning_rate": 6.7510123005932885e-06, + "loss": 0.38, + "step": 4157 + }, + { + "epoch": 0.4046715328467153, + "grad_norm": 1.5953709456211367, + "learning_rate": 6.74953586992517e-06, + "loss": 0.3787, + "step": 4158 + }, + { + "epoch": 0.40476885644768856, + "grad_norm": 1.5557586966812027, + "learning_rate": 6.748059265404598e-06, + "loss": 0.5311, + "step": 4159 + }, + { + "epoch": 0.4048661800486618, + "grad_norm": 1.3580123385569711, + "learning_rate": 6.746582487178299e-06, + "loss": 0.3035, + "step": 4160 + }, + { + "epoch": 0.404963503649635, + "grad_norm": 1.4942228805738602, + "learning_rate": 6.745105535393029e-06, + "loss": 0.6116, + "step": 4161 + }, + { + "epoch": 0.40506082725060827, + "grad_norm": 1.3580092661770409, + "learning_rate": 6.7436284101955465e-06, + "loss": 0.3137, + "step": 4162 + }, + { + "epoch": 0.4051581508515815, + "grad_norm": 1.8589871426774351, + "learning_rate": 6.7421511117326376e-06, + "loss": 0.6386, + "step": 4163 + }, + { + "epoch": 0.4052554744525547, + "grad_norm": 1.437956695986945, + "learning_rate": 6.740673640151102e-06, + "loss": 0.4982, + "step": 4164 + }, + { + "epoch": 0.405352798053528, + "grad_norm": 1.535934910815706, + "learning_rate": 6.739195995597757e-06, + "loss": 0.4673, + "step": 4165 + }, + { + "epoch": 0.4054501216545012, + "grad_norm": 1.5681489682180827, + "learning_rate": 6.737718178219437e-06, + "loss": 0.5436, + "step": 4166 + }, + { + "epoch": 0.4055474452554744, + "grad_norm": 1.6606402959323747, + "learning_rate": 6.736240188162995e-06, + "loss": 0.5384, + "step": 4167 + }, + { + "epoch": 0.4056447688564477, + "grad_norm": 1.4296428068919353, + "learning_rate": 6.7347620255752955e-06, + "loss": 0.5502, + "step": 4168 + }, + { + "epoch": 0.40574209245742093, + "grad_norm": 1.1693215589274866, + "learning_rate": 6.733283690603228e-06, + "loss": 0.3637, + "step": 4169 + }, + { + "epoch": 0.4058394160583942, + "grad_norm": 1.7804220182572144, + "learning_rate": 6.731805183393696e-06, + "loss": 0.3001, + "step": 4170 + }, + { + "epoch": 0.4059367396593674, + "grad_norm": 1.3982681939127966, + "learning_rate": 6.7303265040936185e-06, + "loss": 0.3245, + "step": 4171 + }, + { + "epoch": 0.40603406326034064, + "grad_norm": 1.2062326400377312, + "learning_rate": 6.728847652849933e-06, + "loss": 0.2714, + "step": 4172 + }, + { + "epoch": 0.4061313868613139, + "grad_norm": 1.4699173204694587, + "learning_rate": 6.727368629809592e-06, + "loss": 0.5233, + "step": 4173 + }, + { + "epoch": 0.4062287104622871, + "grad_norm": 1.9308275569367828, + "learning_rate": 6.725889435119568e-06, + "loss": 0.4341, + "step": 4174 + }, + { + "epoch": 0.40632603406326034, + "grad_norm": 1.2643530305470765, + "learning_rate": 6.724410068926852e-06, + "loss": 0.282, + "step": 4175 + }, + { + "epoch": 0.4064233576642336, + "grad_norm": 1.5309913572519684, + "learning_rate": 6.722930531378446e-06, + "loss": 0.3865, + "step": 4176 + }, + { + "epoch": 0.4065206812652068, + "grad_norm": 1.394521077436856, + "learning_rate": 6.721450822621376e-06, + "loss": 0.3337, + "step": 4177 + }, + { + "epoch": 0.40661800486618005, + "grad_norm": 1.445225810982358, + "learning_rate": 6.719970942802678e-06, + "loss": 0.3605, + "step": 4178 + }, + { + "epoch": 0.4067153284671533, + "grad_norm": 1.5344015892195602, + "learning_rate": 6.7184908920694115e-06, + "loss": 0.4307, + "step": 4179 + }, + { + "epoch": 0.4068126520681265, + "grad_norm": 1.396113291701313, + "learning_rate": 6.717010670568648e-06, + "loss": 0.444, + "step": 4180 + }, + { + "epoch": 0.40690997566909975, + "grad_norm": 1.4766415367973333, + "learning_rate": 6.715530278447479e-06, + "loss": 0.3435, + "step": 4181 + }, + { + "epoch": 0.407007299270073, + "grad_norm": 1.750623183326092, + "learning_rate": 6.714049715853012e-06, + "loss": 0.3844, + "step": 4182 + }, + { + "epoch": 0.4071046228710462, + "grad_norm": 1.4982933826375, + "learning_rate": 6.712568982932372e-06, + "loss": 0.3814, + "step": 4183 + }, + { + "epoch": 0.40720194647201946, + "grad_norm": 1.3051462794493567, + "learning_rate": 6.711088079832697e-06, + "loss": 0.346, + "step": 4184 + }, + { + "epoch": 0.4072992700729927, + "grad_norm": 1.6547203197592473, + "learning_rate": 6.709607006701149e-06, + "loss": 0.5131, + "step": 4185 + }, + { + "epoch": 0.4073965936739659, + "grad_norm": 1.5208934963641323, + "learning_rate": 6.708125763684903e-06, + "loss": 0.3869, + "step": 4186 + }, + { + "epoch": 0.40749391727493917, + "grad_norm": 1.3749250911768454, + "learning_rate": 6.706644350931149e-06, + "loss": 0.3839, + "step": 4187 + }, + { + "epoch": 0.4075912408759124, + "grad_norm": 1.0996995689031177, + "learning_rate": 6.7051627685870966e-06, + "loss": 0.2873, + "step": 4188 + }, + { + "epoch": 0.4076885644768856, + "grad_norm": 1.4604592586007714, + "learning_rate": 6.703681016799972e-06, + "loss": 0.4912, + "step": 4189 + }, + { + "epoch": 0.40778588807785887, + "grad_norm": 1.3866335130741587, + "learning_rate": 6.702199095717018e-06, + "loss": 0.5057, + "step": 4190 + }, + { + "epoch": 0.4078832116788321, + "grad_norm": 1.4173847603754959, + "learning_rate": 6.700717005485493e-06, + "loss": 0.4536, + "step": 4191 + }, + { + "epoch": 0.4079805352798054, + "grad_norm": 1.5420047114457367, + "learning_rate": 6.699234746252676e-06, + "loss": 0.616, + "step": 4192 + }, + { + "epoch": 0.4080778588807786, + "grad_norm": 1.3569785732132214, + "learning_rate": 6.697752318165855e-06, + "loss": 0.3401, + "step": 4193 + }, + { + "epoch": 0.40817518248175183, + "grad_norm": 1.1188023240624116, + "learning_rate": 6.696269721372344e-06, + "loss": 0.329, + "step": 4194 + }, + { + "epoch": 0.4082725060827251, + "grad_norm": 1.3197371002133358, + "learning_rate": 6.694786956019468e-06, + "loss": 0.4845, + "step": 4195 + }, + { + "epoch": 0.4083698296836983, + "grad_norm": 1.2069881394474367, + "learning_rate": 6.69330402225457e-06, + "loss": 0.3103, + "step": 4196 + }, + { + "epoch": 0.40846715328467154, + "grad_norm": 1.272617146059266, + "learning_rate": 6.691820920225011e-06, + "loss": 0.3242, + "step": 4197 + }, + { + "epoch": 0.4085644768856448, + "grad_norm": 1.2118257279828732, + "learning_rate": 6.690337650078167e-06, + "loss": 0.4317, + "step": 4198 + }, + { + "epoch": 0.408661800486618, + "grad_norm": 1.4997553625886875, + "learning_rate": 6.688854211961432e-06, + "loss": 0.5907, + "step": 4199 + }, + { + "epoch": 0.40875912408759124, + "grad_norm": 1.4858683252127711, + "learning_rate": 6.687370606022214e-06, + "loss": 0.5485, + "step": 4200 + }, + { + "epoch": 0.4088564476885645, + "grad_norm": 1.2447962687685288, + "learning_rate": 6.685886832407945e-06, + "loss": 0.3447, + "step": 4201 + }, + { + "epoch": 0.4089537712895377, + "grad_norm": 1.535995603376295, + "learning_rate": 6.684402891266063e-06, + "loss": 0.4063, + "step": 4202 + }, + { + "epoch": 0.40905109489051095, + "grad_norm": 1.5783775807307692, + "learning_rate": 6.682918782744033e-06, + "loss": 0.6425, + "step": 4203 + }, + { + "epoch": 0.4091484184914842, + "grad_norm": 1.4037569199049276, + "learning_rate": 6.681434506989327e-06, + "loss": 0.3833, + "step": 4204 + }, + { + "epoch": 0.4092457420924574, + "grad_norm": 1.402178826370091, + "learning_rate": 6.679950064149441e-06, + "loss": 0.3551, + "step": 4205 + }, + { + "epoch": 0.40934306569343065, + "grad_norm": 1.34468170512283, + "learning_rate": 6.678465454371883e-06, + "loss": 0.2863, + "step": 4206 + }, + { + "epoch": 0.4094403892944039, + "grad_norm": 1.482507916794428, + "learning_rate": 6.676980677804182e-06, + "loss": 0.5207, + "step": 4207 + }, + { + "epoch": 0.4095377128953771, + "grad_norm": 1.2804988426968886, + "learning_rate": 6.675495734593882e-06, + "loss": 0.3714, + "step": 4208 + }, + { + "epoch": 0.40963503649635036, + "grad_norm": 1.8393375161927517, + "learning_rate": 6.67401062488854e-06, + "loss": 0.3356, + "step": 4209 + }, + { + "epoch": 0.4097323600973236, + "grad_norm": 1.9062297382215951, + "learning_rate": 6.672525348835734e-06, + "loss": 0.4014, + "step": 4210 + }, + { + "epoch": 0.4098296836982968, + "grad_norm": 1.5134950596073753, + "learning_rate": 6.671039906583053e-06, + "loss": 0.5023, + "step": 4211 + }, + { + "epoch": 0.40992700729927006, + "grad_norm": 1.7782164726126763, + "learning_rate": 6.669554298278113e-06, + "loss": 0.464, + "step": 4212 + }, + { + "epoch": 0.4100243309002433, + "grad_norm": 1.682379122922247, + "learning_rate": 6.668068524068534e-06, + "loss": 0.4665, + "step": 4213 + }, + { + "epoch": 0.41012165450121657, + "grad_norm": 1.3325989707563795, + "learning_rate": 6.666582584101962e-06, + "loss": 0.465, + "step": 4214 + }, + { + "epoch": 0.41021897810218977, + "grad_norm": 1.2220874474854295, + "learning_rate": 6.665096478526054e-06, + "loss": 0.2988, + "step": 4215 + }, + { + "epoch": 0.410316301703163, + "grad_norm": 1.6164518148011633, + "learning_rate": 6.663610207488483e-06, + "loss": 0.5091, + "step": 4216 + }, + { + "epoch": 0.4104136253041363, + "grad_norm": 1.565691092613228, + "learning_rate": 6.662123771136946e-06, + "loss": 0.4084, + "step": 4217 + }, + { + "epoch": 0.4105109489051095, + "grad_norm": 1.5986808345528514, + "learning_rate": 6.660637169619147e-06, + "loss": 0.2829, + "step": 4218 + }, + { + "epoch": 0.4106082725060827, + "grad_norm": 1.8220294500874799, + "learning_rate": 6.659150403082812e-06, + "loss": 0.5608, + "step": 4219 + }, + { + "epoch": 0.410705596107056, + "grad_norm": 1.4769916213617307, + "learning_rate": 6.6576634716756815e-06, + "loss": 0.3977, + "step": 4220 + }, + { + "epoch": 0.4108029197080292, + "grad_norm": 1.4616686781315174, + "learning_rate": 6.656176375545513e-06, + "loss": 0.2517, + "step": 4221 + }, + { + "epoch": 0.41090024330900243, + "grad_norm": 1.1266940395897398, + "learning_rate": 6.654689114840081e-06, + "loss": 0.2244, + "step": 4222 + }, + { + "epoch": 0.4109975669099757, + "grad_norm": 1.4101438231198031, + "learning_rate": 6.653201689707174e-06, + "loss": 0.4818, + "step": 4223 + }, + { + "epoch": 0.4110948905109489, + "grad_norm": 1.296677172176164, + "learning_rate": 6.6517141002946e-06, + "loss": 0.3402, + "step": 4224 + }, + { + "epoch": 0.41119221411192214, + "grad_norm": 1.3840057914039408, + "learning_rate": 6.650226346750179e-06, + "loss": 0.3776, + "step": 4225 + }, + { + "epoch": 0.4112895377128954, + "grad_norm": 1.7375784492621011, + "learning_rate": 6.6487384292217515e-06, + "loss": 0.5091, + "step": 4226 + }, + { + "epoch": 0.4113868613138686, + "grad_norm": 1.317135934057917, + "learning_rate": 6.647250347857172e-06, + "loss": 0.3428, + "step": 4227 + }, + { + "epoch": 0.41148418491484184, + "grad_norm": 1.7823945099591982, + "learning_rate": 6.645762102804316e-06, + "loss": 0.5304, + "step": 4228 + }, + { + "epoch": 0.4115815085158151, + "grad_norm": 1.6357287779301097, + "learning_rate": 6.644273694211067e-06, + "loss": 0.3339, + "step": 4229 + }, + { + "epoch": 0.4116788321167883, + "grad_norm": 1.2562672855839114, + "learning_rate": 6.6427851222253304e-06, + "loss": 0.3303, + "step": 4230 + }, + { + "epoch": 0.41177615571776155, + "grad_norm": 1.5704338823918165, + "learning_rate": 6.641296386995025e-06, + "loss": 0.4193, + "step": 4231 + }, + { + "epoch": 0.4118734793187348, + "grad_norm": 1.5326708993630676, + "learning_rate": 6.639807488668091e-06, + "loss": 0.414, + "step": 4232 + }, + { + "epoch": 0.41197080291970806, + "grad_norm": 1.2172543567321743, + "learning_rate": 6.638318427392478e-06, + "loss": 0.1645, + "step": 4233 + }, + { + "epoch": 0.41206812652068125, + "grad_norm": 1.5519364904111816, + "learning_rate": 6.636829203316155e-06, + "loss": 0.3821, + "step": 4234 + }, + { + "epoch": 0.4121654501216545, + "grad_norm": 1.190874224833403, + "learning_rate": 6.635339816587109e-06, + "loss": 0.3556, + "step": 4235 + }, + { + "epoch": 0.41226277372262776, + "grad_norm": 1.1599374686849095, + "learning_rate": 6.63385026735334e-06, + "loss": 0.3749, + "step": 4236 + }, + { + "epoch": 0.41236009732360096, + "grad_norm": 1.7378771365759709, + "learning_rate": 6.632360555762865e-06, + "loss": 0.4853, + "step": 4237 + }, + { + "epoch": 0.4124574209245742, + "grad_norm": 1.6896892051906882, + "learning_rate": 6.6308706819637195e-06, + "loss": 0.3565, + "step": 4238 + }, + { + "epoch": 0.41255474452554747, + "grad_norm": 1.5371662211782342, + "learning_rate": 6.629380646103951e-06, + "loss": 0.5813, + "step": 4239 + }, + { + "epoch": 0.41265206812652067, + "grad_norm": 1.1350198221366297, + "learning_rate": 6.627890448331627e-06, + "loss": 0.2058, + "step": 4240 + }, + { + "epoch": 0.4127493917274939, + "grad_norm": 1.4439504442984183, + "learning_rate": 6.626400088794829e-06, + "loss": 0.5246, + "step": 4241 + }, + { + "epoch": 0.4128467153284672, + "grad_norm": 1.497251694970502, + "learning_rate": 6.624909567641653e-06, + "loss": 0.3753, + "step": 4242 + }, + { + "epoch": 0.41294403892944037, + "grad_norm": 2.013436245382801, + "learning_rate": 6.623418885020214e-06, + "loss": 0.3015, + "step": 4243 + }, + { + "epoch": 0.4130413625304136, + "grad_norm": 1.4477910414978084, + "learning_rate": 6.621928041078645e-06, + "loss": 0.3851, + "step": 4244 + }, + { + "epoch": 0.4131386861313869, + "grad_norm": 1.3762994174937473, + "learning_rate": 6.620437035965088e-06, + "loss": 0.3556, + "step": 4245 + }, + { + "epoch": 0.4132360097323601, + "grad_norm": 1.267972676100091, + "learning_rate": 6.618945869827708e-06, + "loss": 0.3547, + "step": 4246 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 1.428605801305056, + "learning_rate": 6.617454542814681e-06, + "loss": 0.352, + "step": 4247 + }, + { + "epoch": 0.4134306569343066, + "grad_norm": 2.0199889027885565, + "learning_rate": 6.615963055074202e-06, + "loss": 0.3482, + "step": 4248 + }, + { + "epoch": 0.4135279805352798, + "grad_norm": 1.585931486672713, + "learning_rate": 6.614471406754479e-06, + "loss": 0.4499, + "step": 4249 + }, + { + "epoch": 0.41362530413625304, + "grad_norm": 1.6316825936189212, + "learning_rate": 6.612979598003743e-06, + "loss": 0.4293, + "step": 4250 + }, + { + "epoch": 0.4137226277372263, + "grad_norm": 1.4668132105500566, + "learning_rate": 6.611487628970232e-06, + "loss": 0.3095, + "step": 4251 + }, + { + "epoch": 0.4138199513381995, + "grad_norm": 1.679886846024144, + "learning_rate": 6.609995499802204e-06, + "loss": 0.3704, + "step": 4252 + }, + { + "epoch": 0.41391727493917274, + "grad_norm": 1.558010365002375, + "learning_rate": 6.608503210647934e-06, + "loss": 0.456, + "step": 4253 + }, + { + "epoch": 0.414014598540146, + "grad_norm": 1.5296306221570222, + "learning_rate": 6.607010761655711e-06, + "loss": 0.3853, + "step": 4254 + }, + { + "epoch": 0.41411192214111925, + "grad_norm": 1.683084220519706, + "learning_rate": 6.605518152973842e-06, + "loss": 0.4359, + "step": 4255 + }, + { + "epoch": 0.41420924574209245, + "grad_norm": 1.3129724378896062, + "learning_rate": 6.604025384750646e-06, + "loss": 0.4273, + "step": 4256 + }, + { + "epoch": 0.4143065693430657, + "grad_norm": 1.4342940696208062, + "learning_rate": 6.602532457134463e-06, + "loss": 0.4389, + "step": 4257 + }, + { + "epoch": 0.41440389294403895, + "grad_norm": 3.280891471530478, + "learning_rate": 6.6010393702736444e-06, + "loss": 0.2364, + "step": 4258 + }, + { + "epoch": 0.41450121654501215, + "grad_norm": 1.4615220313171857, + "learning_rate": 6.599546124316558e-06, + "loss": 0.3685, + "step": 4259 + }, + { + "epoch": 0.4145985401459854, + "grad_norm": 1.2574907922123244, + "learning_rate": 6.598052719411592e-06, + "loss": 0.3912, + "step": 4260 + }, + { + "epoch": 0.41469586374695866, + "grad_norm": 1.5069809434087071, + "learning_rate": 6.596559155707144e-06, + "loss": 0.372, + "step": 4261 + }, + { + "epoch": 0.41479318734793186, + "grad_norm": 1.4890510390522098, + "learning_rate": 6.595065433351631e-06, + "loss": 0.3621, + "step": 4262 + }, + { + "epoch": 0.4148905109489051, + "grad_norm": 2.49061644970277, + "learning_rate": 6.5935715524934865e-06, + "loss": 0.3959, + "step": 4263 + }, + { + "epoch": 0.41498783454987836, + "grad_norm": 1.3665559018226794, + "learning_rate": 6.5920775132811565e-06, + "loss": 0.3345, + "step": 4264 + }, + { + "epoch": 0.41508515815085156, + "grad_norm": 1.7454356205611485, + "learning_rate": 6.590583315863106e-06, + "loss": 0.5648, + "step": 4265 + }, + { + "epoch": 0.4151824817518248, + "grad_norm": 1.1846482014659763, + "learning_rate": 6.589088960387814e-06, + "loss": 0.2959, + "step": 4266 + }, + { + "epoch": 0.41527980535279807, + "grad_norm": 1.403873385053726, + "learning_rate": 6.5875944470037745e-06, + "loss": 0.3663, + "step": 4267 + }, + { + "epoch": 0.41537712895377127, + "grad_norm": 1.3346226339580947, + "learning_rate": 6.5860997758595005e-06, + "loss": 0.3677, + "step": 4268 + }, + { + "epoch": 0.4154744525547445, + "grad_norm": 1.6581717803832259, + "learning_rate": 6.584604947103515e-06, + "loss": 0.4299, + "step": 4269 + }, + { + "epoch": 0.4155717761557178, + "grad_norm": 1.6137507681397871, + "learning_rate": 6.583109960884362e-06, + "loss": 0.4912, + "step": 4270 + }, + { + "epoch": 0.415669099756691, + "grad_norm": 1.542312984907508, + "learning_rate": 6.5816148173506e-06, + "loss": 0.638, + "step": 4271 + }, + { + "epoch": 0.4157664233576642, + "grad_norm": 1.4644079755455965, + "learning_rate": 6.5801195166508e-06, + "loss": 0.2901, + "step": 4272 + }, + { + "epoch": 0.4158637469586375, + "grad_norm": 2.0993461408388985, + "learning_rate": 6.578624058933555e-06, + "loss": 0.3534, + "step": 4273 + }, + { + "epoch": 0.4159610705596107, + "grad_norm": 1.529766064380746, + "learning_rate": 6.577128444347465e-06, + "loss": 0.3926, + "step": 4274 + }, + { + "epoch": 0.41605839416058393, + "grad_norm": 1.3503570473016178, + "learning_rate": 6.575632673041153e-06, + "loss": 0.3662, + "step": 4275 + }, + { + "epoch": 0.4161557177615572, + "grad_norm": 1.7770233200809984, + "learning_rate": 6.574136745163253e-06, + "loss": 0.4091, + "step": 4276 + }, + { + "epoch": 0.41625304136253044, + "grad_norm": 1.3168899598288053, + "learning_rate": 6.5726406608624185e-06, + "loss": 0.2383, + "step": 4277 + }, + { + "epoch": 0.41635036496350364, + "grad_norm": 1.4856806584236355, + "learning_rate": 6.571144420287314e-06, + "loss": 0.4036, + "step": 4278 + }, + { + "epoch": 0.4164476885644769, + "grad_norm": 1.4152039194265187, + "learning_rate": 6.569648023586624e-06, + "loss": 0.3096, + "step": 4279 + }, + { + "epoch": 0.41654501216545015, + "grad_norm": 1.4017120468664837, + "learning_rate": 6.568151470909042e-06, + "loss": 0.4778, + "step": 4280 + }, + { + "epoch": 0.41664233576642334, + "grad_norm": 1.1741744814184691, + "learning_rate": 6.566654762403286e-06, + "loss": 0.3247, + "step": 4281 + }, + { + "epoch": 0.4167396593673966, + "grad_norm": 1.4044574757131045, + "learning_rate": 6.5651578982180845e-06, + "loss": 0.3485, + "step": 4282 + }, + { + "epoch": 0.41683698296836985, + "grad_norm": 1.605059643525527, + "learning_rate": 6.56366087850218e-06, + "loss": 0.5178, + "step": 4283 + }, + { + "epoch": 0.41693430656934305, + "grad_norm": 1.6798776918121874, + "learning_rate": 6.562163703404333e-06, + "loss": 0.4953, + "step": 4284 + }, + { + "epoch": 0.4170316301703163, + "grad_norm": 1.3694256582426696, + "learning_rate": 6.560666373073317e-06, + "loss": 0.3987, + "step": 4285 + }, + { + "epoch": 0.41712895377128956, + "grad_norm": 1.6897161554637383, + "learning_rate": 6.559168887657926e-06, + "loss": 0.5096, + "step": 4286 + }, + { + "epoch": 0.41722627737226275, + "grad_norm": 1.3446734604223378, + "learning_rate": 6.557671247306965e-06, + "loss": 0.3865, + "step": 4287 + }, + { + "epoch": 0.417323600973236, + "grad_norm": 1.1928604006824761, + "learning_rate": 6.556173452169252e-06, + "loss": 0.2174, + "step": 4288 + }, + { + "epoch": 0.41742092457420926, + "grad_norm": 1.2595066957738728, + "learning_rate": 6.554675502393629e-06, + "loss": 0.2598, + "step": 4289 + }, + { + "epoch": 0.41751824817518246, + "grad_norm": 1.3670007196632394, + "learning_rate": 6.5531773981289436e-06, + "loss": 0.3852, + "step": 4290 + }, + { + "epoch": 0.4176155717761557, + "grad_norm": 1.2671207019965953, + "learning_rate": 6.551679139524068e-06, + "loss": 0.3286, + "step": 4291 + }, + { + "epoch": 0.41771289537712897, + "grad_norm": 3.291438678298298, + "learning_rate": 6.55018072672788e-06, + "loss": 0.341, + "step": 4292 + }, + { + "epoch": 0.41781021897810217, + "grad_norm": 1.1986094365133406, + "learning_rate": 6.548682159889284e-06, + "loss": 0.3322, + "step": 4293 + }, + { + "epoch": 0.4179075425790754, + "grad_norm": 1.7308394678650263, + "learning_rate": 6.547183439157187e-06, + "loss": 0.6067, + "step": 4294 + }, + { + "epoch": 0.4180048661800487, + "grad_norm": 1.3996236022686042, + "learning_rate": 6.54568456468052e-06, + "loss": 0.47, + "step": 4295 + }, + { + "epoch": 0.41810218978102187, + "grad_norm": 1.33827157270381, + "learning_rate": 6.54418553660823e-06, + "loss": 0.3526, + "step": 4296 + }, + { + "epoch": 0.4181995133819951, + "grad_norm": 1.0611044750927061, + "learning_rate": 6.542686355089273e-06, + "loss": 0.2076, + "step": 4297 + }, + { + "epoch": 0.4182968369829684, + "grad_norm": 1.483729653865723, + "learning_rate": 6.541187020272624e-06, + "loss": 0.4021, + "step": 4298 + }, + { + "epoch": 0.41839416058394163, + "grad_norm": 1.4550107799537069, + "learning_rate": 6.539687532307275e-06, + "loss": 0.5588, + "step": 4299 + }, + { + "epoch": 0.41849148418491483, + "grad_norm": 1.20942672500593, + "learning_rate": 6.538187891342228e-06, + "loss": 0.3367, + "step": 4300 + }, + { + "epoch": 0.4185888077858881, + "grad_norm": 1.3240648028254882, + "learning_rate": 6.5366880975265055e-06, + "loss": 0.341, + "step": 4301 + }, + { + "epoch": 0.41868613138686134, + "grad_norm": 1.805264599497935, + "learning_rate": 6.535188151009143e-06, + "loss": 0.2554, + "step": 4302 + }, + { + "epoch": 0.41878345498783454, + "grad_norm": 1.1279014154499223, + "learning_rate": 6.53368805193919e-06, + "loss": 0.1854, + "step": 4303 + }, + { + "epoch": 0.4188807785888078, + "grad_norm": 1.5248162706105455, + "learning_rate": 6.532187800465713e-06, + "loss": 0.5458, + "step": 4304 + }, + { + "epoch": 0.41897810218978104, + "grad_norm": 0.9949928333273576, + "learning_rate": 6.5306873967377916e-06, + "loss": 0.2022, + "step": 4305 + }, + { + "epoch": 0.41907542579075424, + "grad_norm": 1.3149218240859954, + "learning_rate": 6.5291868409045226e-06, + "loss": 0.3624, + "step": 4306 + }, + { + "epoch": 0.4191727493917275, + "grad_norm": 1.5395304167054258, + "learning_rate": 6.5276861331150175e-06, + "loss": 0.4642, + "step": 4307 + }, + { + "epoch": 0.41927007299270075, + "grad_norm": 1.2063026080741097, + "learning_rate": 6.526185273518402e-06, + "loss": 0.2982, + "step": 4308 + }, + { + "epoch": 0.41936739659367395, + "grad_norm": 1.4265007816263084, + "learning_rate": 6.52468426226382e-06, + "loss": 0.3693, + "step": 4309 + }, + { + "epoch": 0.4194647201946472, + "grad_norm": 1.6530646156286823, + "learning_rate": 6.523183099500423e-06, + "loss": 0.3381, + "step": 4310 + }, + { + "epoch": 0.41956204379562045, + "grad_norm": 1.222949008199894, + "learning_rate": 6.521681785377386e-06, + "loss": 0.2989, + "step": 4311 + }, + { + "epoch": 0.41965936739659365, + "grad_norm": 1.4404214942828424, + "learning_rate": 6.520180320043894e-06, + "loss": 0.5608, + "step": 4312 + }, + { + "epoch": 0.4197566909975669, + "grad_norm": 1.4377809332947757, + "learning_rate": 6.51867870364915e-06, + "loss": 0.3673, + "step": 4313 + }, + { + "epoch": 0.41985401459854016, + "grad_norm": 1.0479588946915206, + "learning_rate": 6.517176936342372e-06, + "loss": 0.1978, + "step": 4314 + }, + { + "epoch": 0.41995133819951336, + "grad_norm": 1.2313516991145266, + "learning_rate": 6.515675018272787e-06, + "loss": 0.2914, + "step": 4315 + }, + { + "epoch": 0.4200486618004866, + "grad_norm": 1.1594822014038484, + "learning_rate": 6.514172949589644e-06, + "loss": 0.3001, + "step": 4316 + }, + { + "epoch": 0.42014598540145986, + "grad_norm": 1.2029717087625669, + "learning_rate": 6.5126707304422035e-06, + "loss": 0.3405, + "step": 4317 + }, + { + "epoch": 0.42024330900243306, + "grad_norm": 1.16346382260932, + "learning_rate": 6.5111683609797435e-06, + "loss": 0.2537, + "step": 4318 + }, + { + "epoch": 0.4203406326034063, + "grad_norm": 1.4210857674247095, + "learning_rate": 6.509665841351555e-06, + "loss": 0.5547, + "step": 4319 + }, + { + "epoch": 0.42043795620437957, + "grad_norm": 1.5171779266903973, + "learning_rate": 6.508163171706944e-06, + "loss": 0.5007, + "step": 4320 + }, + { + "epoch": 0.4205352798053528, + "grad_norm": 1.3552902060116079, + "learning_rate": 6.506660352195231e-06, + "loss": 0.4022, + "step": 4321 + }, + { + "epoch": 0.420632603406326, + "grad_norm": 1.145145186502107, + "learning_rate": 6.505157382965752e-06, + "loss": 0.3137, + "step": 4322 + }, + { + "epoch": 0.4207299270072993, + "grad_norm": 1.5286464745316035, + "learning_rate": 6.503654264167861e-06, + "loss": 0.5978, + "step": 4323 + }, + { + "epoch": 0.42082725060827253, + "grad_norm": 1.426417712983417, + "learning_rate": 6.50215099595092e-06, + "loss": 0.3787, + "step": 4324 + }, + { + "epoch": 0.4209245742092457, + "grad_norm": 1.2648697000425113, + "learning_rate": 6.500647578464312e-06, + "loss": 0.3706, + "step": 4325 + }, + { + "epoch": 0.421021897810219, + "grad_norm": 1.3051542258204123, + "learning_rate": 6.499144011857431e-06, + "loss": 0.3678, + "step": 4326 + }, + { + "epoch": 0.42111922141119223, + "grad_norm": 1.3581541871587341, + "learning_rate": 6.497640296279688e-06, + "loss": 0.3831, + "step": 4327 + }, + { + "epoch": 0.42121654501216543, + "grad_norm": 1.1964601480022, + "learning_rate": 6.496136431880509e-06, + "loss": 0.3243, + "step": 4328 + }, + { + "epoch": 0.4213138686131387, + "grad_norm": 1.339441934284622, + "learning_rate": 6.4946324188093325e-06, + "loss": 0.3394, + "step": 4329 + }, + { + "epoch": 0.42141119221411194, + "grad_norm": 1.302596375163602, + "learning_rate": 6.493128257215614e-06, + "loss": 0.3479, + "step": 4330 + }, + { + "epoch": 0.42150851581508514, + "grad_norm": 1.3886290381325908, + "learning_rate": 6.491623947248824e-06, + "loss": 0.5067, + "step": 4331 + }, + { + "epoch": 0.4216058394160584, + "grad_norm": 1.2676024824015228, + "learning_rate": 6.490119489058444e-06, + "loss": 0.3229, + "step": 4332 + }, + { + "epoch": 0.42170316301703165, + "grad_norm": 1.2119833091115328, + "learning_rate": 6.488614882793974e-06, + "loss": 0.3683, + "step": 4333 + }, + { + "epoch": 0.42180048661800484, + "grad_norm": 1.4227111370092373, + "learning_rate": 6.48711012860493e-06, + "loss": 0.3916, + "step": 4334 + }, + { + "epoch": 0.4218978102189781, + "grad_norm": 1.1478638546259368, + "learning_rate": 6.4856052266408375e-06, + "loss": 0.271, + "step": 4335 + }, + { + "epoch": 0.42199513381995135, + "grad_norm": 1.5635814738770923, + "learning_rate": 6.484100177051242e-06, + "loss": 0.4534, + "step": 4336 + }, + { + "epoch": 0.42209245742092455, + "grad_norm": 1.3956104024626772, + "learning_rate": 6.4825949799856966e-06, + "loss": 0.4618, + "step": 4337 + }, + { + "epoch": 0.4221897810218978, + "grad_norm": 1.4196881255329625, + "learning_rate": 6.481089635593778e-06, + "loss": 0.4528, + "step": 4338 + }, + { + "epoch": 0.42228710462287106, + "grad_norm": 1.400784106020295, + "learning_rate": 6.479584144025073e-06, + "loss": 0.5102, + "step": 4339 + }, + { + "epoch": 0.42238442822384425, + "grad_norm": 2.468078425362333, + "learning_rate": 6.4780785054291816e-06, + "loss": 0.5655, + "step": 4340 + }, + { + "epoch": 0.4224817518248175, + "grad_norm": 1.6227598889110846, + "learning_rate": 6.476572719955721e-06, + "loss": 0.4855, + "step": 4341 + }, + { + "epoch": 0.42257907542579076, + "grad_norm": 1.306678121396544, + "learning_rate": 6.475066787754322e-06, + "loss": 0.4072, + "step": 4342 + }, + { + "epoch": 0.422676399026764, + "grad_norm": 1.37061329653536, + "learning_rate": 6.473560708974628e-06, + "loss": 0.4673, + "step": 4343 + }, + { + "epoch": 0.4227737226277372, + "grad_norm": 1.3905380736196298, + "learning_rate": 6.472054483766301e-06, + "loss": 0.2885, + "step": 4344 + }, + { + "epoch": 0.42287104622871047, + "grad_norm": 4.801162046596706, + "learning_rate": 6.470548112279016e-06, + "loss": 0.5519, + "step": 4345 + }, + { + "epoch": 0.4229683698296837, + "grad_norm": 1.3633612579215535, + "learning_rate": 6.46904159466246e-06, + "loss": 0.4516, + "step": 4346 + }, + { + "epoch": 0.4230656934306569, + "grad_norm": 1.4357055576360107, + "learning_rate": 6.4675349310663406e-06, + "loss": 0.4457, + "step": 4347 + }, + { + "epoch": 0.4231630170316302, + "grad_norm": 1.6129882398627167, + "learning_rate": 6.466028121640371e-06, + "loss": 0.4231, + "step": 4348 + }, + { + "epoch": 0.4232603406326034, + "grad_norm": 1.3445730128609834, + "learning_rate": 6.464521166534285e-06, + "loss": 0.235, + "step": 4349 + }, + { + "epoch": 0.4233576642335766, + "grad_norm": 1.3167614556649498, + "learning_rate": 6.4630140658978315e-06, + "loss": 0.4414, + "step": 4350 + }, + { + "epoch": 0.4234549878345499, + "grad_norm": 1.4125450802251278, + "learning_rate": 6.461506819880772e-06, + "loss": 0.4684, + "step": 4351 + }, + { + "epoch": 0.42355231143552313, + "grad_norm": 1.4953055035817089, + "learning_rate": 6.459999428632882e-06, + "loss": 0.4405, + "step": 4352 + }, + { + "epoch": 0.42364963503649633, + "grad_norm": 1.3815431102607814, + "learning_rate": 6.458491892303948e-06, + "loss": 0.3887, + "step": 4353 + }, + { + "epoch": 0.4237469586374696, + "grad_norm": 1.4957571422897962, + "learning_rate": 6.4569842110437795e-06, + "loss": 0.5109, + "step": 4354 + }, + { + "epoch": 0.42384428223844284, + "grad_norm": 1.4325727522185985, + "learning_rate": 6.455476385002195e-06, + "loss": 0.3701, + "step": 4355 + }, + { + "epoch": 0.42394160583941604, + "grad_norm": 1.3057908743402888, + "learning_rate": 6.453968414329029e-06, + "loss": 0.3755, + "step": 4356 + }, + { + "epoch": 0.4240389294403893, + "grad_norm": 1.1674257374901074, + "learning_rate": 6.452460299174126e-06, + "loss": 0.3337, + "step": 4357 + }, + { + "epoch": 0.42413625304136254, + "grad_norm": 1.5147142328351313, + "learning_rate": 6.450952039687352e-06, + "loss": 0.2443, + "step": 4358 + }, + { + "epoch": 0.42423357664233574, + "grad_norm": 1.5905612385678973, + "learning_rate": 6.449443636018579e-06, + "loss": 0.3695, + "step": 4359 + }, + { + "epoch": 0.424330900243309, + "grad_norm": 1.3783491268121844, + "learning_rate": 6.447935088317704e-06, + "loss": 0.4123, + "step": 4360 + }, + { + "epoch": 0.42442822384428225, + "grad_norm": 1.6061178783894494, + "learning_rate": 6.4464263967346286e-06, + "loss": 0.5431, + "step": 4361 + }, + { + "epoch": 0.42452554744525545, + "grad_norm": 1.4425269424857767, + "learning_rate": 6.444917561419272e-06, + "loss": 0.4448, + "step": 4362 + }, + { + "epoch": 0.4246228710462287, + "grad_norm": 1.6012552106118871, + "learning_rate": 6.443408582521571e-06, + "loss": 0.4083, + "step": 4363 + }, + { + "epoch": 0.42472019464720195, + "grad_norm": 1.0459143942818723, + "learning_rate": 6.4418994601914695e-06, + "loss": 0.1693, + "step": 4364 + }, + { + "epoch": 0.4248175182481752, + "grad_norm": 1.404870861675631, + "learning_rate": 6.4403901945789335e-06, + "loss": 0.2467, + "step": 4365 + }, + { + "epoch": 0.4249148418491484, + "grad_norm": 1.6503783456921153, + "learning_rate": 6.438880785833938e-06, + "loss": 0.6503, + "step": 4366 + }, + { + "epoch": 0.42501216545012166, + "grad_norm": 1.4886065750794846, + "learning_rate": 6.437371234106476e-06, + "loss": 0.3345, + "step": 4367 + }, + { + "epoch": 0.4251094890510949, + "grad_norm": 1.5814934463968207, + "learning_rate": 6.4358615395465506e-06, + "loss": 0.5983, + "step": 4368 + }, + { + "epoch": 0.4252068126520681, + "grad_norm": 1.5092673127042504, + "learning_rate": 6.43435170230418e-06, + "loss": 0.3829, + "step": 4369 + }, + { + "epoch": 0.42530413625304136, + "grad_norm": 1.3432402420129697, + "learning_rate": 6.4328417225294015e-06, + "loss": 0.4439, + "step": 4370 + }, + { + "epoch": 0.4254014598540146, + "grad_norm": 1.623766357480442, + "learning_rate": 6.431331600372259e-06, + "loss": 0.4843, + "step": 4371 + }, + { + "epoch": 0.4254987834549878, + "grad_norm": 1.7930756580668195, + "learning_rate": 6.4298213359828155e-06, + "loss": 0.3407, + "step": 4372 + }, + { + "epoch": 0.42559610705596107, + "grad_norm": 1.2559785919794741, + "learning_rate": 6.428310929511146e-06, + "loss": 0.2498, + "step": 4373 + }, + { + "epoch": 0.4256934306569343, + "grad_norm": 1.5454160958367298, + "learning_rate": 6.426800381107343e-06, + "loss": 0.4964, + "step": 4374 + }, + { + "epoch": 0.4257907542579075, + "grad_norm": 1.2723938579546594, + "learning_rate": 6.425289690921509e-06, + "loss": 0.2513, + "step": 4375 + }, + { + "epoch": 0.4258880778588808, + "grad_norm": 1.277636479890389, + "learning_rate": 6.423778859103762e-06, + "loss": 0.2989, + "step": 4376 + }, + { + "epoch": 0.42598540145985403, + "grad_norm": 1.4447771060194305, + "learning_rate": 6.4222678858042355e-06, + "loss": 0.5214, + "step": 4377 + }, + { + "epoch": 0.4260827250608272, + "grad_norm": 1.870842966622725, + "learning_rate": 6.420756771173075e-06, + "loss": 0.284, + "step": 4378 + }, + { + "epoch": 0.4261800486618005, + "grad_norm": 1.941139248883791, + "learning_rate": 6.419245515360441e-06, + "loss": 0.4939, + "step": 4379 + }, + { + "epoch": 0.42627737226277373, + "grad_norm": 1.4323796008311047, + "learning_rate": 6.4177341185165045e-06, + "loss": 0.4712, + "step": 4380 + }, + { + "epoch": 0.42637469586374693, + "grad_norm": 1.495438155711353, + "learning_rate": 6.41622258079146e-06, + "loss": 0.3948, + "step": 4381 + }, + { + "epoch": 0.4264720194647202, + "grad_norm": 1.275165719126958, + "learning_rate": 6.414710902335507e-06, + "loss": 0.3273, + "step": 4382 + }, + { + "epoch": 0.42656934306569344, + "grad_norm": 1.1470699861754334, + "learning_rate": 6.413199083298862e-06, + "loss": 0.2802, + "step": 4383 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 1.609847804866654, + "learning_rate": 6.411687123831756e-06, + "loss": 0.4448, + "step": 4384 + }, + { + "epoch": 0.4267639902676399, + "grad_norm": 1.1993407147045914, + "learning_rate": 6.4101750240844315e-06, + "loss": 0.2549, + "step": 4385 + }, + { + "epoch": 0.42686131386861315, + "grad_norm": 1.2585478345095418, + "learning_rate": 6.408662784207149e-06, + "loss": 0.4034, + "step": 4386 + }, + { + "epoch": 0.4269586374695864, + "grad_norm": 1.561278514250439, + "learning_rate": 6.40715040435018e-06, + "loss": 0.325, + "step": 4387 + }, + { + "epoch": 0.4270559610705596, + "grad_norm": 1.328127199058956, + "learning_rate": 6.40563788466381e-06, + "loss": 0.4286, + "step": 4388 + }, + { + "epoch": 0.42715328467153285, + "grad_norm": 1.3392689531276551, + "learning_rate": 6.40412522529834e-06, + "loss": 0.2742, + "step": 4389 + }, + { + "epoch": 0.4272506082725061, + "grad_norm": 1.5915708416492296, + "learning_rate": 6.4026124264040824e-06, + "loss": 0.4089, + "step": 4390 + }, + { + "epoch": 0.4273479318734793, + "grad_norm": 1.5011107782482271, + "learning_rate": 6.401099488131366e-06, + "loss": 0.4322, + "step": 4391 + }, + { + "epoch": 0.42744525547445256, + "grad_norm": 1.5212139628772348, + "learning_rate": 6.399586410630533e-06, + "loss": 0.4463, + "step": 4392 + }, + { + "epoch": 0.4275425790754258, + "grad_norm": 1.1681823535144769, + "learning_rate": 6.398073194051937e-06, + "loss": 0.2473, + "step": 4393 + }, + { + "epoch": 0.427639902676399, + "grad_norm": 1.587136703131999, + "learning_rate": 6.396559838545949e-06, + "loss": 0.3338, + "step": 4394 + }, + { + "epoch": 0.42773722627737226, + "grad_norm": 1.493821772044209, + "learning_rate": 6.395046344262951e-06, + "loss": 0.4352, + "step": 4395 + }, + { + "epoch": 0.4278345498783455, + "grad_norm": 1.3434445677218851, + "learning_rate": 6.393532711353341e-06, + "loss": 0.3475, + "step": 4396 + }, + { + "epoch": 0.4279318734793187, + "grad_norm": 1.135742712307757, + "learning_rate": 6.3920189399675295e-06, + "loss": 0.2778, + "step": 4397 + }, + { + "epoch": 0.42802919708029197, + "grad_norm": 1.582300911985823, + "learning_rate": 6.390505030255939e-06, + "loss": 0.4054, + "step": 4398 + }, + { + "epoch": 0.4281265206812652, + "grad_norm": 1.4579512322620043, + "learning_rate": 6.38899098236901e-06, + "loss": 0.355, + "step": 4399 + }, + { + "epoch": 0.4282238442822384, + "grad_norm": 1.3073200004921628, + "learning_rate": 6.387476796457192e-06, + "loss": 0.4277, + "step": 4400 + }, + { + "epoch": 0.4283211678832117, + "grad_norm": 1.52175732418107, + "learning_rate": 6.385962472670953e-06, + "loss": 0.411, + "step": 4401 + }, + { + "epoch": 0.4284184914841849, + "grad_norm": 1.9451581901413757, + "learning_rate": 6.384448011160771e-06, + "loss": 0.3991, + "step": 4402 + }, + { + "epoch": 0.4285158150851581, + "grad_norm": 1.4320770472785467, + "learning_rate": 6.38293341207714e-06, + "loss": 0.5328, + "step": 4403 + }, + { + "epoch": 0.4286131386861314, + "grad_norm": 1.4130794838716443, + "learning_rate": 6.3814186755705645e-06, + "loss": 0.3512, + "step": 4404 + }, + { + "epoch": 0.42871046228710463, + "grad_norm": 1.3562391219164305, + "learning_rate": 6.379903801791567e-06, + "loss": 0.428, + "step": 4405 + }, + { + "epoch": 0.4288077858880779, + "grad_norm": 1.2879718212279418, + "learning_rate": 6.3783887908906805e-06, + "loss": 0.369, + "step": 4406 + }, + { + "epoch": 0.4289051094890511, + "grad_norm": 1.775559017643254, + "learning_rate": 6.376873643018452e-06, + "loss": 0.5678, + "step": 4407 + }, + { + "epoch": 0.42900243309002434, + "grad_norm": 1.6372660811981936, + "learning_rate": 6.375358358325444e-06, + "loss": 0.6129, + "step": 4408 + }, + { + "epoch": 0.4290997566909976, + "grad_norm": 1.565161231635583, + "learning_rate": 6.37384293696223e-06, + "loss": 0.5301, + "step": 4409 + }, + { + "epoch": 0.4291970802919708, + "grad_norm": 1.3083780139331866, + "learning_rate": 6.3723273790793995e-06, + "loss": 0.3379, + "step": 4410 + }, + { + "epoch": 0.42929440389294404, + "grad_norm": 1.7242515598869155, + "learning_rate": 6.370811684827553e-06, + "loss": 0.436, + "step": 4411 + }, + { + "epoch": 0.4293917274939173, + "grad_norm": 1.2145655273563092, + "learning_rate": 6.369295854357307e-06, + "loss": 0.2734, + "step": 4412 + }, + { + "epoch": 0.4294890510948905, + "grad_norm": 1.4312853613070817, + "learning_rate": 6.36777988781929e-06, + "loss": 0.2536, + "step": 4413 + }, + { + "epoch": 0.42958637469586375, + "grad_norm": 1.3921604894550261, + "learning_rate": 6.366263785364146e-06, + "loss": 0.4233, + "step": 4414 + }, + { + "epoch": 0.429683698296837, + "grad_norm": 1.550203845248266, + "learning_rate": 6.36474754714253e-06, + "loss": 0.4369, + "step": 4415 + }, + { + "epoch": 0.4297810218978102, + "grad_norm": 1.5134552831430093, + "learning_rate": 6.363231173305111e-06, + "loss": 0.2749, + "step": 4416 + }, + { + "epoch": 0.42987834549878345, + "grad_norm": 1.4415675127333825, + "learning_rate": 6.361714664002572e-06, + "loss": 0.2797, + "step": 4417 + }, + { + "epoch": 0.4299756690997567, + "grad_norm": 1.3612060930020984, + "learning_rate": 6.360198019385609e-06, + "loss": 0.3666, + "step": 4418 + }, + { + "epoch": 0.4300729927007299, + "grad_norm": 1.043670304048288, + "learning_rate": 6.358681239604935e-06, + "loss": 0.2156, + "step": 4419 + }, + { + "epoch": 0.43017031630170316, + "grad_norm": 1.21375846836267, + "learning_rate": 6.357164324811269e-06, + "loss": 0.3139, + "step": 4420 + }, + { + "epoch": 0.4302676399026764, + "grad_norm": 1.300403987028639, + "learning_rate": 6.355647275155351e-06, + "loss": 0.354, + "step": 4421 + }, + { + "epoch": 0.4303649635036496, + "grad_norm": 1.5100041407092586, + "learning_rate": 6.354130090787929e-06, + "loss": 0.4109, + "step": 4422 + }, + { + "epoch": 0.43046228710462286, + "grad_norm": 1.3727074497945866, + "learning_rate": 6.352612771859769e-06, + "loss": 0.4171, + "step": 4423 + }, + { + "epoch": 0.4305596107055961, + "grad_norm": 1.3703693530338585, + "learning_rate": 6.351095318521646e-06, + "loss": 0.4578, + "step": 4424 + }, + { + "epoch": 0.4306569343065693, + "grad_norm": 1.7249543999435943, + "learning_rate": 6.34957773092435e-06, + "loss": 0.4751, + "step": 4425 + }, + { + "epoch": 0.43075425790754257, + "grad_norm": 1.540658191377533, + "learning_rate": 6.3480600092186865e-06, + "loss": 0.531, + "step": 4426 + }, + { + "epoch": 0.4308515815085158, + "grad_norm": 1.4931397919463056, + "learning_rate": 6.346542153555471e-06, + "loss": 0.2869, + "step": 4427 + }, + { + "epoch": 0.4309489051094891, + "grad_norm": 1.2386309493951129, + "learning_rate": 6.345024164085533e-06, + "loss": 0.3842, + "step": 4428 + }, + { + "epoch": 0.4310462287104623, + "grad_norm": 1.8030870615440533, + "learning_rate": 6.343506040959717e-06, + "loss": 0.4934, + "step": 4429 + }, + { + "epoch": 0.43114355231143553, + "grad_norm": 1.297247362108796, + "learning_rate": 6.341987784328881e-06, + "loss": 0.3116, + "step": 4430 + }, + { + "epoch": 0.4312408759124088, + "grad_norm": 1.7662444251877587, + "learning_rate": 6.340469394343895e-06, + "loss": 0.4347, + "step": 4431 + }, + { + "epoch": 0.431338199513382, + "grad_norm": 1.377574504492093, + "learning_rate": 6.338950871155641e-06, + "loss": 0.3021, + "step": 4432 + }, + { + "epoch": 0.43143552311435523, + "grad_norm": 1.2448102026587469, + "learning_rate": 6.337432214915014e-06, + "loss": 0.3662, + "step": 4433 + }, + { + "epoch": 0.4315328467153285, + "grad_norm": 1.609409665225116, + "learning_rate": 6.335913425772926e-06, + "loss": 0.5899, + "step": 4434 + }, + { + "epoch": 0.4316301703163017, + "grad_norm": 1.5454437107583565, + "learning_rate": 6.334394503880301e-06, + "loss": 0.4264, + "step": 4435 + }, + { + "epoch": 0.43172749391727494, + "grad_norm": 1.454423385884074, + "learning_rate": 6.332875449388074e-06, + "loss": 0.3231, + "step": 4436 + }, + { + "epoch": 0.4318248175182482, + "grad_norm": 1.4621439149788715, + "learning_rate": 6.3313562624471944e-06, + "loss": 0.4866, + "step": 4437 + }, + { + "epoch": 0.4319221411192214, + "grad_norm": 1.4447299092393133, + "learning_rate": 6.329836943208624e-06, + "loss": 0.3377, + "step": 4438 + }, + { + "epoch": 0.43201946472019465, + "grad_norm": 1.5423068787846852, + "learning_rate": 6.328317491823338e-06, + "loss": 0.4526, + "step": 4439 + }, + { + "epoch": 0.4321167883211679, + "grad_norm": 1.4786689709013954, + "learning_rate": 6.326797908442328e-06, + "loss": 0.6604, + "step": 4440 + }, + { + "epoch": 0.4322141119221411, + "grad_norm": 1.3727693236517078, + "learning_rate": 6.325278193216595e-06, + "loss": 0.3873, + "step": 4441 + }, + { + "epoch": 0.43231143552311435, + "grad_norm": 1.3990806933206996, + "learning_rate": 6.323758346297153e-06, + "loss": 0.4849, + "step": 4442 + }, + { + "epoch": 0.4324087591240876, + "grad_norm": 1.768379628255284, + "learning_rate": 6.32223836783503e-06, + "loss": 0.4031, + "step": 4443 + }, + { + "epoch": 0.4325060827250608, + "grad_norm": 1.5660766863699724, + "learning_rate": 6.3207182579812664e-06, + "loss": 0.6173, + "step": 4444 + }, + { + "epoch": 0.43260340632603406, + "grad_norm": 1.7078680785018712, + "learning_rate": 6.319198016886918e-06, + "loss": 0.4452, + "step": 4445 + }, + { + "epoch": 0.4327007299270073, + "grad_norm": 1.4433642478932882, + "learning_rate": 6.317677644703054e-06, + "loss": 0.5051, + "step": 4446 + }, + { + "epoch": 0.4327980535279805, + "grad_norm": 1.255438703024322, + "learning_rate": 6.316157141580751e-06, + "loss": 0.2952, + "step": 4447 + }, + { + "epoch": 0.43289537712895376, + "grad_norm": 1.499625079665707, + "learning_rate": 6.314636507671105e-06, + "loss": 0.4835, + "step": 4448 + }, + { + "epoch": 0.432992700729927, + "grad_norm": 1.339457286562369, + "learning_rate": 6.313115743125219e-06, + "loss": 0.3763, + "step": 4449 + }, + { + "epoch": 0.43309002433090027, + "grad_norm": 1.5988790221120088, + "learning_rate": 6.311594848094216e-06, + "loss": 0.2813, + "step": 4450 + }, + { + "epoch": 0.43318734793187347, + "grad_norm": 1.431445598979624, + "learning_rate": 6.310073822729228e-06, + "loss": 0.5356, + "step": 4451 + }, + { + "epoch": 0.4332846715328467, + "grad_norm": 1.687399967195683, + "learning_rate": 6.308552667181397e-06, + "loss": 0.7034, + "step": 4452 + }, + { + "epoch": 0.43338199513382, + "grad_norm": 2.2202476497363235, + "learning_rate": 6.307031381601885e-06, + "loss": 0.3946, + "step": 4453 + }, + { + "epoch": 0.4334793187347932, + "grad_norm": 1.527204424266891, + "learning_rate": 6.3055099661418585e-06, + "loss": 0.3661, + "step": 4454 + }, + { + "epoch": 0.4335766423357664, + "grad_norm": 1.7819298233274148, + "learning_rate": 6.303988420952506e-06, + "loss": 0.612, + "step": 4455 + }, + { + "epoch": 0.4336739659367397, + "grad_norm": 1.6866898181266232, + "learning_rate": 6.302466746185022e-06, + "loss": 0.5496, + "step": 4456 + }, + { + "epoch": 0.4337712895377129, + "grad_norm": 1.5237861429986579, + "learning_rate": 6.300944941990617e-06, + "loss": 0.3251, + "step": 4457 + }, + { + "epoch": 0.43386861313868613, + "grad_norm": 1.359651910048231, + "learning_rate": 6.299423008520514e-06, + "loss": 0.3467, + "step": 4458 + }, + { + "epoch": 0.4339659367396594, + "grad_norm": 1.6804030202924267, + "learning_rate": 6.2979009459259474e-06, + "loss": 0.6095, + "step": 4459 + }, + { + "epoch": 0.4340632603406326, + "grad_norm": 1.584995309359472, + "learning_rate": 6.296378754358166e-06, + "loss": 0.4095, + "step": 4460 + }, + { + "epoch": 0.43416058394160584, + "grad_norm": 1.477199033753084, + "learning_rate": 6.294856433968432e-06, + "loss": 0.3005, + "step": 4461 + }, + { + "epoch": 0.4342579075425791, + "grad_norm": 1.0669301740685238, + "learning_rate": 6.293333984908018e-06, + "loss": 0.1931, + "step": 4462 + }, + { + "epoch": 0.4343552311435523, + "grad_norm": 1.7266158013725337, + "learning_rate": 6.29181140732821e-06, + "loss": 0.5056, + "step": 4463 + }, + { + "epoch": 0.43445255474452554, + "grad_norm": 1.4202215620919534, + "learning_rate": 6.2902887013803095e-06, + "loss": 0.365, + "step": 4464 + }, + { + "epoch": 0.4345498783454988, + "grad_norm": 1.4021022663094602, + "learning_rate": 6.2887658672156256e-06, + "loss": 0.3146, + "step": 4465 + }, + { + "epoch": 0.434647201946472, + "grad_norm": 1.353687314020154, + "learning_rate": 6.287242904985488e-06, + "loss": 0.4899, + "step": 4466 + }, + { + "epoch": 0.43474452554744525, + "grad_norm": 1.313498525896759, + "learning_rate": 6.28571981484123e-06, + "loss": 0.3394, + "step": 4467 + }, + { + "epoch": 0.4348418491484185, + "grad_norm": 1.6439873527634399, + "learning_rate": 6.284196596934205e-06, + "loss": 0.5208, + "step": 4468 + }, + { + "epoch": 0.4349391727493917, + "grad_norm": 1.6003841624409958, + "learning_rate": 6.282673251415774e-06, + "loss": 0.6754, + "step": 4469 + }, + { + "epoch": 0.43503649635036495, + "grad_norm": 1.853737692197248, + "learning_rate": 6.281149778437314e-06, + "loss": 0.907, + "step": 4470 + }, + { + "epoch": 0.4351338199513382, + "grad_norm": 1.4836354873033306, + "learning_rate": 6.2796261781502135e-06, + "loss": 0.4864, + "step": 4471 + }, + { + "epoch": 0.43523114355231146, + "grad_norm": 1.4138693493182206, + "learning_rate": 6.278102450705872e-06, + "loss": 0.3788, + "step": 4472 + }, + { + "epoch": 0.43532846715328466, + "grad_norm": 1.5811350603509635, + "learning_rate": 6.276578596255705e-06, + "loss": 0.6215, + "step": 4473 + }, + { + "epoch": 0.4354257907542579, + "grad_norm": 1.1902771148669904, + "learning_rate": 6.2750546149511386e-06, + "loss": 0.1741, + "step": 4474 + }, + { + "epoch": 0.43552311435523117, + "grad_norm": 1.3679298347414897, + "learning_rate": 6.2735305069436104e-06, + "loss": 0.4947, + "step": 4475 + }, + { + "epoch": 0.43562043795620436, + "grad_norm": 1.1609853742337026, + "learning_rate": 6.2720062723845734e-06, + "loss": 0.4, + "step": 4476 + }, + { + "epoch": 0.4357177615571776, + "grad_norm": 1.2996616987055336, + "learning_rate": 6.270481911425491e-06, + "loss": 0.485, + "step": 4477 + }, + { + "epoch": 0.43581508515815087, + "grad_norm": 1.5048416201370751, + "learning_rate": 6.268957424217841e-06, + "loss": 0.5718, + "step": 4478 + }, + { + "epoch": 0.43591240875912407, + "grad_norm": 1.3161535728034814, + "learning_rate": 6.267432810913112e-06, + "loss": 0.363, + "step": 4479 + }, + { + "epoch": 0.4360097323600973, + "grad_norm": 1.4311736673714228, + "learning_rate": 6.265908071662804e-06, + "loss": 0.5634, + "step": 4480 + }, + { + "epoch": 0.4361070559610706, + "grad_norm": 0.9733294260151267, + "learning_rate": 6.264383206618434e-06, + "loss": 0.2752, + "step": 4481 + }, + { + "epoch": 0.4362043795620438, + "grad_norm": 1.1377354954511871, + "learning_rate": 6.262858215931527e-06, + "loss": 0.2207, + "step": 4482 + }, + { + "epoch": 0.43630170316301703, + "grad_norm": 1.5576967799021977, + "learning_rate": 6.261333099753623e-06, + "loss": 0.6161, + "step": 4483 + }, + { + "epoch": 0.4363990267639903, + "grad_norm": 1.5038150115637616, + "learning_rate": 6.259807858236276e-06, + "loss": 0.4355, + "step": 4484 + }, + { + "epoch": 0.4364963503649635, + "grad_norm": 1.3037916751392746, + "learning_rate": 6.258282491531044e-06, + "loss": 0.4034, + "step": 4485 + }, + { + "epoch": 0.43659367396593673, + "grad_norm": 1.421667146686385, + "learning_rate": 6.256756999789509e-06, + "loss": 0.3804, + "step": 4486 + }, + { + "epoch": 0.43669099756691, + "grad_norm": 1.3553832629067233, + "learning_rate": 6.255231383163257e-06, + "loss": 0.3941, + "step": 4487 + }, + { + "epoch": 0.4367883211678832, + "grad_norm": 1.4993022249408712, + "learning_rate": 6.253705641803893e-06, + "loss": 0.4304, + "step": 4488 + }, + { + "epoch": 0.43688564476885644, + "grad_norm": 1.3146885381920743, + "learning_rate": 6.25217977586303e-06, + "loss": 0.3686, + "step": 4489 + }, + { + "epoch": 0.4369829683698297, + "grad_norm": 1.404764110949849, + "learning_rate": 6.25065378549229e-06, + "loss": 0.3197, + "step": 4490 + }, + { + "epoch": 0.4370802919708029, + "grad_norm": 1.4483393136893457, + "learning_rate": 6.249127670843316e-06, + "loss": 0.4756, + "step": 4491 + }, + { + "epoch": 0.43717761557177615, + "grad_norm": 1.2899076523365525, + "learning_rate": 6.247601432067757e-06, + "loss": 0.3656, + "step": 4492 + }, + { + "epoch": 0.4372749391727494, + "grad_norm": 1.135799127727455, + "learning_rate": 6.246075069317278e-06, + "loss": 0.3566, + "step": 4493 + }, + { + "epoch": 0.43737226277372265, + "grad_norm": 1.1564573153655837, + "learning_rate": 6.244548582743553e-06, + "loss": 0.2099, + "step": 4494 + }, + { + "epoch": 0.43746958637469585, + "grad_norm": 1.4408336628960754, + "learning_rate": 6.2430219724982695e-06, + "loss": 0.4232, + "step": 4495 + }, + { + "epoch": 0.4375669099756691, + "grad_norm": 1.203964188436034, + "learning_rate": 6.241495238733128e-06, + "loss": 0.2844, + "step": 4496 + }, + { + "epoch": 0.43766423357664236, + "grad_norm": 1.3859509558814715, + "learning_rate": 6.239968381599843e-06, + "loss": 0.5172, + "step": 4497 + }, + { + "epoch": 0.43776155717761556, + "grad_norm": 1.3335422213163426, + "learning_rate": 6.238441401250138e-06, + "loss": 0.4624, + "step": 4498 + }, + { + "epoch": 0.4378588807785888, + "grad_norm": 1.107326119940465, + "learning_rate": 6.236914297835749e-06, + "loss": 0.2921, + "step": 4499 + }, + { + "epoch": 0.43795620437956206, + "grad_norm": 1.4409153213399095, + "learning_rate": 6.235387071508427e-06, + "loss": 0.4068, + "step": 4500 + }, + { + "epoch": 0.43805352798053526, + "grad_norm": 1.505851854436676, + "learning_rate": 6.233859722419932e-06, + "loss": 0.5088, + "step": 4501 + }, + { + "epoch": 0.4381508515815085, + "grad_norm": 1.5283188304954876, + "learning_rate": 6.232332250722037e-06, + "loss": 0.3688, + "step": 4502 + }, + { + "epoch": 0.43824817518248177, + "grad_norm": 1.3093036149624997, + "learning_rate": 6.230804656566528e-06, + "loss": 0.4499, + "step": 4503 + }, + { + "epoch": 0.43834549878345497, + "grad_norm": 1.504088868782464, + "learning_rate": 6.229276940105207e-06, + "loss": 0.437, + "step": 4504 + }, + { + "epoch": 0.4384428223844282, + "grad_norm": 1.2562191745904712, + "learning_rate": 6.227749101489878e-06, + "loss": 0.4475, + "step": 4505 + }, + { + "epoch": 0.4385401459854015, + "grad_norm": 1.6309788825739657, + "learning_rate": 6.226221140872368e-06, + "loss": 0.3089, + "step": 4506 + }, + { + "epoch": 0.4386374695863747, + "grad_norm": 2.3528430785037466, + "learning_rate": 6.224693058404508e-06, + "loss": 0.4253, + "step": 4507 + }, + { + "epoch": 0.4387347931873479, + "grad_norm": 1.286677193911547, + "learning_rate": 6.2231648542381465e-06, + "loss": 0.3963, + "step": 4508 + }, + { + "epoch": 0.4388321167883212, + "grad_norm": 1.5139355239140297, + "learning_rate": 6.221636528525142e-06, + "loss": 0.4707, + "step": 4509 + }, + { + "epoch": 0.4389294403892944, + "grad_norm": 1.3239916041185795, + "learning_rate": 6.220108081417364e-06, + "loss": 0.3053, + "step": 4510 + }, + { + "epoch": 0.43902676399026763, + "grad_norm": 1.416636532107715, + "learning_rate": 6.2185795130666985e-06, + "loss": 0.5528, + "step": 4511 + }, + { + "epoch": 0.4391240875912409, + "grad_norm": 1.1732661826974413, + "learning_rate": 6.217050823625035e-06, + "loss": 0.3784, + "step": 4512 + }, + { + "epoch": 0.43922141119221414, + "grad_norm": 1.5433129035443396, + "learning_rate": 6.215522013244284e-06, + "loss": 0.3514, + "step": 4513 + }, + { + "epoch": 0.43931873479318734, + "grad_norm": 1.4579446092871258, + "learning_rate": 6.213993082076363e-06, + "loss": 0.5273, + "step": 4514 + }, + { + "epoch": 0.4394160583941606, + "grad_norm": 1.7944461495096447, + "learning_rate": 6.212464030273204e-06, + "loss": 0.2067, + "step": 4515 + }, + { + "epoch": 0.43951338199513384, + "grad_norm": 1.3661123353334617, + "learning_rate": 6.210934857986749e-06, + "loss": 0.3677, + "step": 4516 + }, + { + "epoch": 0.43961070559610704, + "grad_norm": 1.3915341009552726, + "learning_rate": 6.209405565368952e-06, + "loss": 0.3781, + "step": 4517 + }, + { + "epoch": 0.4397080291970803, + "grad_norm": 1.4670010156712465, + "learning_rate": 6.207876152571781e-06, + "loss": 0.5255, + "step": 4518 + }, + { + "epoch": 0.43980535279805355, + "grad_norm": 1.311301728892854, + "learning_rate": 6.206346619747214e-06, + "loss": 0.2378, + "step": 4519 + }, + { + "epoch": 0.43990267639902675, + "grad_norm": 4.29827103564608, + "learning_rate": 6.204816967047244e-06, + "loss": 0.4591, + "step": 4520 + }, + { + "epoch": 0.44, + "grad_norm": 1.3808741804783327, + "learning_rate": 6.20328719462387e-06, + "loss": 0.4271, + "step": 4521 + }, + { + "epoch": 0.44009732360097326, + "grad_norm": 1.5466603602220501, + "learning_rate": 6.2017573026291074e-06, + "loss": 0.3395, + "step": 4522 + }, + { + "epoch": 0.44019464720194645, + "grad_norm": 1.5704367669123898, + "learning_rate": 6.2002272912149816e-06, + "loss": 0.4498, + "step": 4523 + }, + { + "epoch": 0.4402919708029197, + "grad_norm": 1.345795826148155, + "learning_rate": 6.198697160533535e-06, + "loss": 0.4448, + "step": 4524 + }, + { + "epoch": 0.44038929440389296, + "grad_norm": 1.766947090937083, + "learning_rate": 6.197166910736815e-06, + "loss": 0.4943, + "step": 4525 + }, + { + "epoch": 0.44048661800486616, + "grad_norm": 1.2348236041408933, + "learning_rate": 6.195636541976881e-06, + "loss": 0.3829, + "step": 4526 + }, + { + "epoch": 0.4405839416058394, + "grad_norm": 1.216056390232819, + "learning_rate": 6.194106054405811e-06, + "loss": 0.3383, + "step": 4527 + }, + { + "epoch": 0.44068126520681267, + "grad_norm": 1.0767140762079304, + "learning_rate": 6.192575448175685e-06, + "loss": 0.3027, + "step": 4528 + }, + { + "epoch": 0.44077858880778586, + "grad_norm": 2.2278844288860453, + "learning_rate": 6.1910447234386074e-06, + "loss": 0.4248, + "step": 4529 + }, + { + "epoch": 0.4408759124087591, + "grad_norm": 1.5645319124687282, + "learning_rate": 6.189513880346681e-06, + "loss": 0.4836, + "step": 4530 + }, + { + "epoch": 0.44097323600973237, + "grad_norm": 1.4546530288234156, + "learning_rate": 6.187982919052031e-06, + "loss": 0.4532, + "step": 4531 + }, + { + "epoch": 0.44107055961070557, + "grad_norm": 1.3533832876181235, + "learning_rate": 6.1864518397067875e-06, + "loss": 0.4175, + "step": 4532 + }, + { + "epoch": 0.4411678832116788, + "grad_norm": 1.3880156134625945, + "learning_rate": 6.184920642463095e-06, + "loss": 0.3536, + "step": 4533 + }, + { + "epoch": 0.4412652068126521, + "grad_norm": 1.6523868203467935, + "learning_rate": 6.18338932747311e-06, + "loss": 0.3165, + "step": 4534 + }, + { + "epoch": 0.44136253041362533, + "grad_norm": 1.0224564487537273, + "learning_rate": 6.181857894889001e-06, + "loss": 0.231, + "step": 4535 + }, + { + "epoch": 0.44145985401459853, + "grad_norm": 1.4535342224784864, + "learning_rate": 6.180326344862947e-06, + "loss": 0.4897, + "step": 4536 + }, + { + "epoch": 0.4415571776155718, + "grad_norm": 1.178209958440824, + "learning_rate": 6.178794677547138e-06, + "loss": 0.249, + "step": 4537 + }, + { + "epoch": 0.44165450121654504, + "grad_norm": 1.2598408521325328, + "learning_rate": 6.177262893093776e-06, + "loss": 0.226, + "step": 4538 + }, + { + "epoch": 0.44175182481751823, + "grad_norm": 1.214630010023126, + "learning_rate": 6.175730991655077e-06, + "loss": 0.3077, + "step": 4539 + }, + { + "epoch": 0.4418491484184915, + "grad_norm": 1.1811098530299018, + "learning_rate": 6.174198973383268e-06, + "loss": 0.2962, + "step": 4540 + }, + { + "epoch": 0.44194647201946474, + "grad_norm": 1.37805845657898, + "learning_rate": 6.1726668384305845e-06, + "loss": 0.5316, + "step": 4541 + }, + { + "epoch": 0.44204379562043794, + "grad_norm": 1.2328033098813826, + "learning_rate": 6.171134586949277e-06, + "loss": 0.3806, + "step": 4542 + }, + { + "epoch": 0.4421411192214112, + "grad_norm": 1.4675287079480706, + "learning_rate": 6.169602219091605e-06, + "loss": 0.5604, + "step": 4543 + }, + { + "epoch": 0.44223844282238445, + "grad_norm": 1.2852516486753356, + "learning_rate": 6.168069735009842e-06, + "loss": 0.3012, + "step": 4544 + }, + { + "epoch": 0.44233576642335765, + "grad_norm": 1.348395899266854, + "learning_rate": 6.166537134856272e-06, + "loss": 0.4062, + "step": 4545 + }, + { + "epoch": 0.4424330900243309, + "grad_norm": 1.3878599816806487, + "learning_rate": 6.1650044187831895e-06, + "loss": 0.3538, + "step": 4546 + }, + { + "epoch": 0.44253041362530415, + "grad_norm": 1.4923058907243107, + "learning_rate": 6.163471586942901e-06, + "loss": 0.4973, + "step": 4547 + }, + { + "epoch": 0.44262773722627735, + "grad_norm": 1.4316166437282047, + "learning_rate": 6.161938639487728e-06, + "loss": 0.3876, + "step": 4548 + }, + { + "epoch": 0.4427250608272506, + "grad_norm": 1.5722407558203977, + "learning_rate": 6.160405576569996e-06, + "loss": 0.5089, + "step": 4549 + }, + { + "epoch": 0.44282238442822386, + "grad_norm": 1.434215442035674, + "learning_rate": 6.1588723983420485e-06, + "loss": 0.4982, + "step": 4550 + }, + { + "epoch": 0.44291970802919706, + "grad_norm": 1.4729024747014614, + "learning_rate": 6.15733910495624e-06, + "loss": 0.549, + "step": 4551 + }, + { + "epoch": 0.4430170316301703, + "grad_norm": 1.1632891140397226, + "learning_rate": 6.155805696564934e-06, + "loss": 0.2645, + "step": 4552 + }, + { + "epoch": 0.44311435523114356, + "grad_norm": 1.623128987609642, + "learning_rate": 6.154272173320503e-06, + "loss": 0.4606, + "step": 4553 + }, + { + "epoch": 0.44321167883211676, + "grad_norm": 1.8114455872128037, + "learning_rate": 6.152738535375337e-06, + "loss": 0.524, + "step": 4554 + }, + { + "epoch": 0.44330900243309, + "grad_norm": 1.5846271833353753, + "learning_rate": 6.151204782881835e-06, + "loss": 0.3161, + "step": 4555 + }, + { + "epoch": 0.44340632603406327, + "grad_norm": 1.4826265980282791, + "learning_rate": 6.149670915992407e-06, + "loss": 0.5112, + "step": 4556 + }, + { + "epoch": 0.4435036496350365, + "grad_norm": 1.2833430244443191, + "learning_rate": 6.1481369348594725e-06, + "loss": 0.2817, + "step": 4557 + }, + { + "epoch": 0.4436009732360097, + "grad_norm": 1.757349114269509, + "learning_rate": 6.146602839635466e-06, + "loss": 0.4294, + "step": 4558 + }, + { + "epoch": 0.443698296836983, + "grad_norm": 1.7036663909794458, + "learning_rate": 6.145068630472829e-06, + "loss": 0.3035, + "step": 4559 + }, + { + "epoch": 0.44379562043795623, + "grad_norm": 2.71526437698097, + "learning_rate": 6.143534307524019e-06, + "loss": 0.236, + "step": 4560 + }, + { + "epoch": 0.4438929440389294, + "grad_norm": 1.6314644975788934, + "learning_rate": 6.141999870941503e-06, + "loss": 0.4021, + "step": 4561 + }, + { + "epoch": 0.4439902676399027, + "grad_norm": 3.25660621027306, + "learning_rate": 6.140465320877757e-06, + "loss": 0.5017, + "step": 4562 + }, + { + "epoch": 0.44408759124087593, + "grad_norm": 2.3230459636290672, + "learning_rate": 6.1389306574852715e-06, + "loss": 0.336, + "step": 4563 + }, + { + "epoch": 0.44418491484184913, + "grad_norm": 1.203787782101882, + "learning_rate": 6.137395880916546e-06, + "loss": 0.3911, + "step": 4564 + }, + { + "epoch": 0.4442822384428224, + "grad_norm": 2.2630418455306573, + "learning_rate": 6.135860991324092e-06, + "loss": 0.4102, + "step": 4565 + }, + { + "epoch": 0.44437956204379564, + "grad_norm": 1.4186240119281917, + "learning_rate": 6.1343259888604335e-06, + "loss": 0.2737, + "step": 4566 + }, + { + "epoch": 0.44447688564476884, + "grad_norm": 1.5655785559429751, + "learning_rate": 6.132790873678105e-06, + "loss": 0.5299, + "step": 4567 + }, + { + "epoch": 0.4445742092457421, + "grad_norm": 1.430534737115544, + "learning_rate": 6.13125564592965e-06, + "loss": 0.2333, + "step": 4568 + }, + { + "epoch": 0.44467153284671534, + "grad_norm": 1.608300378030264, + "learning_rate": 6.129720305767628e-06, + "loss": 0.4874, + "step": 4569 + }, + { + "epoch": 0.44476885644768854, + "grad_norm": 1.5687565472834768, + "learning_rate": 6.128184853344604e-06, + "loss": 0.3432, + "step": 4570 + }, + { + "epoch": 0.4448661800486618, + "grad_norm": 1.5216378561818185, + "learning_rate": 6.126649288813157e-06, + "loss": 0.3871, + "step": 4571 + }, + { + "epoch": 0.44496350364963505, + "grad_norm": 1.1804091917573971, + "learning_rate": 6.125113612325879e-06, + "loss": 0.235, + "step": 4572 + }, + { + "epoch": 0.44506082725060825, + "grad_norm": 1.3663869203189398, + "learning_rate": 6.123577824035368e-06, + "loss": 0.4032, + "step": 4573 + }, + { + "epoch": 0.4451581508515815, + "grad_norm": 1.7585831372802134, + "learning_rate": 6.12204192409424e-06, + "loss": 0.6367, + "step": 4574 + }, + { + "epoch": 0.44525547445255476, + "grad_norm": 1.4053312903418944, + "learning_rate": 6.120505912655115e-06, + "loss": 0.3227, + "step": 4575 + }, + { + "epoch": 0.44535279805352795, + "grad_norm": 1.2522162816981348, + "learning_rate": 6.118969789870629e-06, + "loss": 0.2994, + "step": 4576 + }, + { + "epoch": 0.4454501216545012, + "grad_norm": 1.282504281897582, + "learning_rate": 6.117433555893426e-06, + "loss": 0.2873, + "step": 4577 + }, + { + "epoch": 0.44554744525547446, + "grad_norm": 1.4863199441812478, + "learning_rate": 6.115897210876166e-06, + "loss": 0.3527, + "step": 4578 + }, + { + "epoch": 0.4456447688564477, + "grad_norm": 1.20324746041235, + "learning_rate": 6.114360754971515e-06, + "loss": 0.3046, + "step": 4579 + }, + { + "epoch": 0.4457420924574209, + "grad_norm": 1.4778238388920404, + "learning_rate": 6.112824188332148e-06, + "loss": 0.5038, + "step": 4580 + }, + { + "epoch": 0.44583941605839417, + "grad_norm": 1.3118376195778845, + "learning_rate": 6.111287511110758e-06, + "loss": 0.423, + "step": 4581 + }, + { + "epoch": 0.4459367396593674, + "grad_norm": 1.281042407826542, + "learning_rate": 6.109750723460045e-06, + "loss": 0.3795, + "step": 4582 + }, + { + "epoch": 0.4460340632603406, + "grad_norm": 1.180836806111601, + "learning_rate": 6.108213825532722e-06, + "loss": 0.2544, + "step": 4583 + }, + { + "epoch": 0.44613138686131387, + "grad_norm": 1.2594621159639785, + "learning_rate": 6.106676817481508e-06, + "loss": 0.2813, + "step": 4584 + }, + { + "epoch": 0.4462287104622871, + "grad_norm": 1.4518226612130753, + "learning_rate": 6.1051396994591405e-06, + "loss": 0.4844, + "step": 4585 + }, + { + "epoch": 0.4463260340632603, + "grad_norm": 1.4653587961723828, + "learning_rate": 6.103602471618361e-06, + "loss": 0.514, + "step": 4586 + }, + { + "epoch": 0.4464233576642336, + "grad_norm": 1.5072282835369855, + "learning_rate": 6.102065134111924e-06, + "loss": 0.5215, + "step": 4587 + }, + { + "epoch": 0.44652068126520683, + "grad_norm": 1.4484793120998232, + "learning_rate": 6.100527687092599e-06, + "loss": 0.4045, + "step": 4588 + }, + { + "epoch": 0.44661800486618003, + "grad_norm": 1.763093466978771, + "learning_rate": 6.09899013071316e-06, + "loss": 0.4045, + "step": 4589 + }, + { + "epoch": 0.4467153284671533, + "grad_norm": 1.4119795743556356, + "learning_rate": 6.097452465126399e-06, + "loss": 0.4171, + "step": 4590 + }, + { + "epoch": 0.44681265206812654, + "grad_norm": 1.5294922062057048, + "learning_rate": 6.095914690485109e-06, + "loss": 0.5523, + "step": 4591 + }, + { + "epoch": 0.44690997566909973, + "grad_norm": 1.481430345332682, + "learning_rate": 6.0943768069421035e-06, + "loss": 0.502, + "step": 4592 + }, + { + "epoch": 0.447007299270073, + "grad_norm": 1.6136069350056235, + "learning_rate": 6.092838814650202e-06, + "loss": 0.5971, + "step": 4593 + }, + { + "epoch": 0.44710462287104624, + "grad_norm": 1.3313623801781076, + "learning_rate": 6.091300713762236e-06, + "loss": 0.4233, + "step": 4594 + }, + { + "epoch": 0.44720194647201944, + "grad_norm": 1.423435654204561, + "learning_rate": 6.0897625044310475e-06, + "loss": 0.4662, + "step": 4595 + }, + { + "epoch": 0.4472992700729927, + "grad_norm": 1.3331887146880435, + "learning_rate": 6.08822418680949e-06, + "loss": 0.3056, + "step": 4596 + }, + { + "epoch": 0.44739659367396595, + "grad_norm": 1.2900254785254246, + "learning_rate": 6.086685761050423e-06, + "loss": 0.252, + "step": 4597 + }, + { + "epoch": 0.44749391727493915, + "grad_norm": 1.3447249668086938, + "learning_rate": 6.085147227306727e-06, + "loss": 0.3061, + "step": 4598 + }, + { + "epoch": 0.4475912408759124, + "grad_norm": 1.12221277194141, + "learning_rate": 6.083608585731283e-06, + "loss": 0.3122, + "step": 4599 + }, + { + "epoch": 0.44768856447688565, + "grad_norm": 1.4658809804338166, + "learning_rate": 6.082069836476988e-06, + "loss": 0.4721, + "step": 4600 + }, + { + "epoch": 0.4477858880778589, + "grad_norm": 1.2247027737228893, + "learning_rate": 6.0805309796967484e-06, + "loss": 0.2408, + "step": 4601 + }, + { + "epoch": 0.4478832116788321, + "grad_norm": 1.2025495862353188, + "learning_rate": 6.07899201554348e-06, + "loss": 0.4184, + "step": 4602 + }, + { + "epoch": 0.44798053527980536, + "grad_norm": 1.3242619600251044, + "learning_rate": 6.077452944170113e-06, + "loss": 0.3371, + "step": 4603 + }, + { + "epoch": 0.4480778588807786, + "grad_norm": 1.106734989558217, + "learning_rate": 6.075913765729584e-06, + "loss": 0.2301, + "step": 4604 + }, + { + "epoch": 0.4481751824817518, + "grad_norm": 1.2729210303565284, + "learning_rate": 6.074374480374844e-06, + "loss": 0.2714, + "step": 4605 + }, + { + "epoch": 0.44827250608272506, + "grad_norm": 15.212009323594724, + "learning_rate": 6.072835088258851e-06, + "loss": 0.38, + "step": 4606 + }, + { + "epoch": 0.4483698296836983, + "grad_norm": 1.4758284424424868, + "learning_rate": 6.071295589534576e-06, + "loss": 0.5838, + "step": 4607 + }, + { + "epoch": 0.4484671532846715, + "grad_norm": 1.3641656231434411, + "learning_rate": 6.0697559843549994e-06, + "loss": 0.3822, + "step": 4608 + }, + { + "epoch": 0.44856447688564477, + "grad_norm": 1.12662425936695, + "learning_rate": 6.068216272873112e-06, + "loss": 0.2544, + "step": 4609 + }, + { + "epoch": 0.448661800486618, + "grad_norm": 1.3773493636304128, + "learning_rate": 6.066676455241919e-06, + "loss": 0.2609, + "step": 4610 + }, + { + "epoch": 0.4487591240875912, + "grad_norm": 1.3603847743332291, + "learning_rate": 6.0651365316144295e-06, + "loss": 0.3462, + "step": 4611 + }, + { + "epoch": 0.4488564476885645, + "grad_norm": 1.9756668534351456, + "learning_rate": 6.0635965021436696e-06, + "loss": 0.3505, + "step": 4612 + }, + { + "epoch": 0.44895377128953773, + "grad_norm": 1.7428842015365853, + "learning_rate": 6.0620563669826695e-06, + "loss": 0.5871, + "step": 4613 + }, + { + "epoch": 0.4490510948905109, + "grad_norm": 1.427382899490253, + "learning_rate": 6.060516126284477e-06, + "loss": 0.4058, + "step": 4614 + }, + { + "epoch": 0.4491484184914842, + "grad_norm": 1.6387860846020157, + "learning_rate": 6.058975780202144e-06, + "loss": 0.4983, + "step": 4615 + }, + { + "epoch": 0.44924574209245743, + "grad_norm": 1.3894509921139317, + "learning_rate": 6.057435328888739e-06, + "loss": 0.3205, + "step": 4616 + }, + { + "epoch": 0.44934306569343063, + "grad_norm": 1.582913828371373, + "learning_rate": 6.0558947724973345e-06, + "loss": 0.4199, + "step": 4617 + }, + { + "epoch": 0.4494403892944039, + "grad_norm": 1.555270291844662, + "learning_rate": 6.054354111181015e-06, + "loss": 0.4911, + "step": 4618 + }, + { + "epoch": 0.44953771289537714, + "grad_norm": 1.382143365205803, + "learning_rate": 6.0528133450928826e-06, + "loss": 0.2393, + "step": 4619 + }, + { + "epoch": 0.44963503649635034, + "grad_norm": 1.720605299698648, + "learning_rate": 6.051272474386039e-06, + "loss": 0.3921, + "step": 4620 + }, + { + "epoch": 0.4497323600973236, + "grad_norm": 1.6775663955669964, + "learning_rate": 6.0497314992136055e-06, + "loss": 0.5277, + "step": 4621 + }, + { + "epoch": 0.44982968369829684, + "grad_norm": 1.2197851040232244, + "learning_rate": 6.048190419728706e-06, + "loss": 0.3849, + "step": 4622 + }, + { + "epoch": 0.4499270072992701, + "grad_norm": 1.324451887375089, + "learning_rate": 6.046649236084481e-06, + "loss": 0.2846, + "step": 4623 + }, + { + "epoch": 0.4500243309002433, + "grad_norm": 1.3942500239593356, + "learning_rate": 6.045107948434077e-06, + "loss": 0.2472, + "step": 4624 + }, + { + "epoch": 0.45012165450121655, + "grad_norm": 1.2311046107859251, + "learning_rate": 6.043566556930656e-06, + "loss": 0.3205, + "step": 4625 + }, + { + "epoch": 0.4502189781021898, + "grad_norm": 1.251194954956108, + "learning_rate": 6.042025061727384e-06, + "loss": 0.3929, + "step": 4626 + }, + { + "epoch": 0.450316301703163, + "grad_norm": 1.2916062658810665, + "learning_rate": 6.040483462977439e-06, + "loss": 0.3156, + "step": 4627 + }, + { + "epoch": 0.45041362530413626, + "grad_norm": 1.392814969203336, + "learning_rate": 6.038941760834014e-06, + "loss": 0.3315, + "step": 4628 + }, + { + "epoch": 0.4505109489051095, + "grad_norm": 1.1827916111985008, + "learning_rate": 6.037399955450307e-06, + "loss": 0.3114, + "step": 4629 + }, + { + "epoch": 0.4506082725060827, + "grad_norm": 1.2372835875581027, + "learning_rate": 6.0358580469795315e-06, + "loss": 0.2393, + "step": 4630 + }, + { + "epoch": 0.45070559610705596, + "grad_norm": 1.988431374029195, + "learning_rate": 6.034316035574903e-06, + "loss": 0.4706, + "step": 4631 + }, + { + "epoch": 0.4508029197080292, + "grad_norm": 2.396437018811852, + "learning_rate": 6.032773921389655e-06, + "loss": 0.4336, + "step": 4632 + }, + { + "epoch": 0.4509002433090024, + "grad_norm": 1.4280452781145334, + "learning_rate": 6.031231704577027e-06, + "loss": 0.3378, + "step": 4633 + }, + { + "epoch": 0.45099756690997567, + "grad_norm": 1.498033506092563, + "learning_rate": 6.0296893852902705e-06, + "loss": 0.3096, + "step": 4634 + }, + { + "epoch": 0.4510948905109489, + "grad_norm": 1.2436054701671617, + "learning_rate": 6.0281469636826486e-06, + "loss": 0.3644, + "step": 4635 + }, + { + "epoch": 0.4511922141119221, + "grad_norm": 1.5192499758500848, + "learning_rate": 6.026604439907429e-06, + "loss": 0.3506, + "step": 4636 + }, + { + "epoch": 0.45128953771289537, + "grad_norm": 1.6928058850470935, + "learning_rate": 6.025061814117896e-06, + "loss": 0.3323, + "step": 4637 + }, + { + "epoch": 0.4513868613138686, + "grad_norm": 1.1049977299938791, + "learning_rate": 6.023519086467341e-06, + "loss": 0.2905, + "step": 4638 + }, + { + "epoch": 0.4514841849148418, + "grad_norm": 1.4257531568293946, + "learning_rate": 6.021976257109064e-06, + "loss": 0.308, + "step": 4639 + }, + { + "epoch": 0.4515815085158151, + "grad_norm": 1.5893181909270777, + "learning_rate": 6.020433326196379e-06, + "loss": 0.4309, + "step": 4640 + }, + { + "epoch": 0.45167883211678833, + "grad_norm": 1.1799946568254955, + "learning_rate": 6.018890293882607e-06, + "loss": 0.3656, + "step": 4641 + }, + { + "epoch": 0.45177615571776153, + "grad_norm": 1.5076992802780589, + "learning_rate": 6.01734716032108e-06, + "loss": 0.3004, + "step": 4642 + }, + { + "epoch": 0.4518734793187348, + "grad_norm": 1.4123734982988385, + "learning_rate": 6.015803925665141e-06, + "loss": 0.5163, + "step": 4643 + }, + { + "epoch": 0.45197080291970804, + "grad_norm": 1.718868320900687, + "learning_rate": 6.014260590068142e-06, + "loss": 0.484, + "step": 4644 + }, + { + "epoch": 0.4520681265206813, + "grad_norm": 1.6108627109575375, + "learning_rate": 6.012717153683443e-06, + "loss": 0.5274, + "step": 4645 + }, + { + "epoch": 0.4521654501216545, + "grad_norm": 1.1731609155667762, + "learning_rate": 6.0111736166644196e-06, + "loss": 0.3736, + "step": 4646 + }, + { + "epoch": 0.45226277372262774, + "grad_norm": 1.2623844811494913, + "learning_rate": 6.009629979164451e-06, + "loss": 0.2155, + "step": 4647 + }, + { + "epoch": 0.452360097323601, + "grad_norm": 1.42235749829862, + "learning_rate": 6.0080862413369324e-06, + "loss": 0.3683, + "step": 4648 + }, + { + "epoch": 0.4524574209245742, + "grad_norm": 1.3023936041870583, + "learning_rate": 6.006542403335263e-06, + "loss": 0.3659, + "step": 4649 + }, + { + "epoch": 0.45255474452554745, + "grad_norm": 1.52765919581587, + "learning_rate": 6.004998465312857e-06, + "loss": 0.5769, + "step": 4650 + }, + { + "epoch": 0.4526520681265207, + "grad_norm": 1.5719134771593801, + "learning_rate": 6.003454427423135e-06, + "loss": 0.5393, + "step": 4651 + }, + { + "epoch": 0.4527493917274939, + "grad_norm": 1.3672396404678124, + "learning_rate": 6.00191028981953e-06, + "loss": 0.4496, + "step": 4652 + }, + { + "epoch": 0.45284671532846715, + "grad_norm": 1.5536820117072887, + "learning_rate": 6.000366052655485e-06, + "loss": 0.4263, + "step": 4653 + }, + { + "epoch": 0.4529440389294404, + "grad_norm": 1.634041647530894, + "learning_rate": 5.99882171608445e-06, + "loss": 0.3762, + "step": 4654 + }, + { + "epoch": 0.4530413625304136, + "grad_norm": 1.388217985979722, + "learning_rate": 5.997277280259886e-06, + "loss": 0.4541, + "step": 4655 + }, + { + "epoch": 0.45313868613138686, + "grad_norm": 1.3776033638252942, + "learning_rate": 5.9957327453352655e-06, + "loss": 0.4501, + "step": 4656 + }, + { + "epoch": 0.4532360097323601, + "grad_norm": 1.195712951201049, + "learning_rate": 5.994188111464072e-06, + "loss": 0.219, + "step": 4657 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 1.511034512487341, + "learning_rate": 5.992643378799794e-06, + "loss": 0.3969, + "step": 4658 + }, + { + "epoch": 0.45343065693430656, + "grad_norm": 3.725563795191409, + "learning_rate": 5.991098547495933e-06, + "loss": 0.6012, + "step": 4659 + }, + { + "epoch": 0.4535279805352798, + "grad_norm": 1.5248535617497452, + "learning_rate": 5.989553617706e-06, + "loss": 0.4387, + "step": 4660 + }, + { + "epoch": 0.453625304136253, + "grad_norm": 1.5028785265567297, + "learning_rate": 5.988008589583516e-06, + "loss": 0.3756, + "step": 4661 + }, + { + "epoch": 0.45372262773722627, + "grad_norm": 1.2276140795525827, + "learning_rate": 5.9864634632820115e-06, + "loss": 0.4242, + "step": 4662 + }, + { + "epoch": 0.4538199513381995, + "grad_norm": 1.6639607716301263, + "learning_rate": 5.984918238955025e-06, + "loss": 0.3579, + "step": 4663 + }, + { + "epoch": 0.4539172749391728, + "grad_norm": 1.4925341777626235, + "learning_rate": 5.98337291675611e-06, + "loss": 0.5927, + "step": 4664 + }, + { + "epoch": 0.454014598540146, + "grad_norm": 1.4464693955138501, + "learning_rate": 5.9818274968388225e-06, + "loss": 0.4453, + "step": 4665 + }, + { + "epoch": 0.45411192214111923, + "grad_norm": 1.4591826645028443, + "learning_rate": 5.980281979356732e-06, + "loss": 0.479, + "step": 4666 + }, + { + "epoch": 0.4542092457420925, + "grad_norm": 1.2111037109500833, + "learning_rate": 5.97873636446342e-06, + "loss": 0.3399, + "step": 4667 + }, + { + "epoch": 0.4543065693430657, + "grad_norm": 1.531131116962134, + "learning_rate": 5.977190652312474e-06, + "loss": 0.4434, + "step": 4668 + }, + { + "epoch": 0.45440389294403893, + "grad_norm": 1.4656116964879211, + "learning_rate": 5.975644843057492e-06, + "loss": 0.2733, + "step": 4669 + }, + { + "epoch": 0.4545012165450122, + "grad_norm": 1.2977483175873437, + "learning_rate": 5.974098936852083e-06, + "loss": 0.3621, + "step": 4670 + }, + { + "epoch": 0.4545985401459854, + "grad_norm": 1.8005773466107835, + "learning_rate": 5.9725529338498625e-06, + "loss": 0.6883, + "step": 4671 + }, + { + "epoch": 0.45469586374695864, + "grad_norm": 1.342052274382292, + "learning_rate": 5.9710068342044595e-06, + "loss": 0.2238, + "step": 4672 + }, + { + "epoch": 0.4547931873479319, + "grad_norm": 1.4839427117757193, + "learning_rate": 5.969460638069512e-06, + "loss": 0.5407, + "step": 4673 + }, + { + "epoch": 0.4548905109489051, + "grad_norm": 1.5533686272182006, + "learning_rate": 5.967914345598663e-06, + "loss": 0.4274, + "step": 4674 + }, + { + "epoch": 0.45498783454987834, + "grad_norm": 1.4488756951086967, + "learning_rate": 5.966367956945572e-06, + "loss": 0.4039, + "step": 4675 + }, + { + "epoch": 0.4550851581508516, + "grad_norm": 1.3695980945732107, + "learning_rate": 5.964821472263903e-06, + "loss": 0.3252, + "step": 4676 + }, + { + "epoch": 0.4551824817518248, + "grad_norm": 1.3430505750937012, + "learning_rate": 5.96327489170733e-06, + "loss": 0.3255, + "step": 4677 + }, + { + "epoch": 0.45527980535279805, + "grad_norm": 1.4850760962515406, + "learning_rate": 5.96172821542954e-06, + "loss": 0.3645, + "step": 4678 + }, + { + "epoch": 0.4553771289537713, + "grad_norm": 1.428325734127547, + "learning_rate": 5.960181443584226e-06, + "loss": 0.4561, + "step": 4679 + }, + { + "epoch": 0.4554744525547445, + "grad_norm": 1.2874387745049296, + "learning_rate": 5.958634576325093e-06, + "loss": 0.3419, + "step": 4680 + }, + { + "epoch": 0.45557177615571776, + "grad_norm": 1.5518515326284563, + "learning_rate": 5.957087613805851e-06, + "loss": 0.6531, + "step": 4681 + }, + { + "epoch": 0.455669099756691, + "grad_norm": 1.4272935345883002, + "learning_rate": 5.955540556180225e-06, + "loss": 0.3665, + "step": 4682 + }, + { + "epoch": 0.4557664233576642, + "grad_norm": 1.4818369800930495, + "learning_rate": 5.9539934036019465e-06, + "loss": 0.3336, + "step": 4683 + }, + { + "epoch": 0.45586374695863746, + "grad_norm": 1.374483488345197, + "learning_rate": 5.952446156224759e-06, + "loss": 0.2611, + "step": 4684 + }, + { + "epoch": 0.4559610705596107, + "grad_norm": 1.5687184759977315, + "learning_rate": 5.950898814202408e-06, + "loss": 0.5522, + "step": 4685 + }, + { + "epoch": 0.45605839416058397, + "grad_norm": 1.7181432866926953, + "learning_rate": 5.94935137768866e-06, + "loss": 0.7505, + "step": 4686 + }, + { + "epoch": 0.45615571776155717, + "grad_norm": 1.564740824825937, + "learning_rate": 5.94780384683728e-06, + "loss": 0.3494, + "step": 4687 + }, + { + "epoch": 0.4562530413625304, + "grad_norm": 1.1399923163707189, + "learning_rate": 5.946256221802052e-06, + "loss": 0.3039, + "step": 4688 + }, + { + "epoch": 0.4563503649635037, + "grad_norm": 1.55416378228923, + "learning_rate": 5.94470850273676e-06, + "loss": 0.4167, + "step": 4689 + }, + { + "epoch": 0.45644768856447687, + "grad_norm": 1.100399512653527, + "learning_rate": 5.943160689795204e-06, + "loss": 0.2567, + "step": 4690 + }, + { + "epoch": 0.4565450121654501, + "grad_norm": 1.6707898477554108, + "learning_rate": 5.941612783131191e-06, + "loss": 0.4441, + "step": 4691 + }, + { + "epoch": 0.4566423357664234, + "grad_norm": 1.3812763712165028, + "learning_rate": 5.940064782898535e-06, + "loss": 0.4095, + "step": 4692 + }, + { + "epoch": 0.4567396593673966, + "grad_norm": 1.401481979932014, + "learning_rate": 5.938516689251065e-06, + "loss": 0.278, + "step": 4693 + }, + { + "epoch": 0.45683698296836983, + "grad_norm": 1.450856560892366, + "learning_rate": 5.936968502342614e-06, + "loss": 0.333, + "step": 4694 + }, + { + "epoch": 0.4569343065693431, + "grad_norm": 1.499290298416481, + "learning_rate": 5.935420222327028e-06, + "loss": 0.4097, + "step": 4695 + }, + { + "epoch": 0.4570316301703163, + "grad_norm": 1.2903930270860955, + "learning_rate": 5.933871849358159e-06, + "loss": 0.4401, + "step": 4696 + }, + { + "epoch": 0.45712895377128954, + "grad_norm": 1.6502016580057606, + "learning_rate": 5.93232338358987e-06, + "loss": 0.5137, + "step": 4697 + }, + { + "epoch": 0.4572262773722628, + "grad_norm": 1.9160637805731742, + "learning_rate": 5.930774825176034e-06, + "loss": 0.35, + "step": 4698 + }, + { + "epoch": 0.457323600973236, + "grad_norm": 1.3674680149584886, + "learning_rate": 5.9292261742705315e-06, + "loss": 0.5852, + "step": 4699 + }, + { + "epoch": 0.45742092457420924, + "grad_norm": 1.2672307224639892, + "learning_rate": 5.927677431027253e-06, + "loss": 0.326, + "step": 4700 + }, + { + "epoch": 0.4575182481751825, + "grad_norm": 1.1561098271538528, + "learning_rate": 5.926128595600098e-06, + "loss": 0.28, + "step": 4701 + }, + { + "epoch": 0.4576155717761557, + "grad_norm": 1.712247582091781, + "learning_rate": 5.9245796681429744e-06, + "loss": 0.3832, + "step": 4702 + }, + { + "epoch": 0.45771289537712895, + "grad_norm": 1.2239785645889314, + "learning_rate": 5.923030648809801e-06, + "loss": 0.3703, + "step": 4703 + }, + { + "epoch": 0.4578102189781022, + "grad_norm": 1.3056423325253275, + "learning_rate": 5.921481537754505e-06, + "loss": 0.2638, + "step": 4704 + }, + { + "epoch": 0.4579075425790754, + "grad_norm": 1.3444712278011954, + "learning_rate": 5.919932335131022e-06, + "loss": 0.4296, + "step": 4705 + }, + { + "epoch": 0.45800486618004865, + "grad_norm": 1.3515867550828926, + "learning_rate": 5.918383041093299e-06, + "loss": 0.2354, + "step": 4706 + }, + { + "epoch": 0.4581021897810219, + "grad_norm": 1.6942273019928202, + "learning_rate": 5.916833655795287e-06, + "loss": 0.5158, + "step": 4707 + }, + { + "epoch": 0.45819951338199516, + "grad_norm": 1.4532183545386486, + "learning_rate": 5.915284179390951e-06, + "loss": 0.3856, + "step": 4708 + }, + { + "epoch": 0.45829683698296836, + "grad_norm": 1.1775832957066483, + "learning_rate": 5.9137346120342655e-06, + "loss": 0.3254, + "step": 4709 + }, + { + "epoch": 0.4583941605839416, + "grad_norm": 1.386752147003762, + "learning_rate": 5.912184953879207e-06, + "loss": 0.423, + "step": 4710 + }, + { + "epoch": 0.45849148418491487, + "grad_norm": 1.1159458329512677, + "learning_rate": 5.910635205079772e-06, + "loss": 0.2296, + "step": 4711 + }, + { + "epoch": 0.45858880778588806, + "grad_norm": 1.4813859257522484, + "learning_rate": 5.909085365789955e-06, + "loss": 0.5599, + "step": 4712 + }, + { + "epoch": 0.4586861313868613, + "grad_norm": 1.3715052842926014, + "learning_rate": 5.907535436163767e-06, + "loss": 0.3495, + "step": 4713 + }, + { + "epoch": 0.45878345498783457, + "grad_norm": 1.4981944025202203, + "learning_rate": 5.905985416355225e-06, + "loss": 0.3131, + "step": 4714 + }, + { + "epoch": 0.45888077858880777, + "grad_norm": 1.6325759351282747, + "learning_rate": 5.904435306518354e-06, + "loss": 0.6115, + "step": 4715 + }, + { + "epoch": 0.458978102189781, + "grad_norm": 1.7392502730482995, + "learning_rate": 5.902885106807193e-06, + "loss": 0.5174, + "step": 4716 + }, + { + "epoch": 0.4590754257907543, + "grad_norm": 1.3934187525403232, + "learning_rate": 5.901334817375782e-06, + "loss": 0.4157, + "step": 4717 + }, + { + "epoch": 0.4591727493917275, + "grad_norm": 1.5194400867877766, + "learning_rate": 5.899784438378177e-06, + "loss": 0.4937, + "step": 4718 + }, + { + "epoch": 0.45927007299270073, + "grad_norm": 1.2421749762137688, + "learning_rate": 5.898233969968439e-06, + "loss": 0.389, + "step": 4719 + }, + { + "epoch": 0.459367396593674, + "grad_norm": 1.2968529389125105, + "learning_rate": 5.89668341230064e-06, + "loss": 0.4248, + "step": 4720 + }, + { + "epoch": 0.4594647201946472, + "grad_norm": 1.7127495493366949, + "learning_rate": 5.895132765528858e-06, + "loss": 0.3465, + "step": 4721 + }, + { + "epoch": 0.45956204379562043, + "grad_norm": 1.4977326741267858, + "learning_rate": 5.893582029807184e-06, + "loss": 0.4816, + "step": 4722 + }, + { + "epoch": 0.4596593673965937, + "grad_norm": 1.4970528737823605, + "learning_rate": 5.892031205289714e-06, + "loss": 0.3091, + "step": 4723 + }, + { + "epoch": 0.4597566909975669, + "grad_norm": 2.0058564034712045, + "learning_rate": 5.890480292130555e-06, + "loss": 0.5912, + "step": 4724 + }, + { + "epoch": 0.45985401459854014, + "grad_norm": 1.7918745703907915, + "learning_rate": 5.888929290483822e-06, + "loss": 0.3417, + "step": 4725 + }, + { + "epoch": 0.4599513381995134, + "grad_norm": 1.572239694322271, + "learning_rate": 5.887378200503639e-06, + "loss": 0.4157, + "step": 4726 + }, + { + "epoch": 0.4600486618004866, + "grad_norm": 1.5704926344763475, + "learning_rate": 5.88582702234414e-06, + "loss": 0.2702, + "step": 4727 + }, + { + "epoch": 0.46014598540145984, + "grad_norm": 1.5454666971045126, + "learning_rate": 5.8842757561594636e-06, + "loss": 0.5971, + "step": 4728 + }, + { + "epoch": 0.4602433090024331, + "grad_norm": 1.4922891152819104, + "learning_rate": 5.882724402103762e-06, + "loss": 0.3751, + "step": 4729 + }, + { + "epoch": 0.46034063260340635, + "grad_norm": 2.476920598371043, + "learning_rate": 5.881172960331194e-06, + "loss": 0.2977, + "step": 4730 + }, + { + "epoch": 0.46043795620437955, + "grad_norm": 1.2731556024557937, + "learning_rate": 5.879621430995927e-06, + "loss": 0.3484, + "step": 4731 + }, + { + "epoch": 0.4605352798053528, + "grad_norm": 1.1937076541120601, + "learning_rate": 5.8780698142521385e-06, + "loss": 0.3455, + "step": 4732 + }, + { + "epoch": 0.46063260340632606, + "grad_norm": 1.3845968856553887, + "learning_rate": 5.8765181102540136e-06, + "loss": 0.368, + "step": 4733 + }, + { + "epoch": 0.46072992700729926, + "grad_norm": 1.2118541078699578, + "learning_rate": 5.874966319155744e-06, + "loss": 0.3384, + "step": 4734 + }, + { + "epoch": 0.4608272506082725, + "grad_norm": 1.3623706651249927, + "learning_rate": 5.873414441111534e-06, + "loss": 0.2656, + "step": 4735 + }, + { + "epoch": 0.46092457420924576, + "grad_norm": 1.4140159641427623, + "learning_rate": 5.871862476275595e-06, + "loss": 0.3252, + "step": 4736 + }, + { + "epoch": 0.46102189781021896, + "grad_norm": 1.2601663114899302, + "learning_rate": 5.870310424802144e-06, + "loss": 0.3305, + "step": 4737 + }, + { + "epoch": 0.4611192214111922, + "grad_norm": 1.530399475555011, + "learning_rate": 5.868758286845413e-06, + "loss": 0.524, + "step": 4738 + }, + { + "epoch": 0.46121654501216547, + "grad_norm": 1.4374656673146151, + "learning_rate": 5.867206062559636e-06, + "loss": 0.444, + "step": 4739 + }, + { + "epoch": 0.46131386861313867, + "grad_norm": 1.3525900930975647, + "learning_rate": 5.865653752099058e-06, + "loss": 0.3673, + "step": 4740 + }, + { + "epoch": 0.4614111922141119, + "grad_norm": 1.138213652709309, + "learning_rate": 5.864101355617937e-06, + "loss": 0.2852, + "step": 4741 + }, + { + "epoch": 0.4615085158150852, + "grad_norm": 1.855979768753775, + "learning_rate": 5.862548873270533e-06, + "loss": 0.7494, + "step": 4742 + }, + { + "epoch": 0.46160583941605837, + "grad_norm": 1.298570742237486, + "learning_rate": 5.860996305211116e-06, + "loss": 0.31, + "step": 4743 + }, + { + "epoch": 0.4617031630170316, + "grad_norm": 1.3800618891728689, + "learning_rate": 5.859443651593968e-06, + "loss": 0.3635, + "step": 4744 + }, + { + "epoch": 0.4618004866180049, + "grad_norm": 1.1578742295256148, + "learning_rate": 5.8578909125733764e-06, + "loss": 0.2873, + "step": 4745 + }, + { + "epoch": 0.4618978102189781, + "grad_norm": 1.3754807845437125, + "learning_rate": 5.856338088303636e-06, + "loss": 0.4375, + "step": 4746 + }, + { + "epoch": 0.46199513381995133, + "grad_norm": 1.3080329262616794, + "learning_rate": 5.854785178939054e-06, + "loss": 0.4391, + "step": 4747 + }, + { + "epoch": 0.4620924574209246, + "grad_norm": 1.4965704813226828, + "learning_rate": 5.853232184633943e-06, + "loss": 0.5334, + "step": 4748 + }, + { + "epoch": 0.4621897810218978, + "grad_norm": 2.2958549258491168, + "learning_rate": 5.851679105542627e-06, + "loss": 0.54, + "step": 4749 + }, + { + "epoch": 0.46228710462287104, + "grad_norm": 1.31437240738189, + "learning_rate": 5.850125941819433e-06, + "loss": 0.2722, + "step": 4750 + }, + { + "epoch": 0.4623844282238443, + "grad_norm": 1.2323280455936996, + "learning_rate": 5.848572693618703e-06, + "loss": 0.3704, + "step": 4751 + }, + { + "epoch": 0.46248175182481754, + "grad_norm": 1.4382237809866574, + "learning_rate": 5.8470193610947825e-06, + "loss": 0.4253, + "step": 4752 + }, + { + "epoch": 0.46257907542579074, + "grad_norm": 1.5270857727253344, + "learning_rate": 5.8454659444020276e-06, + "loss": 0.528, + "step": 4753 + }, + { + "epoch": 0.462676399026764, + "grad_norm": 1.339534089808389, + "learning_rate": 5.843912443694802e-06, + "loss": 0.4199, + "step": 4754 + }, + { + "epoch": 0.46277372262773725, + "grad_norm": 1.6462659951188234, + "learning_rate": 5.8423588591274786e-06, + "loss": 0.198, + "step": 4755 + }, + { + "epoch": 0.46287104622871045, + "grad_norm": 1.46963555443937, + "learning_rate": 5.8408051908544365e-06, + "loss": 0.3966, + "step": 4756 + }, + { + "epoch": 0.4629683698296837, + "grad_norm": 1.3757584820693338, + "learning_rate": 5.8392514390300644e-06, + "loss": 0.4127, + "step": 4757 + }, + { + "epoch": 0.46306569343065695, + "grad_norm": 1.5854642565983017, + "learning_rate": 5.837697603808764e-06, + "loss": 0.3674, + "step": 4758 + }, + { + "epoch": 0.46316301703163015, + "grad_norm": 1.4058604230609197, + "learning_rate": 5.836143685344937e-06, + "loss": 0.1736, + "step": 4759 + }, + { + "epoch": 0.4632603406326034, + "grad_norm": 3.188656914883977, + "learning_rate": 5.834589683792998e-06, + "loss": 0.4361, + "step": 4760 + }, + { + "epoch": 0.46335766423357666, + "grad_norm": 1.277741209948926, + "learning_rate": 5.833035599307367e-06, + "loss": 0.2996, + "step": 4761 + }, + { + "epoch": 0.46345498783454986, + "grad_norm": 1.5163201379867834, + "learning_rate": 5.831481432042479e-06, + "loss": 0.579, + "step": 4762 + }, + { + "epoch": 0.4635523114355231, + "grad_norm": 1.480130924542566, + "learning_rate": 5.8299271821527704e-06, + "loss": 0.4744, + "step": 4763 + }, + { + "epoch": 0.46364963503649637, + "grad_norm": 1.5295331243873154, + "learning_rate": 5.8283728497926865e-06, + "loss": 0.4863, + "step": 4764 + }, + { + "epoch": 0.46374695863746956, + "grad_norm": 1.7381638390825362, + "learning_rate": 5.826818435116684e-06, + "loss": 0.5357, + "step": 4765 + }, + { + "epoch": 0.4638442822384428, + "grad_norm": 1.4226409190659568, + "learning_rate": 5.825263938279223e-06, + "loss": 0.3567, + "step": 4766 + }, + { + "epoch": 0.46394160583941607, + "grad_norm": 1.3756233016354147, + "learning_rate": 5.823709359434779e-06, + "loss": 0.4331, + "step": 4767 + }, + { + "epoch": 0.46403892944038927, + "grad_norm": 1.4866340713320438, + "learning_rate": 5.82215469873783e-06, + "loss": 0.291, + "step": 4768 + }, + { + "epoch": 0.4641362530413625, + "grad_norm": 1.4247021916044675, + "learning_rate": 5.820599956342864e-06, + "loss": 0.2503, + "step": 4769 + }, + { + "epoch": 0.4642335766423358, + "grad_norm": 1.514852346520823, + "learning_rate": 5.819045132404374e-06, + "loss": 0.4879, + "step": 4770 + }, + { + "epoch": 0.464330900243309, + "grad_norm": 1.1723389787867933, + "learning_rate": 5.8174902270768666e-06, + "loss": 0.3285, + "step": 4771 + }, + { + "epoch": 0.46442822384428223, + "grad_norm": 1.6214649189163381, + "learning_rate": 5.8159352405148525e-06, + "loss": 0.6075, + "step": 4772 + }, + { + "epoch": 0.4645255474452555, + "grad_norm": 1.5229859294178767, + "learning_rate": 5.814380172872853e-06, + "loss": 0.448, + "step": 4773 + }, + { + "epoch": 0.46462287104622874, + "grad_norm": 1.3925630159347562, + "learning_rate": 5.812825024305395e-06, + "loss": 0.3614, + "step": 4774 + }, + { + "epoch": 0.46472019464720193, + "grad_norm": 1.2947793597339534, + "learning_rate": 5.8112697949670135e-06, + "loss": 0.3463, + "step": 4775 + }, + { + "epoch": 0.4648175182481752, + "grad_norm": 1.5942348415927672, + "learning_rate": 5.809714485012254e-06, + "loss": 0.3919, + "step": 4776 + }, + { + "epoch": 0.46491484184914844, + "grad_norm": 1.89133736297067, + "learning_rate": 5.808159094595669e-06, + "loss": 0.2706, + "step": 4777 + }, + { + "epoch": 0.46501216545012164, + "grad_norm": 1.4231638673643952, + "learning_rate": 5.806603623871819e-06, + "loss": 0.4138, + "step": 4778 + }, + { + "epoch": 0.4651094890510949, + "grad_norm": 1.7206309343451145, + "learning_rate": 5.80504807299527e-06, + "loss": 0.403, + "step": 4779 + }, + { + "epoch": 0.46520681265206815, + "grad_norm": 1.359607983497329, + "learning_rate": 5.8034924421206e-06, + "loss": 0.3557, + "step": 4780 + }, + { + "epoch": 0.46530413625304134, + "grad_norm": 1.2100118729505402, + "learning_rate": 5.801936731402392e-06, + "loss": 0.3914, + "step": 4781 + }, + { + "epoch": 0.4654014598540146, + "grad_norm": 1.5790450180961257, + "learning_rate": 5.800380940995236e-06, + "loss": 0.403, + "step": 4782 + }, + { + "epoch": 0.46549878345498785, + "grad_norm": 1.1697128695101966, + "learning_rate": 5.798825071053738e-06, + "loss": 0.3045, + "step": 4783 + }, + { + "epoch": 0.46559610705596105, + "grad_norm": 1.5814625412505545, + "learning_rate": 5.7972691217324985e-06, + "loss": 0.6549, + "step": 4784 + }, + { + "epoch": 0.4656934306569343, + "grad_norm": 1.4855521102439881, + "learning_rate": 5.795713093186137e-06, + "loss": 0.4597, + "step": 4785 + }, + { + "epoch": 0.46579075425790756, + "grad_norm": 1.059777361772832, + "learning_rate": 5.794156985569276e-06, + "loss": 0.3115, + "step": 4786 + }, + { + "epoch": 0.46588807785888076, + "grad_norm": 1.5462466418292278, + "learning_rate": 5.792600799036547e-06, + "loss": 0.4384, + "step": 4787 + }, + { + "epoch": 0.465985401459854, + "grad_norm": 1.34648475038298, + "learning_rate": 5.79104453374259e-06, + "loss": 0.2994, + "step": 4788 + }, + { + "epoch": 0.46608272506082726, + "grad_norm": 1.3342291633904535, + "learning_rate": 5.789488189842053e-06, + "loss": 0.3545, + "step": 4789 + }, + { + "epoch": 0.46618004866180046, + "grad_norm": 1.347617814994831, + "learning_rate": 5.787931767489588e-06, + "loss": 0.4196, + "step": 4790 + }, + { + "epoch": 0.4662773722627737, + "grad_norm": 1.394591685462805, + "learning_rate": 5.786375266839859e-06, + "loss": 0.4832, + "step": 4791 + }, + { + "epoch": 0.46637469586374697, + "grad_norm": 1.5611341992108034, + "learning_rate": 5.784818688047536e-06, + "loss": 0.4155, + "step": 4792 + }, + { + "epoch": 0.4664720194647202, + "grad_norm": 1.3150159699228137, + "learning_rate": 5.7832620312672975e-06, + "loss": 0.4826, + "step": 4793 + }, + { + "epoch": 0.4665693430656934, + "grad_norm": 1.4168733090801606, + "learning_rate": 5.7817052966538304e-06, + "loss": 0.5195, + "step": 4794 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.300365897184467, + "learning_rate": 5.780148484361826e-06, + "loss": 0.3441, + "step": 4795 + }, + { + "epoch": 0.4667639902676399, + "grad_norm": 1.3810574013218457, + "learning_rate": 5.778591594545989e-06, + "loss": 0.3596, + "step": 4796 + }, + { + "epoch": 0.4668613138686131, + "grad_norm": 1.4598916946372429, + "learning_rate": 5.777034627361025e-06, + "loss": 0.3572, + "step": 4797 + }, + { + "epoch": 0.4669586374695864, + "grad_norm": 1.4837209766509183, + "learning_rate": 5.775477582961653e-06, + "loss": 0.342, + "step": 4798 + }, + { + "epoch": 0.46705596107055963, + "grad_norm": 1.6607214111550288, + "learning_rate": 5.7739204615025975e-06, + "loss": 0.3814, + "step": 4799 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 1.652182568924618, + "learning_rate": 5.772363263138589e-06, + "loss": 0.5886, + "step": 4800 + }, + { + "epoch": 0.4672506082725061, + "grad_norm": 1.1429975734160958, + "learning_rate": 5.770805988024371e-06, + "loss": 0.3413, + "step": 4801 + }, + { + "epoch": 0.46734793187347934, + "grad_norm": 1.432321009479854, + "learning_rate": 5.769248636314686e-06, + "loss": 0.386, + "step": 4802 + }, + { + "epoch": 0.46744525547445254, + "grad_norm": 1.3423489652240848, + "learning_rate": 5.767691208164291e-06, + "loss": 0.3524, + "step": 4803 + }, + { + "epoch": 0.4675425790754258, + "grad_norm": 1.2135395273781886, + "learning_rate": 5.766133703727948e-06, + "loss": 0.3286, + "step": 4804 + }, + { + "epoch": 0.46763990267639904, + "grad_norm": 1.4955897665118012, + "learning_rate": 5.76457612316043e-06, + "loss": 0.5215, + "step": 4805 + }, + { + "epoch": 0.46773722627737224, + "grad_norm": 1.3141062762104827, + "learning_rate": 5.7630184666165125e-06, + "loss": 0.3436, + "step": 4806 + }, + { + "epoch": 0.4678345498783455, + "grad_norm": 1.7784427808040708, + "learning_rate": 5.761460734250981e-06, + "loss": 0.3238, + "step": 4807 + }, + { + "epoch": 0.46793187347931875, + "grad_norm": 1.4663382675516843, + "learning_rate": 5.759902926218627e-06, + "loss": 0.4586, + "step": 4808 + }, + { + "epoch": 0.46802919708029195, + "grad_norm": 1.3903432410612793, + "learning_rate": 5.758345042674253e-06, + "loss": 0.3347, + "step": 4809 + }, + { + "epoch": 0.4681265206812652, + "grad_norm": 1.1531113993351785, + "learning_rate": 5.7567870837726655e-06, + "loss": 0.296, + "step": 4810 + }, + { + "epoch": 0.46822384428223845, + "grad_norm": 1.5297628693915974, + "learning_rate": 5.755229049668681e-06, + "loss": 0.4743, + "step": 4811 + }, + { + "epoch": 0.46832116788321165, + "grad_norm": 2.2740622379774096, + "learning_rate": 5.753670940517122e-06, + "loss": 0.2528, + "step": 4812 + }, + { + "epoch": 0.4684184914841849, + "grad_norm": 1.3345646636320794, + "learning_rate": 5.752112756472818e-06, + "loss": 0.4653, + "step": 4813 + }, + { + "epoch": 0.46851581508515816, + "grad_norm": 1.357079200578268, + "learning_rate": 5.7505544976906055e-06, + "loss": 0.4277, + "step": 4814 + }, + { + "epoch": 0.4686131386861314, + "grad_norm": 1.3251376944197377, + "learning_rate": 5.748996164325332e-06, + "loss": 0.3701, + "step": 4815 + }, + { + "epoch": 0.4687104622871046, + "grad_norm": 1.2991457443232788, + "learning_rate": 5.747437756531851e-06, + "loss": 0.4338, + "step": 4816 + }, + { + "epoch": 0.46880778588807787, + "grad_norm": 1.596269822083475, + "learning_rate": 5.7458792744650206e-06, + "loss": 0.6416, + "step": 4817 + }, + { + "epoch": 0.4689051094890511, + "grad_norm": 2.0883872973184534, + "learning_rate": 5.7443207182797066e-06, + "loss": 0.4152, + "step": 4818 + }, + { + "epoch": 0.4690024330900243, + "grad_norm": 1.6561593084876625, + "learning_rate": 5.742762088130785e-06, + "loss": 0.3861, + "step": 4819 + }, + { + "epoch": 0.46909975669099757, + "grad_norm": 1.2816432721483586, + "learning_rate": 5.741203384173139e-06, + "loss": 0.3791, + "step": 4820 + }, + { + "epoch": 0.4691970802919708, + "grad_norm": 1.2696822326085209, + "learning_rate": 5.7396446065616585e-06, + "loss": 0.3685, + "step": 4821 + }, + { + "epoch": 0.469294403892944, + "grad_norm": 1.5827454261840233, + "learning_rate": 5.738085755451237e-06, + "loss": 0.5133, + "step": 4822 + }, + { + "epoch": 0.4693917274939173, + "grad_norm": 1.5409051543697243, + "learning_rate": 5.736526830996782e-06, + "loss": 0.3443, + "step": 4823 + }, + { + "epoch": 0.46948905109489053, + "grad_norm": 1.7383513361485372, + "learning_rate": 5.734967833353201e-06, + "loss": 0.2957, + "step": 4824 + }, + { + "epoch": 0.46958637469586373, + "grad_norm": 1.1363874129495541, + "learning_rate": 5.733408762675415e-06, + "loss": 0.3485, + "step": 4825 + }, + { + "epoch": 0.469683698296837, + "grad_norm": 1.191790759465704, + "learning_rate": 5.73184961911835e-06, + "loss": 0.3427, + "step": 4826 + }, + { + "epoch": 0.46978102189781024, + "grad_norm": 1.496233184133573, + "learning_rate": 5.7302904028369386e-06, + "loss": 0.4096, + "step": 4827 + }, + { + "epoch": 0.46987834549878343, + "grad_norm": 1.8262031430102197, + "learning_rate": 5.728731113986122e-06, + "loss": 0.6644, + "step": 4828 + }, + { + "epoch": 0.4699756690997567, + "grad_norm": 1.4064090214992204, + "learning_rate": 5.727171752720846e-06, + "loss": 0.3706, + "step": 4829 + }, + { + "epoch": 0.47007299270072994, + "grad_norm": 1.411066158000839, + "learning_rate": 5.725612319196065e-06, + "loss": 0.3495, + "step": 4830 + }, + { + "epoch": 0.47017031630170314, + "grad_norm": 1.7578993034123143, + "learning_rate": 5.724052813566742e-06, + "loss": 0.4626, + "step": 4831 + }, + { + "epoch": 0.4702676399026764, + "grad_norm": 1.412637867571139, + "learning_rate": 5.722493235987847e-06, + "loss": 0.329, + "step": 4832 + }, + { + "epoch": 0.47036496350364965, + "grad_norm": 1.3267675306485687, + "learning_rate": 5.720933586614355e-06, + "loss": 0.401, + "step": 4833 + }, + { + "epoch": 0.47046228710462284, + "grad_norm": 1.5867750089317443, + "learning_rate": 5.719373865601249e-06, + "loss": 0.5137, + "step": 4834 + }, + { + "epoch": 0.4705596107055961, + "grad_norm": 1.4410510775681757, + "learning_rate": 5.7178140731035195e-06, + "loss": 0.3148, + "step": 4835 + }, + { + "epoch": 0.47065693430656935, + "grad_norm": 1.1879705199620183, + "learning_rate": 5.716254209276163e-06, + "loss": 0.3546, + "step": 4836 + }, + { + "epoch": 0.4707542579075426, + "grad_norm": 1.419039571510067, + "learning_rate": 5.714694274274189e-06, + "loss": 0.4785, + "step": 4837 + }, + { + "epoch": 0.4708515815085158, + "grad_norm": 1.3906630607282704, + "learning_rate": 5.713134268252603e-06, + "loss": 0.4719, + "step": 4838 + }, + { + "epoch": 0.47094890510948906, + "grad_norm": 1.5072844376178671, + "learning_rate": 5.711574191366427e-06, + "loss": 0.5591, + "step": 4839 + }, + { + "epoch": 0.4710462287104623, + "grad_norm": 1.4731496293052737, + "learning_rate": 5.710014043770686e-06, + "loss": 0.4348, + "step": 4840 + }, + { + "epoch": 0.4711435523114355, + "grad_norm": 1.4626012294328556, + "learning_rate": 5.708453825620413e-06, + "loss": 0.3738, + "step": 4841 + }, + { + "epoch": 0.47124087591240876, + "grad_norm": 1.3952035859287184, + "learning_rate": 5.706893537070648e-06, + "loss": 0.3287, + "step": 4842 + }, + { + "epoch": 0.471338199513382, + "grad_norm": 1.2682325087652746, + "learning_rate": 5.705333178276439e-06, + "loss": 0.3744, + "step": 4843 + }, + { + "epoch": 0.4714355231143552, + "grad_norm": 1.3258810047392384, + "learning_rate": 5.7037727493928374e-06, + "loss": 0.4449, + "step": 4844 + }, + { + "epoch": 0.47153284671532847, + "grad_norm": 1.0974041523713896, + "learning_rate": 5.702212250574905e-06, + "loss": 0.3161, + "step": 4845 + }, + { + "epoch": 0.4716301703163017, + "grad_norm": 1.6106032341998873, + "learning_rate": 5.7006516819777105e-06, + "loss": 0.4791, + "step": 4846 + }, + { + "epoch": 0.4717274939172749, + "grad_norm": 1.4543270758588145, + "learning_rate": 5.699091043756326e-06, + "loss": 0.4122, + "step": 4847 + }, + { + "epoch": 0.4718248175182482, + "grad_norm": 1.1669639424373461, + "learning_rate": 5.697530336065837e-06, + "loss": 0.3915, + "step": 4848 + }, + { + "epoch": 0.4719221411192214, + "grad_norm": 1.7484002294309418, + "learning_rate": 5.695969559061328e-06, + "loss": 0.6337, + "step": 4849 + }, + { + "epoch": 0.4720194647201946, + "grad_norm": 1.1084749035779666, + "learning_rate": 5.694408712897898e-06, + "loss": 0.2675, + "step": 4850 + }, + { + "epoch": 0.4721167883211679, + "grad_norm": 1.7836386743035124, + "learning_rate": 5.692847797730644e-06, + "loss": 0.2981, + "step": 4851 + }, + { + "epoch": 0.47221411192214113, + "grad_norm": 1.3478942218702235, + "learning_rate": 5.691286813714682e-06, + "loss": 0.3685, + "step": 4852 + }, + { + "epoch": 0.47231143552311433, + "grad_norm": 1.4550135655770304, + "learning_rate": 5.6897257610051225e-06, + "loss": 0.2745, + "step": 4853 + }, + { + "epoch": 0.4724087591240876, + "grad_norm": 1.4982703090789, + "learning_rate": 5.688164639757091e-06, + "loss": 0.1822, + "step": 4854 + }, + { + "epoch": 0.47250608272506084, + "grad_norm": 1.4542555154292847, + "learning_rate": 5.686603450125717e-06, + "loss": 0.4963, + "step": 4855 + }, + { + "epoch": 0.47260340632603404, + "grad_norm": 1.3239793589213282, + "learning_rate": 5.685042192266134e-06, + "loss": 0.4562, + "step": 4856 + }, + { + "epoch": 0.4727007299270073, + "grad_norm": 1.5861723026037755, + "learning_rate": 5.683480866333489e-06, + "loss": 0.502, + "step": 4857 + }, + { + "epoch": 0.47279805352798054, + "grad_norm": 1.5279472700177954, + "learning_rate": 5.68191947248293e-06, + "loss": 0.482, + "step": 4858 + }, + { + "epoch": 0.4728953771289538, + "grad_norm": 1.6153320873369572, + "learning_rate": 5.680358010869613e-06, + "loss": 0.4353, + "step": 4859 + }, + { + "epoch": 0.472992700729927, + "grad_norm": 1.348796505837762, + "learning_rate": 5.678796481648703e-06, + "loss": 0.3374, + "step": 4860 + }, + { + "epoch": 0.47309002433090025, + "grad_norm": 1.4866060856733978, + "learning_rate": 5.677234884975369e-06, + "loss": 0.3824, + "step": 4861 + }, + { + "epoch": 0.4731873479318735, + "grad_norm": 1.5254879968021549, + "learning_rate": 5.675673221004788e-06, + "loss": 0.449, + "step": 4862 + }, + { + "epoch": 0.4732846715328467, + "grad_norm": 1.3671322184694132, + "learning_rate": 5.674111489892144e-06, + "loss": 0.4025, + "step": 4863 + }, + { + "epoch": 0.47338199513381995, + "grad_norm": 1.0117031965201657, + "learning_rate": 5.672549691792629e-06, + "loss": 0.2595, + "step": 4864 + }, + { + "epoch": 0.4734793187347932, + "grad_norm": 1.281972448847432, + "learning_rate": 5.670987826861435e-06, + "loss": 0.3899, + "step": 4865 + }, + { + "epoch": 0.4735766423357664, + "grad_norm": 1.3228606064500106, + "learning_rate": 5.669425895253769e-06, + "loss": 0.3252, + "step": 4866 + }, + { + "epoch": 0.47367396593673966, + "grad_norm": 1.254391205079037, + "learning_rate": 5.66786389712484e-06, + "loss": 0.3983, + "step": 4867 + }, + { + "epoch": 0.4737712895377129, + "grad_norm": 1.2578585598491165, + "learning_rate": 5.666301832629866e-06, + "loss": 0.3728, + "step": 4868 + }, + { + "epoch": 0.4738686131386861, + "grad_norm": 1.7176300561518312, + "learning_rate": 5.664739701924069e-06, + "loss": 0.4013, + "step": 4869 + }, + { + "epoch": 0.47396593673965937, + "grad_norm": 1.9698213962010664, + "learning_rate": 5.663177505162679e-06, + "loss": 0.5781, + "step": 4870 + }, + { + "epoch": 0.4740632603406326, + "grad_norm": 1.4756819102434395, + "learning_rate": 5.661615242500933e-06, + "loss": 0.3928, + "step": 4871 + }, + { + "epoch": 0.4741605839416058, + "grad_norm": 1.587339636445031, + "learning_rate": 5.660052914094073e-06, + "loss": 0.4726, + "step": 4872 + }, + { + "epoch": 0.47425790754257907, + "grad_norm": 1.5365933978527655, + "learning_rate": 5.658490520097351e-06, + "loss": 0.4451, + "step": 4873 + }, + { + "epoch": 0.4743552311435523, + "grad_norm": 1.5917052072915214, + "learning_rate": 5.656928060666018e-06, + "loss": 0.3587, + "step": 4874 + }, + { + "epoch": 0.4744525547445255, + "grad_norm": 1.4206774352936657, + "learning_rate": 5.655365535955343e-06, + "loss": 0.4493, + "step": 4875 + }, + { + "epoch": 0.4745498783454988, + "grad_norm": 1.340672762004427, + "learning_rate": 5.65380294612059e-06, + "loss": 0.455, + "step": 4876 + }, + { + "epoch": 0.47464720194647203, + "grad_norm": 1.3776284584062226, + "learning_rate": 5.652240291317037e-06, + "loss": 0.3514, + "step": 4877 + }, + { + "epoch": 0.47474452554744523, + "grad_norm": 1.7446009593383291, + "learning_rate": 5.650677571699965e-06, + "loss": 0.5945, + "step": 4878 + }, + { + "epoch": 0.4748418491484185, + "grad_norm": 1.5843637861046846, + "learning_rate": 5.6491147874246636e-06, + "loss": 0.5681, + "step": 4879 + }, + { + "epoch": 0.47493917274939174, + "grad_norm": 1.3959518164077136, + "learning_rate": 5.647551938646426e-06, + "loss": 0.3407, + "step": 4880 + }, + { + "epoch": 0.475036496350365, + "grad_norm": 1.3839884784582743, + "learning_rate": 5.645989025520555e-06, + "loss": 0.2746, + "step": 4881 + }, + { + "epoch": 0.4751338199513382, + "grad_norm": 1.8844954697293113, + "learning_rate": 5.644426048202357e-06, + "loss": 0.3953, + "step": 4882 + }, + { + "epoch": 0.47523114355231144, + "grad_norm": 1.2908271501925401, + "learning_rate": 5.642863006847146e-06, + "loss": 0.3223, + "step": 4883 + }, + { + "epoch": 0.4753284671532847, + "grad_norm": 1.4328633040445249, + "learning_rate": 5.641299901610244e-06, + "loss": 0.4966, + "step": 4884 + }, + { + "epoch": 0.4754257907542579, + "grad_norm": 1.5146698449083573, + "learning_rate": 5.639736732646977e-06, + "loss": 0.2527, + "step": 4885 + }, + { + "epoch": 0.47552311435523115, + "grad_norm": 1.3432001275183485, + "learning_rate": 5.638173500112676e-06, + "loss": 0.4179, + "step": 4886 + }, + { + "epoch": 0.4756204379562044, + "grad_norm": 1.2322461082999159, + "learning_rate": 5.6366102041626825e-06, + "loss": 0.3818, + "step": 4887 + }, + { + "epoch": 0.4757177615571776, + "grad_norm": 1.3767714360266998, + "learning_rate": 5.635046844952342e-06, + "loss": 0.4691, + "step": 4888 + }, + { + "epoch": 0.47581508515815085, + "grad_norm": 1.314630958351386, + "learning_rate": 5.633483422637005e-06, + "loss": 0.4198, + "step": 4889 + }, + { + "epoch": 0.4759124087591241, + "grad_norm": 1.8890943399906002, + "learning_rate": 5.631919937372034e-06, + "loss": 0.8231, + "step": 4890 + }, + { + "epoch": 0.4760097323600973, + "grad_norm": 1.3889867583866484, + "learning_rate": 5.6303563893127885e-06, + "loss": 0.4546, + "step": 4891 + }, + { + "epoch": 0.47610705596107056, + "grad_norm": 1.6799912288981764, + "learning_rate": 5.62879277861464e-06, + "loss": 0.4559, + "step": 4892 + }, + { + "epoch": 0.4762043795620438, + "grad_norm": 1.4226023731367958, + "learning_rate": 5.627229105432968e-06, + "loss": 0.2873, + "step": 4893 + }, + { + "epoch": 0.476301703163017, + "grad_norm": 1.6722706834537924, + "learning_rate": 5.6256653699231535e-06, + "loss": 0.3074, + "step": 4894 + }, + { + "epoch": 0.47639902676399026, + "grad_norm": 1.5132051006609315, + "learning_rate": 5.624101572240588e-06, + "loss": 0.2159, + "step": 4895 + }, + { + "epoch": 0.4764963503649635, + "grad_norm": 1.4105581624866415, + "learning_rate": 5.622537712540664e-06, + "loss": 0.3531, + "step": 4896 + }, + { + "epoch": 0.4765936739659367, + "grad_norm": 1.5128648932544173, + "learning_rate": 5.6209737909787864e-06, + "loss": 0.583, + "step": 4897 + }, + { + "epoch": 0.47669099756690997, + "grad_norm": 1.3110258361371465, + "learning_rate": 5.619409807710361e-06, + "loss": 0.2983, + "step": 4898 + }, + { + "epoch": 0.4767883211678832, + "grad_norm": 1.297295283960915, + "learning_rate": 5.617845762890801e-06, + "loss": 0.3841, + "step": 4899 + }, + { + "epoch": 0.4768856447688564, + "grad_norm": 1.5268473489250836, + "learning_rate": 5.616281656675529e-06, + "loss": 0.3997, + "step": 4900 + }, + { + "epoch": 0.4769829683698297, + "grad_norm": 1.1738970794989763, + "learning_rate": 5.614717489219969e-06, + "loss": 0.3441, + "step": 4901 + }, + { + "epoch": 0.4770802919708029, + "grad_norm": 1.4550986064519102, + "learning_rate": 5.613153260679557e-06, + "loss": 0.4218, + "step": 4902 + }, + { + "epoch": 0.4771776155717762, + "grad_norm": 1.2110212727764387, + "learning_rate": 5.611588971209726e-06, + "loss": 0.2402, + "step": 4903 + }, + { + "epoch": 0.4772749391727494, + "grad_norm": 1.341140342133277, + "learning_rate": 5.610024620965924e-06, + "loss": 0.5273, + "step": 4904 + }, + { + "epoch": 0.47737226277372263, + "grad_norm": 1.4563750151972352, + "learning_rate": 5.608460210103599e-06, + "loss": 0.2982, + "step": 4905 + }, + { + "epoch": 0.4774695863746959, + "grad_norm": 1.4238212595749262, + "learning_rate": 5.606895738778211e-06, + "loss": 0.4861, + "step": 4906 + }, + { + "epoch": 0.4775669099756691, + "grad_norm": 1.1703006596087244, + "learning_rate": 5.605331207145219e-06, + "loss": 0.2687, + "step": 4907 + }, + { + "epoch": 0.47766423357664234, + "grad_norm": 1.4732598400916934, + "learning_rate": 5.603766615360094e-06, + "loss": 0.3763, + "step": 4908 + }, + { + "epoch": 0.4777615571776156, + "grad_norm": 1.2987517140792977, + "learning_rate": 5.602201963578308e-06, + "loss": 0.3241, + "step": 4909 + }, + { + "epoch": 0.4778588807785888, + "grad_norm": 1.4014749625096345, + "learning_rate": 5.600637251955343e-06, + "loss": 0.3606, + "step": 4910 + }, + { + "epoch": 0.47795620437956204, + "grad_norm": 1.5299494923252082, + "learning_rate": 5.599072480646686e-06, + "loss": 0.4809, + "step": 4911 + }, + { + "epoch": 0.4780535279805353, + "grad_norm": 1.5049743030420062, + "learning_rate": 5.597507649807828e-06, + "loss": 0.552, + "step": 4912 + }, + { + "epoch": 0.4781508515815085, + "grad_norm": 1.3122576989449315, + "learning_rate": 5.595942759594268e-06, + "loss": 0.3756, + "step": 4913 + }, + { + "epoch": 0.47824817518248175, + "grad_norm": 1.2378409291743957, + "learning_rate": 5.594377810161509e-06, + "loss": 0.3848, + "step": 4914 + }, + { + "epoch": 0.478345498783455, + "grad_norm": 1.4881152763834071, + "learning_rate": 5.592812801665062e-06, + "loss": 0.6671, + "step": 4915 + }, + { + "epoch": 0.4784428223844282, + "grad_norm": 1.083624702787122, + "learning_rate": 5.591247734260441e-06, + "loss": 0.2953, + "step": 4916 + }, + { + "epoch": 0.47854014598540145, + "grad_norm": 1.506118534413334, + "learning_rate": 5.589682608103172e-06, + "loss": 0.5348, + "step": 4917 + }, + { + "epoch": 0.4786374695863747, + "grad_norm": 1.4154348721520453, + "learning_rate": 5.588117423348779e-06, + "loss": 0.4859, + "step": 4918 + }, + { + "epoch": 0.4787347931873479, + "grad_norm": 1.3776882941573374, + "learning_rate": 5.586552180152795e-06, + "loss": 0.3041, + "step": 4919 + }, + { + "epoch": 0.47883211678832116, + "grad_norm": 1.4627494794850848, + "learning_rate": 5.58498687867076e-06, + "loss": 0.5225, + "step": 4920 + }, + { + "epoch": 0.4789294403892944, + "grad_norm": 1.7558812361768268, + "learning_rate": 5.583421519058221e-06, + "loss": 0.3869, + "step": 4921 + }, + { + "epoch": 0.4790267639902676, + "grad_norm": 1.3597510683616933, + "learning_rate": 5.5818561014707265e-06, + "loss": 0.3343, + "step": 4922 + }, + { + "epoch": 0.47912408759124087, + "grad_norm": 1.496715844598856, + "learning_rate": 5.580290626063833e-06, + "loss": 0.6334, + "step": 4923 + }, + { + "epoch": 0.4792214111922141, + "grad_norm": 1.62323606561308, + "learning_rate": 5.578725092993103e-06, + "loss": 0.6151, + "step": 4924 + }, + { + "epoch": 0.4793187347931874, + "grad_norm": 1.3797589224174194, + "learning_rate": 5.577159502414105e-06, + "loss": 0.3447, + "step": 4925 + }, + { + "epoch": 0.47941605839416057, + "grad_norm": 1.6225545528980303, + "learning_rate": 5.575593854482414e-06, + "loss": 0.3989, + "step": 4926 + }, + { + "epoch": 0.4795133819951338, + "grad_norm": 1.5792786564367016, + "learning_rate": 5.574028149353607e-06, + "loss": 0.4858, + "step": 4927 + }, + { + "epoch": 0.4796107055961071, + "grad_norm": 1.6675443086928328, + "learning_rate": 5.57246238718327e-06, + "loss": 0.5457, + "step": 4928 + }, + { + "epoch": 0.4797080291970803, + "grad_norm": 1.4119889457286139, + "learning_rate": 5.570896568126994e-06, + "loss": 0.3692, + "step": 4929 + }, + { + "epoch": 0.47980535279805353, + "grad_norm": 1.4957501288236537, + "learning_rate": 5.569330692340372e-06, + "loss": 0.4741, + "step": 4930 + }, + { + "epoch": 0.4799026763990268, + "grad_norm": 1.4937942402565056, + "learning_rate": 5.567764759979013e-06, + "loss": 0.4398, + "step": 4931 + }, + { + "epoch": 0.48, + "grad_norm": 1.302421978464306, + "learning_rate": 5.566198771198519e-06, + "loss": 0.3684, + "step": 4932 + }, + { + "epoch": 0.48009732360097324, + "grad_norm": 1.4896919193126248, + "learning_rate": 5.564632726154506e-06, + "loss": 0.5215, + "step": 4933 + }, + { + "epoch": 0.4801946472019465, + "grad_norm": 1.3684396296081907, + "learning_rate": 5.5630666250025924e-06, + "loss": 0.3742, + "step": 4934 + }, + { + "epoch": 0.4802919708029197, + "grad_norm": 1.4541732944603456, + "learning_rate": 5.5615004678984005e-06, + "loss": 0.389, + "step": 4935 + }, + { + "epoch": 0.48038929440389294, + "grad_norm": 1.2989944695669944, + "learning_rate": 5.559934254997563e-06, + "loss": 0.2933, + "step": 4936 + }, + { + "epoch": 0.4804866180048662, + "grad_norm": 1.2551515758973009, + "learning_rate": 5.558367986455715e-06, + "loss": 0.2312, + "step": 4937 + }, + { + "epoch": 0.4805839416058394, + "grad_norm": 1.6383916276486878, + "learning_rate": 5.556801662428497e-06, + "loss": 0.3433, + "step": 4938 + }, + { + "epoch": 0.48068126520681265, + "grad_norm": 1.395987981323709, + "learning_rate": 5.555235283071554e-06, + "loss": 0.2205, + "step": 4939 + }, + { + "epoch": 0.4807785888077859, + "grad_norm": 1.5870226074656453, + "learning_rate": 5.5536688485405395e-06, + "loss": 0.5398, + "step": 4940 + }, + { + "epoch": 0.4808759124087591, + "grad_norm": 1.4049976282337302, + "learning_rate": 5.5521023589911124e-06, + "loss": 0.3912, + "step": 4941 + }, + { + "epoch": 0.48097323600973235, + "grad_norm": 1.4051654448113469, + "learning_rate": 5.550535814578935e-06, + "loss": 0.3652, + "step": 4942 + }, + { + "epoch": 0.4810705596107056, + "grad_norm": 1.3749511016473956, + "learning_rate": 5.548969215459674e-06, + "loss": 0.4801, + "step": 4943 + }, + { + "epoch": 0.48116788321167886, + "grad_norm": 1.807326200157131, + "learning_rate": 5.547402561789007e-06, + "loss": 0.6021, + "step": 4944 + }, + { + "epoch": 0.48126520681265206, + "grad_norm": 1.534704554452748, + "learning_rate": 5.545835853722609e-06, + "loss": 0.5896, + "step": 4945 + }, + { + "epoch": 0.4813625304136253, + "grad_norm": 1.5090586202369751, + "learning_rate": 5.544269091416165e-06, + "loss": 0.3395, + "step": 4946 + }, + { + "epoch": 0.48145985401459857, + "grad_norm": 1.3434490044175096, + "learning_rate": 5.542702275025371e-06, + "loss": 0.2682, + "step": 4947 + }, + { + "epoch": 0.48155717761557176, + "grad_norm": 1.2846389239459437, + "learning_rate": 5.541135404705915e-06, + "loss": 0.3631, + "step": 4948 + }, + { + "epoch": 0.481654501216545, + "grad_norm": 1.2126695612690586, + "learning_rate": 5.5395684806135046e-06, + "loss": 0.2274, + "step": 4949 + }, + { + "epoch": 0.48175182481751827, + "grad_norm": 1.2719046137031083, + "learning_rate": 5.538001502903839e-06, + "loss": 0.3379, + "step": 4950 + }, + { + "epoch": 0.48184914841849147, + "grad_norm": 1.7949003564916597, + "learning_rate": 5.536434471732635e-06, + "loss": 0.5321, + "step": 4951 + }, + { + "epoch": 0.4819464720194647, + "grad_norm": 1.5533075390394497, + "learning_rate": 5.534867387255607e-06, + "loss": 0.3958, + "step": 4952 + }, + { + "epoch": 0.482043795620438, + "grad_norm": 1.9088016379615553, + "learning_rate": 5.533300249628479e-06, + "loss": 0.4515, + "step": 4953 + }, + { + "epoch": 0.4821411192214112, + "grad_norm": 3.884497294261637, + "learning_rate": 5.531733059006978e-06, + "loss": 0.3338, + "step": 4954 + }, + { + "epoch": 0.4822384428223844, + "grad_norm": 1.9162863754930761, + "learning_rate": 5.530165815546835e-06, + "loss": 0.3446, + "step": 4955 + }, + { + "epoch": 0.4823357664233577, + "grad_norm": 1.1899820789622602, + "learning_rate": 5.528598519403788e-06, + "loss": 0.3353, + "step": 4956 + }, + { + "epoch": 0.4824330900243309, + "grad_norm": 1.2085002367353228, + "learning_rate": 5.527031170733583e-06, + "loss": 0.3172, + "step": 4957 + }, + { + "epoch": 0.48253041362530413, + "grad_norm": 1.4730257334743548, + "learning_rate": 5.525463769691967e-06, + "loss": 0.4526, + "step": 4958 + }, + { + "epoch": 0.4826277372262774, + "grad_norm": 1.5591799749760946, + "learning_rate": 5.523896316434692e-06, + "loss": 0.462, + "step": 4959 + }, + { + "epoch": 0.4827250608272506, + "grad_norm": 2.1529087218245127, + "learning_rate": 5.522328811117519e-06, + "loss": 0.4195, + "step": 4960 + }, + { + "epoch": 0.48282238442822384, + "grad_norm": 1.744038372737554, + "learning_rate": 5.52076125389621e-06, + "loss": 0.6677, + "step": 4961 + }, + { + "epoch": 0.4829197080291971, + "grad_norm": 1.4795186962610711, + "learning_rate": 5.5191936449265345e-06, + "loss": 0.4061, + "step": 4962 + }, + { + "epoch": 0.4830170316301703, + "grad_norm": 1.7915332507945234, + "learning_rate": 5.517625984364269e-06, + "loss": 0.5287, + "step": 4963 + }, + { + "epoch": 0.48311435523114354, + "grad_norm": 1.5862892399737336, + "learning_rate": 5.5160582723651905e-06, + "loss": 0.3734, + "step": 4964 + }, + { + "epoch": 0.4832116788321168, + "grad_norm": 1.3700896061182768, + "learning_rate": 5.514490509085084e-06, + "loss": 0.4018, + "step": 4965 + }, + { + "epoch": 0.48330900243309005, + "grad_norm": 1.3501396848679752, + "learning_rate": 5.512922694679739e-06, + "loss": 0.3885, + "step": 4966 + }, + { + "epoch": 0.48340632603406325, + "grad_norm": 1.3364214104388243, + "learning_rate": 5.511354829304952e-06, + "loss": 0.4145, + "step": 4967 + }, + { + "epoch": 0.4835036496350365, + "grad_norm": 1.5072023412269178, + "learning_rate": 5.509786913116521e-06, + "loss": 0.4158, + "step": 4968 + }, + { + "epoch": 0.48360097323600976, + "grad_norm": 1.433925746133814, + "learning_rate": 5.508218946270251e-06, + "loss": 0.3939, + "step": 4969 + }, + { + "epoch": 0.48369829683698295, + "grad_norm": 1.3899463326227424, + "learning_rate": 5.5066509289219505e-06, + "loss": 0.427, + "step": 4970 + }, + { + "epoch": 0.4837956204379562, + "grad_norm": 1.3961752382649586, + "learning_rate": 5.505082861227437e-06, + "loss": 0.3507, + "step": 4971 + }, + { + "epoch": 0.48389294403892946, + "grad_norm": 1.4261266002714594, + "learning_rate": 5.50351474334253e-06, + "loss": 0.3776, + "step": 4972 + }, + { + "epoch": 0.48399026763990266, + "grad_norm": 1.3049976642631587, + "learning_rate": 5.501946575423051e-06, + "loss": 0.2857, + "step": 4973 + }, + { + "epoch": 0.4840875912408759, + "grad_norm": 1.7623259519308563, + "learning_rate": 5.500378357624835e-06, + "loss": 0.4472, + "step": 4974 + }, + { + "epoch": 0.48418491484184917, + "grad_norm": 1.6750772771516935, + "learning_rate": 5.498810090103712e-06, + "loss": 0.3952, + "step": 4975 + }, + { + "epoch": 0.48428223844282237, + "grad_norm": 1.326746820196234, + "learning_rate": 5.4972417730155256e-06, + "loss": 0.2186, + "step": 4976 + }, + { + "epoch": 0.4843795620437956, + "grad_norm": 2.6917502926648904, + "learning_rate": 5.4956734065161176e-06, + "loss": 0.4336, + "step": 4977 + }, + { + "epoch": 0.4844768856447689, + "grad_norm": 1.5933122220629257, + "learning_rate": 5.494104990761338e-06, + "loss": 0.3989, + "step": 4978 + }, + { + "epoch": 0.48457420924574207, + "grad_norm": 1.562234703429822, + "learning_rate": 5.492536525907042e-06, + "loss": 0.28, + "step": 4979 + }, + { + "epoch": 0.4846715328467153, + "grad_norm": 1.155189440165412, + "learning_rate": 5.490968012109089e-06, + "loss": 0.3582, + "step": 4980 + }, + { + "epoch": 0.4847688564476886, + "grad_norm": 1.504468698725549, + "learning_rate": 5.489399449523343e-06, + "loss": 0.4868, + "step": 4981 + }, + { + "epoch": 0.4848661800486618, + "grad_norm": 1.6600998094725776, + "learning_rate": 5.4878308383056735e-06, + "loss": 0.551, + "step": 4982 + }, + { + "epoch": 0.48496350364963503, + "grad_norm": 1.6646907856061826, + "learning_rate": 5.486262178611953e-06, + "loss": 0.3845, + "step": 4983 + }, + { + "epoch": 0.4850608272506083, + "grad_norm": 2.203031497350937, + "learning_rate": 5.484693470598061e-06, + "loss": 0.5805, + "step": 4984 + }, + { + "epoch": 0.4851581508515815, + "grad_norm": 1.3567660936308192, + "learning_rate": 5.483124714419881e-06, + "loss": 0.2999, + "step": 4985 + }, + { + "epoch": 0.48525547445255474, + "grad_norm": 1.2357480102807044, + "learning_rate": 5.4815559102333005e-06, + "loss": 0.3387, + "step": 4986 + }, + { + "epoch": 0.485352798053528, + "grad_norm": 1.6859951373581095, + "learning_rate": 5.479987058194214e-06, + "loss": 0.4272, + "step": 4987 + }, + { + "epoch": 0.48545012165450124, + "grad_norm": 1.5177938928969597, + "learning_rate": 5.478418158458518e-06, + "loss": 0.5521, + "step": 4988 + }, + { + "epoch": 0.48554744525547444, + "grad_norm": 1.2753160346799406, + "learning_rate": 5.476849211182115e-06, + "loss": 0.4378, + "step": 4989 + }, + { + "epoch": 0.4856447688564477, + "grad_norm": 1.3387716024100207, + "learning_rate": 5.475280216520913e-06, + "loss": 0.4093, + "step": 4990 + }, + { + "epoch": 0.48574209245742095, + "grad_norm": 1.4017262068692473, + "learning_rate": 5.473711174630826e-06, + "loss": 0.3709, + "step": 4991 + }, + { + "epoch": 0.48583941605839415, + "grad_norm": 1.4983315725744055, + "learning_rate": 5.472142085667767e-06, + "loss": 0.4051, + "step": 4992 + }, + { + "epoch": 0.4859367396593674, + "grad_norm": 1.3419200057791822, + "learning_rate": 5.470572949787658e-06, + "loss": 0.4465, + "step": 4993 + }, + { + "epoch": 0.48603406326034065, + "grad_norm": 1.0828551312926495, + "learning_rate": 5.469003767146426e-06, + "loss": 0.2349, + "step": 4994 + }, + { + "epoch": 0.48613138686131385, + "grad_norm": 1.1408772646849228, + "learning_rate": 5.4674345379e-06, + "loss": 0.2653, + "step": 4995 + }, + { + "epoch": 0.4862287104622871, + "grad_norm": 1.595169781345277, + "learning_rate": 5.46586526220432e-06, + "loss": 0.3262, + "step": 4996 + }, + { + "epoch": 0.48632603406326036, + "grad_norm": 1.3117900017930924, + "learning_rate": 5.4642959402153205e-06, + "loss": 0.3817, + "step": 4997 + }, + { + "epoch": 0.48642335766423356, + "grad_norm": 1.4679453654555827, + "learning_rate": 5.462726572088949e-06, + "loss": 0.3647, + "step": 4998 + }, + { + "epoch": 0.4865206812652068, + "grad_norm": 1.7158286496667112, + "learning_rate": 5.46115715798115e-06, + "loss": 0.5639, + "step": 4999 + }, + { + "epoch": 0.48661800486618007, + "grad_norm": 1.6549363594842164, + "learning_rate": 5.459587698047886e-06, + "loss": 0.5489, + "step": 5000 + }, + { + "epoch": 0.48671532846715326, + "grad_norm": 2.1898319348857926, + "learning_rate": 5.458018192445108e-06, + "loss": 0.4976, + "step": 5001 + }, + { + "epoch": 0.4868126520681265, + "grad_norm": 1.5974613434557499, + "learning_rate": 5.45644864132878e-06, + "loss": 0.3197, + "step": 5002 + }, + { + "epoch": 0.48690997566909977, + "grad_norm": 1.3887355698626391, + "learning_rate": 5.45487904485487e-06, + "loss": 0.2057, + "step": 5003 + }, + { + "epoch": 0.48700729927007297, + "grad_norm": 1.6059660129870075, + "learning_rate": 5.45330940317935e-06, + "loss": 0.4177, + "step": 5004 + }, + { + "epoch": 0.4871046228710462, + "grad_norm": 1.5738497696839364, + "learning_rate": 5.451739716458196e-06, + "loss": 0.345, + "step": 5005 + }, + { + "epoch": 0.4872019464720195, + "grad_norm": 1.77737970001417, + "learning_rate": 5.450169984847389e-06, + "loss": 0.4421, + "step": 5006 + }, + { + "epoch": 0.4872992700729927, + "grad_norm": 1.7144957169695934, + "learning_rate": 5.4486002085029145e-06, + "loss": 0.3191, + "step": 5007 + }, + { + "epoch": 0.4873965936739659, + "grad_norm": 1.3777116133883833, + "learning_rate": 5.44703038758076e-06, + "loss": 0.3365, + "step": 5008 + }, + { + "epoch": 0.4874939172749392, + "grad_norm": 1.4604287307103112, + "learning_rate": 5.445460522236923e-06, + "loss": 0.4651, + "step": 5009 + }, + { + "epoch": 0.48759124087591244, + "grad_norm": 1.4975005624607667, + "learning_rate": 5.443890612627398e-06, + "loss": 0.3582, + "step": 5010 + }, + { + "epoch": 0.48768856447688563, + "grad_norm": 1.5004432341259126, + "learning_rate": 5.44232065890819e-06, + "loss": 0.4919, + "step": 5011 + }, + { + "epoch": 0.4877858880778589, + "grad_norm": 1.7923642803829052, + "learning_rate": 5.440750661235308e-06, + "loss": 0.4711, + "step": 5012 + }, + { + "epoch": 0.48788321167883214, + "grad_norm": 1.455676965568023, + "learning_rate": 5.439180619764761e-06, + "loss": 0.5366, + "step": 5013 + }, + { + "epoch": 0.48798053527980534, + "grad_norm": 1.4412783847736603, + "learning_rate": 5.437610534652567e-06, + "loss": 0.4822, + "step": 5014 + }, + { + "epoch": 0.4880778588807786, + "grad_norm": 1.4061603517567263, + "learning_rate": 5.4360404060547424e-06, + "loss": 0.4498, + "step": 5015 + }, + { + "epoch": 0.48817518248175185, + "grad_norm": 1.2205597572059488, + "learning_rate": 5.434470234127317e-06, + "loss": 0.2444, + "step": 5016 + }, + { + "epoch": 0.48827250608272504, + "grad_norm": 1.1571719643633969, + "learning_rate": 5.432900019026316e-06, + "loss": 0.372, + "step": 5017 + }, + { + "epoch": 0.4883698296836983, + "grad_norm": 1.2545109416777944, + "learning_rate": 5.431329760907775e-06, + "loss": 0.2478, + "step": 5018 + }, + { + "epoch": 0.48846715328467155, + "grad_norm": 1.58204119643864, + "learning_rate": 5.429759459927731e-06, + "loss": 0.7143, + "step": 5019 + }, + { + "epoch": 0.48856447688564475, + "grad_norm": 1.380213477167105, + "learning_rate": 5.428189116242224e-06, + "loss": 0.2741, + "step": 5020 + }, + { + "epoch": 0.488661800486618, + "grad_norm": 1.208192532107823, + "learning_rate": 5.426618730007303e-06, + "loss": 0.3596, + "step": 5021 + }, + { + "epoch": 0.48875912408759126, + "grad_norm": 1.4361021460116605, + "learning_rate": 5.4250483013790146e-06, + "loss": 0.5025, + "step": 5022 + }, + { + "epoch": 0.48885644768856445, + "grad_norm": 1.1517878856617345, + "learning_rate": 5.423477830513416e-06, + "loss": 0.2839, + "step": 5023 + }, + { + "epoch": 0.4889537712895377, + "grad_norm": 1.7188803883425718, + "learning_rate": 5.421907317566566e-06, + "loss": 0.6908, + "step": 5024 + }, + { + "epoch": 0.48905109489051096, + "grad_norm": 1.4039101950325197, + "learning_rate": 5.420336762694524e-06, + "loss": 0.4987, + "step": 5025 + }, + { + "epoch": 0.48914841849148416, + "grad_norm": 1.0076942078711917, + "learning_rate": 5.418766166053362e-06, + "loss": 0.181, + "step": 5026 + }, + { + "epoch": 0.4892457420924574, + "grad_norm": 1.3958603965451082, + "learning_rate": 5.4171955277991484e-06, + "loss": 0.4415, + "step": 5027 + }, + { + "epoch": 0.48934306569343067, + "grad_norm": 1.4424291769852293, + "learning_rate": 5.415624848087959e-06, + "loss": 0.3817, + "step": 5028 + }, + { + "epoch": 0.48944038929440387, + "grad_norm": 1.685222430956794, + "learning_rate": 5.414054127075872e-06, + "loss": 0.4739, + "step": 5029 + }, + { + "epoch": 0.4895377128953771, + "grad_norm": 1.674234312041242, + "learning_rate": 5.412483364918972e-06, + "loss": 0.481, + "step": 5030 + }, + { + "epoch": 0.4896350364963504, + "grad_norm": 1.530713201359312, + "learning_rate": 5.410912561773346e-06, + "loss": 0.1731, + "step": 5031 + }, + { + "epoch": 0.4897323600973236, + "grad_norm": 1.7243115696123834, + "learning_rate": 5.409341717795088e-06, + "loss": 0.6449, + "step": 5032 + }, + { + "epoch": 0.4898296836982968, + "grad_norm": 1.197138021866118, + "learning_rate": 5.40777083314029e-06, + "loss": 0.2843, + "step": 5033 + }, + { + "epoch": 0.4899270072992701, + "grad_norm": 1.4747319252677107, + "learning_rate": 5.406199907965055e-06, + "loss": 0.4054, + "step": 5034 + }, + { + "epoch": 0.49002433090024333, + "grad_norm": 1.2706173989537235, + "learning_rate": 5.404628942425485e-06, + "loss": 0.3084, + "step": 5035 + }, + { + "epoch": 0.49012165450121653, + "grad_norm": 1.150549782071767, + "learning_rate": 5.403057936677688e-06, + "loss": 0.3388, + "step": 5036 + }, + { + "epoch": 0.4902189781021898, + "grad_norm": 1.524267430312653, + "learning_rate": 5.401486890877777e-06, + "loss": 0.515, + "step": 5037 + }, + { + "epoch": 0.49031630170316304, + "grad_norm": 1.5902620932052205, + "learning_rate": 5.399915805181866e-06, + "loss": 0.4287, + "step": 5038 + }, + { + "epoch": 0.49041362530413624, + "grad_norm": 1.2788113903176752, + "learning_rate": 5.398344679746077e-06, + "loss": 0.3266, + "step": 5039 + }, + { + "epoch": 0.4905109489051095, + "grad_norm": 1.5081258655708445, + "learning_rate": 5.39677351472653e-06, + "loss": 0.4176, + "step": 5040 + }, + { + "epoch": 0.49060827250608274, + "grad_norm": 1.8789293442415165, + "learning_rate": 5.395202310279356e-06, + "loss": 0.3402, + "step": 5041 + }, + { + "epoch": 0.49070559610705594, + "grad_norm": 1.7754391033071233, + "learning_rate": 5.393631066560685e-06, + "loss": 0.5186, + "step": 5042 + }, + { + "epoch": 0.4908029197080292, + "grad_norm": 1.4392182405674039, + "learning_rate": 5.392059783726655e-06, + "loss": 0.3767, + "step": 5043 + }, + { + "epoch": 0.49090024330900245, + "grad_norm": 1.284098552272941, + "learning_rate": 5.3904884619334005e-06, + "loss": 0.2681, + "step": 5044 + }, + { + "epoch": 0.49099756690997565, + "grad_norm": 1.7355035083134127, + "learning_rate": 5.38891710133707e-06, + "loss": 0.4057, + "step": 5045 + }, + { + "epoch": 0.4910948905109489, + "grad_norm": 1.4529751115834646, + "learning_rate": 5.387345702093807e-06, + "loss": 0.3407, + "step": 5046 + }, + { + "epoch": 0.49119221411192215, + "grad_norm": 1.6764104057784581, + "learning_rate": 5.385774264359763e-06, + "loss": 0.4085, + "step": 5047 + }, + { + "epoch": 0.49128953771289535, + "grad_norm": 2.040083005001846, + "learning_rate": 5.384202788291095e-06, + "loss": 0.2671, + "step": 5048 + }, + { + "epoch": 0.4913868613138686, + "grad_norm": 1.4438382439666158, + "learning_rate": 5.382631274043958e-06, + "loss": 0.4988, + "step": 5049 + }, + { + "epoch": 0.49148418491484186, + "grad_norm": 1.3415722610399325, + "learning_rate": 5.3810597217745175e-06, + "loss": 0.3087, + "step": 5050 + }, + { + "epoch": 0.49158150851581506, + "grad_norm": 1.7837742103051073, + "learning_rate": 5.379488131638937e-06, + "loss": 0.7146, + "step": 5051 + }, + { + "epoch": 0.4916788321167883, + "grad_norm": 1.6439537792298233, + "learning_rate": 5.377916503793388e-06, + "loss": 0.4824, + "step": 5052 + }, + { + "epoch": 0.49177615571776157, + "grad_norm": 1.416325902159072, + "learning_rate": 5.376344838394043e-06, + "loss": 0.4164, + "step": 5053 + }, + { + "epoch": 0.4918734793187348, + "grad_norm": 2.45441320332752, + "learning_rate": 5.374773135597081e-06, + "loss": 0.2579, + "step": 5054 + }, + { + "epoch": 0.491970802919708, + "grad_norm": 1.91651852353104, + "learning_rate": 5.373201395558684e-06, + "loss": 0.2235, + "step": 5055 + }, + { + "epoch": 0.49206812652068127, + "grad_norm": 1.515787332237694, + "learning_rate": 5.371629618435031e-06, + "loss": 0.5142, + "step": 5056 + }, + { + "epoch": 0.4921654501216545, + "grad_norm": 1.6655105952324112, + "learning_rate": 5.370057804382317e-06, + "loss": 0.3892, + "step": 5057 + }, + { + "epoch": 0.4922627737226277, + "grad_norm": 1.5378211577647913, + "learning_rate": 5.36848595355673e-06, + "loss": 0.5481, + "step": 5058 + }, + { + "epoch": 0.492360097323601, + "grad_norm": 1.58946159699911, + "learning_rate": 5.366914066114469e-06, + "loss": 0.4617, + "step": 5059 + }, + { + "epoch": 0.49245742092457423, + "grad_norm": 1.3627819469502345, + "learning_rate": 5.36534214221173e-06, + "loss": 0.3906, + "step": 5060 + }, + { + "epoch": 0.4925547445255474, + "grad_norm": 1.1795425831286543, + "learning_rate": 5.36377018200472e-06, + "loss": 0.2812, + "step": 5061 + }, + { + "epoch": 0.4926520681265207, + "grad_norm": 1.404350778449777, + "learning_rate": 5.362198185649642e-06, + "loss": 0.4139, + "step": 5062 + }, + { + "epoch": 0.49274939172749394, + "grad_norm": 1.3558242727366516, + "learning_rate": 5.360626153302707e-06, + "loss": 0.2076, + "step": 5063 + }, + { + "epoch": 0.49284671532846713, + "grad_norm": 1.544055179593263, + "learning_rate": 5.359054085120131e-06, + "loss": 0.3684, + "step": 5064 + }, + { + "epoch": 0.4929440389294404, + "grad_norm": 1.2038715077807927, + "learning_rate": 5.357481981258129e-06, + "loss": 0.2324, + "step": 5065 + }, + { + "epoch": 0.49304136253041364, + "grad_norm": 1.6667516766166808, + "learning_rate": 5.355909841872924e-06, + "loss": 0.5443, + "step": 5066 + }, + { + "epoch": 0.49313868613138684, + "grad_norm": 1.7590657831440246, + "learning_rate": 5.354337667120737e-06, + "loss": 0.7178, + "step": 5067 + }, + { + "epoch": 0.4932360097323601, + "grad_norm": 1.771383028170083, + "learning_rate": 5.352765457157799e-06, + "loss": 0.4871, + "step": 5068 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 1.305854229998231, + "learning_rate": 5.351193212140341e-06, + "loss": 0.28, + "step": 5069 + }, + { + "epoch": 0.49343065693430654, + "grad_norm": 1.6390936834898198, + "learning_rate": 5.349620932224598e-06, + "loss": 0.4043, + "step": 5070 + }, + { + "epoch": 0.4935279805352798, + "grad_norm": 1.3421541540690891, + "learning_rate": 5.348048617566808e-06, + "loss": 0.3224, + "step": 5071 + }, + { + "epoch": 0.49362530413625305, + "grad_norm": 1.397997994859428, + "learning_rate": 5.346476268323213e-06, + "loss": 0.4431, + "step": 5072 + }, + { + "epoch": 0.4937226277372263, + "grad_norm": 1.6316665205340113, + "learning_rate": 5.3449038846500575e-06, + "loss": 0.3536, + "step": 5073 + }, + { + "epoch": 0.4938199513381995, + "grad_norm": 1.5267178162636428, + "learning_rate": 5.343331466703592e-06, + "loss": 0.4414, + "step": 5074 + }, + { + "epoch": 0.49391727493917276, + "grad_norm": 1.7290466527386708, + "learning_rate": 5.341759014640068e-06, + "loss": 0.583, + "step": 5075 + }, + { + "epoch": 0.494014598540146, + "grad_norm": 1.4809204861753724, + "learning_rate": 5.340186528615738e-06, + "loss": 0.4611, + "step": 5076 + }, + { + "epoch": 0.4941119221411192, + "grad_norm": 1.7473791115003812, + "learning_rate": 5.3386140087868665e-06, + "loss": 0.3554, + "step": 5077 + }, + { + "epoch": 0.49420924574209246, + "grad_norm": 1.5159704070411184, + "learning_rate": 5.337041455309712e-06, + "loss": 0.482, + "step": 5078 + }, + { + "epoch": 0.4943065693430657, + "grad_norm": 1.3566136430802587, + "learning_rate": 5.3354688683405396e-06, + "loss": 0.2888, + "step": 5079 + }, + { + "epoch": 0.4944038929440389, + "grad_norm": 1.452230034326972, + "learning_rate": 5.33389624803562e-06, + "loss": 0.3318, + "step": 5080 + }, + { + "epoch": 0.49450121654501217, + "grad_norm": 1.4110866869083454, + "learning_rate": 5.332323594551227e-06, + "loss": 0.4727, + "step": 5081 + }, + { + "epoch": 0.4945985401459854, + "grad_norm": 1.3293925351527427, + "learning_rate": 5.3307509080436324e-06, + "loss": 0.3068, + "step": 5082 + }, + { + "epoch": 0.4946958637469586, + "grad_norm": 1.743561480869901, + "learning_rate": 5.329178188669118e-06, + "loss": 0.3913, + "step": 5083 + }, + { + "epoch": 0.4947931873479319, + "grad_norm": 1.213242730034601, + "learning_rate": 5.3276054365839626e-06, + "loss": 0.2869, + "step": 5084 + }, + { + "epoch": 0.4948905109489051, + "grad_norm": 1.3356450078569697, + "learning_rate": 5.326032651944454e-06, + "loss": 0.3197, + "step": 5085 + }, + { + "epoch": 0.4949878345498783, + "grad_norm": 1.5030506424941168, + "learning_rate": 5.324459834906882e-06, + "loss": 0.3483, + "step": 5086 + }, + { + "epoch": 0.4950851581508516, + "grad_norm": 1.7122767532626826, + "learning_rate": 5.322886985627535e-06, + "loss": 0.3508, + "step": 5087 + }, + { + "epoch": 0.49518248175182483, + "grad_norm": 1.5107896892277575, + "learning_rate": 5.321314104262711e-06, + "loss": 0.4561, + "step": 5088 + }, + { + "epoch": 0.49527980535279803, + "grad_norm": 1.3812632529887294, + "learning_rate": 5.319741190968706e-06, + "loss": 0.4924, + "step": 5089 + }, + { + "epoch": 0.4953771289537713, + "grad_norm": 1.5944586281730757, + "learning_rate": 5.318168245901823e-06, + "loss": 0.3154, + "step": 5090 + }, + { + "epoch": 0.49547445255474454, + "grad_norm": 1.5509847127971457, + "learning_rate": 5.316595269218367e-06, + "loss": 0.4957, + "step": 5091 + }, + { + "epoch": 0.49557177615571774, + "grad_norm": 1.3107801704434123, + "learning_rate": 5.315022261074642e-06, + "loss": 0.4174, + "step": 5092 + }, + { + "epoch": 0.495669099756691, + "grad_norm": 1.563988853649021, + "learning_rate": 5.313449221626965e-06, + "loss": 0.5301, + "step": 5093 + }, + { + "epoch": 0.49576642335766424, + "grad_norm": 1.2459488069907703, + "learning_rate": 5.311876151031642e-06, + "loss": 0.2666, + "step": 5094 + }, + { + "epoch": 0.4958637469586375, + "grad_norm": 1.7150798105069895, + "learning_rate": 5.310303049444996e-06, + "loss": 0.518, + "step": 5095 + }, + { + "epoch": 0.4959610705596107, + "grad_norm": 1.490026056970703, + "learning_rate": 5.308729917023346e-06, + "loss": 0.2927, + "step": 5096 + }, + { + "epoch": 0.49605839416058395, + "grad_norm": 1.412429878722388, + "learning_rate": 5.307156753923014e-06, + "loss": 0.3318, + "step": 5097 + }, + { + "epoch": 0.4961557177615572, + "grad_norm": 1.5612364428308012, + "learning_rate": 5.305583560300325e-06, + "loss": 0.4288, + "step": 5098 + }, + { + "epoch": 0.4962530413625304, + "grad_norm": 1.7470922836534188, + "learning_rate": 5.304010336311611e-06, + "loss": 0.3495, + "step": 5099 + }, + { + "epoch": 0.49635036496350365, + "grad_norm": 1.4355178103456536, + "learning_rate": 5.302437082113203e-06, + "loss": 0.4107, + "step": 5100 + }, + { + "epoch": 0.4964476885644769, + "grad_norm": 1.3910614357496167, + "learning_rate": 5.300863797861436e-06, + "loss": 0.3276, + "step": 5101 + }, + { + "epoch": 0.4965450121654501, + "grad_norm": 1.369456647896358, + "learning_rate": 5.29929048371265e-06, + "loss": 0.3209, + "step": 5102 + }, + { + "epoch": 0.49664233576642336, + "grad_norm": 1.3194231276993127, + "learning_rate": 5.297717139823183e-06, + "loss": 0.2718, + "step": 5103 + }, + { + "epoch": 0.4967396593673966, + "grad_norm": 1.5736389606985905, + "learning_rate": 5.2961437663493805e-06, + "loss": 0.4813, + "step": 5104 + }, + { + "epoch": 0.4968369829683698, + "grad_norm": 1.1553842551060758, + "learning_rate": 5.294570363447589e-06, + "loss": 0.3266, + "step": 5105 + }, + { + "epoch": 0.49693430656934306, + "grad_norm": 1.586373555787601, + "learning_rate": 5.2929969312741625e-06, + "loss": 0.5386, + "step": 5106 + }, + { + "epoch": 0.4970316301703163, + "grad_norm": 1.8884361774869836, + "learning_rate": 5.291423469985449e-06, + "loss": 0.5736, + "step": 5107 + }, + { + "epoch": 0.4971289537712895, + "grad_norm": 1.5461639928065092, + "learning_rate": 5.289849979737808e-06, + "loss": 0.5066, + "step": 5108 + }, + { + "epoch": 0.49722627737226277, + "grad_norm": 1.1938311304614138, + "learning_rate": 5.288276460687595e-06, + "loss": 0.3225, + "step": 5109 + }, + { + "epoch": 0.497323600973236, + "grad_norm": 1.5838662137551112, + "learning_rate": 5.286702912991172e-06, + "loss": 0.4726, + "step": 5110 + }, + { + "epoch": 0.4974209245742092, + "grad_norm": 1.6421001523613388, + "learning_rate": 5.285129336804905e-06, + "loss": 0.4279, + "step": 5111 + }, + { + "epoch": 0.4975182481751825, + "grad_norm": 1.965461770271289, + "learning_rate": 5.283555732285161e-06, + "loss": 0.3888, + "step": 5112 + }, + { + "epoch": 0.49761557177615573, + "grad_norm": 1.408633437525859, + "learning_rate": 5.28198209958831e-06, + "loss": 0.3822, + "step": 5113 + }, + { + "epoch": 0.4977128953771289, + "grad_norm": 1.3523571346508707, + "learning_rate": 5.280408438870723e-06, + "loss": 0.3911, + "step": 5114 + }, + { + "epoch": 0.4978102189781022, + "grad_norm": 1.2093674849244664, + "learning_rate": 5.2788347502887775e-06, + "loss": 0.2573, + "step": 5115 + }, + { + "epoch": 0.49790754257907544, + "grad_norm": 1.5253078618962868, + "learning_rate": 5.277261033998852e-06, + "loss": 0.3471, + "step": 5116 + }, + { + "epoch": 0.4980048661800487, + "grad_norm": 1.5897083271657746, + "learning_rate": 5.2756872901573275e-06, + "loss": 0.4199, + "step": 5117 + }, + { + "epoch": 0.4981021897810219, + "grad_norm": 2.0022642907947117, + "learning_rate": 5.274113518920586e-06, + "loss": 0.3263, + "step": 5118 + }, + { + "epoch": 0.49819951338199514, + "grad_norm": 1.6320439770514106, + "learning_rate": 5.272539720445017e-06, + "loss": 0.5367, + "step": 5119 + }, + { + "epoch": 0.4982968369829684, + "grad_norm": 2.330014786734751, + "learning_rate": 5.270965894887008e-06, + "loss": 0.4306, + "step": 5120 + }, + { + "epoch": 0.4983941605839416, + "grad_norm": 1.47016643645919, + "learning_rate": 5.269392042402951e-06, + "loss": 0.4917, + "step": 5121 + }, + { + "epoch": 0.49849148418491485, + "grad_norm": 1.239272241530952, + "learning_rate": 5.267818163149242e-06, + "loss": 0.2625, + "step": 5122 + }, + { + "epoch": 0.4985888077858881, + "grad_norm": 1.249212780068013, + "learning_rate": 5.266244257282277e-06, + "loss": 0.2373, + "step": 5123 + }, + { + "epoch": 0.4986861313868613, + "grad_norm": 1.2338533868729509, + "learning_rate": 5.264670324958458e-06, + "loss": 0.3284, + "step": 5124 + }, + { + "epoch": 0.49878345498783455, + "grad_norm": 1.4536950418673635, + "learning_rate": 5.2630963663341835e-06, + "loss": 0.3611, + "step": 5125 + }, + { + "epoch": 0.4988807785888078, + "grad_norm": 1.849642695899207, + "learning_rate": 5.261522381565863e-06, + "loss": 0.432, + "step": 5126 + }, + { + "epoch": 0.498978102189781, + "grad_norm": 1.1337195612513011, + "learning_rate": 5.259948370809902e-06, + "loss": 0.2563, + "step": 5127 + }, + { + "epoch": 0.49907542579075426, + "grad_norm": 1.9535786826596058, + "learning_rate": 5.258374334222712e-06, + "loss": 0.4362, + "step": 5128 + }, + { + "epoch": 0.4991727493917275, + "grad_norm": 1.5933493316251561, + "learning_rate": 5.256800271960707e-06, + "loss": 0.264, + "step": 5129 + }, + { + "epoch": 0.4992700729927007, + "grad_norm": 1.7529896675410312, + "learning_rate": 5.2552261841803e-06, + "loss": 0.3878, + "step": 5130 + }, + { + "epoch": 0.49936739659367396, + "grad_norm": 1.3258193253920645, + "learning_rate": 5.2536520710379095e-06, + "loss": 0.3274, + "step": 5131 + }, + { + "epoch": 0.4994647201946472, + "grad_norm": 1.4931087346426537, + "learning_rate": 5.252077932689956e-06, + "loss": 0.3069, + "step": 5132 + }, + { + "epoch": 0.4995620437956204, + "grad_norm": 1.3760062783730125, + "learning_rate": 5.2505037692928654e-06, + "loss": 0.3101, + "step": 5133 + }, + { + "epoch": 0.49965936739659367, + "grad_norm": 1.564688027947308, + "learning_rate": 5.248929581003061e-06, + "loss": 0.327, + "step": 5134 + }, + { + "epoch": 0.4997566909975669, + "grad_norm": 1.4480734358338416, + "learning_rate": 5.247355367976971e-06, + "loss": 0.3627, + "step": 5135 + }, + { + "epoch": 0.4998540145985401, + "grad_norm": 1.2929757570382263, + "learning_rate": 5.245781130371025e-06, + "loss": 0.3447, + "step": 5136 + }, + { + "epoch": 0.4999513381995134, + "grad_norm": 0.9760162656906078, + "learning_rate": 5.244206868341657e-06, + "loss": 0.1752, + "step": 5137 + }, + { + "epoch": 0.5000486618004866, + "grad_norm": 1.3141856948855382, + "learning_rate": 5.242632582045304e-06, + "loss": 0.3896, + "step": 5138 + }, + { + "epoch": 0.5001459854014598, + "grad_norm": 1.3324418763443528, + "learning_rate": 5.241058271638401e-06, + "loss": 0.3976, + "step": 5139 + }, + { + "epoch": 0.5002433090024331, + "grad_norm": 1.2800905517496453, + "learning_rate": 5.23948393727739e-06, + "loss": 0.2589, + "step": 5140 + }, + { + "epoch": 0.5003406326034063, + "grad_norm": 1.5624023406980514, + "learning_rate": 5.237909579118713e-06, + "loss": 0.2955, + "step": 5141 + }, + { + "epoch": 0.5004379562043796, + "grad_norm": 1.2296745037860775, + "learning_rate": 5.236335197318814e-06, + "loss": 0.3862, + "step": 5142 + }, + { + "epoch": 0.5005352798053528, + "grad_norm": 1.4387826587407209, + "learning_rate": 5.23476079203414e-06, + "loss": 0.4948, + "step": 5143 + }, + { + "epoch": 0.500632603406326, + "grad_norm": 1.4881502028713878, + "learning_rate": 5.2331863634211455e-06, + "loss": 0.3596, + "step": 5144 + }, + { + "epoch": 0.5007299270072992, + "grad_norm": 1.5866263260775346, + "learning_rate": 5.2316119116362765e-06, + "loss": 0.4472, + "step": 5145 + }, + { + "epoch": 0.5008272506082725, + "grad_norm": 1.3412917962010176, + "learning_rate": 5.23003743683599e-06, + "loss": 0.4589, + "step": 5146 + }, + { + "epoch": 0.5009245742092457, + "grad_norm": 1.3788291758320874, + "learning_rate": 5.2284629391767405e-06, + "loss": 0.474, + "step": 5147 + }, + { + "epoch": 0.501021897810219, + "grad_norm": 1.9449824164216007, + "learning_rate": 5.22688841881499e-06, + "loss": 0.2456, + "step": 5148 + }, + { + "epoch": 0.5011192214111923, + "grad_norm": 1.538030287931762, + "learning_rate": 5.225313875907198e-06, + "loss": 0.4345, + "step": 5149 + }, + { + "epoch": 0.5012165450121655, + "grad_norm": 1.184014727065925, + "learning_rate": 5.223739310609827e-06, + "loss": 0.2862, + "step": 5150 + }, + { + "epoch": 0.5013138686131386, + "grad_norm": 1.4929447349800702, + "learning_rate": 5.222164723079344e-06, + "loss": 0.571, + "step": 5151 + }, + { + "epoch": 0.5014111922141119, + "grad_norm": 1.007335106203281, + "learning_rate": 5.220590113472214e-06, + "loss": 0.236, + "step": 5152 + }, + { + "epoch": 0.5015085158150852, + "grad_norm": 1.106142142258815, + "learning_rate": 5.21901548194491e-06, + "loss": 0.264, + "step": 5153 + }, + { + "epoch": 0.5016058394160584, + "grad_norm": 1.1532992264954316, + "learning_rate": 5.217440828653902e-06, + "loss": 0.1964, + "step": 5154 + }, + { + "epoch": 0.5017031630170317, + "grad_norm": 1.4016312087960912, + "learning_rate": 5.215866153755667e-06, + "loss": 0.3007, + "step": 5155 + }, + { + "epoch": 0.5018004866180049, + "grad_norm": 1.4902958088582248, + "learning_rate": 5.214291457406679e-06, + "loss": 0.6606, + "step": 5156 + }, + { + "epoch": 0.5018978102189781, + "grad_norm": 1.800707966602163, + "learning_rate": 5.212716739763417e-06, + "loss": 0.2991, + "step": 5157 + }, + { + "epoch": 0.5019951338199513, + "grad_norm": 1.1795829069746984, + "learning_rate": 5.211142000982361e-06, + "loss": 0.232, + "step": 5158 + }, + { + "epoch": 0.5020924574209246, + "grad_norm": 1.3363897439309071, + "learning_rate": 5.209567241219995e-06, + "loss": 0.3185, + "step": 5159 + }, + { + "epoch": 0.5021897810218978, + "grad_norm": 1.4984272022708915, + "learning_rate": 5.207992460632805e-06, + "loss": 0.4622, + "step": 5160 + }, + { + "epoch": 0.5022871046228711, + "grad_norm": 1.6346673240867733, + "learning_rate": 5.206417659377274e-06, + "loss": 0.6491, + "step": 5161 + }, + { + "epoch": 0.5023844282238443, + "grad_norm": 1.4843904193278086, + "learning_rate": 5.204842837609896e-06, + "loss": 0.531, + "step": 5162 + }, + { + "epoch": 0.5024817518248175, + "grad_norm": 1.4928961863467327, + "learning_rate": 5.203267995487156e-06, + "loss": 0.4246, + "step": 5163 + }, + { + "epoch": 0.5025790754257907, + "grad_norm": 1.403113831133665, + "learning_rate": 5.201693133165553e-06, + "loss": 0.3214, + "step": 5164 + }, + { + "epoch": 0.502676399026764, + "grad_norm": 1.418220182463448, + "learning_rate": 5.200118250801579e-06, + "loss": 0.3571, + "step": 5165 + }, + { + "epoch": 0.5027737226277372, + "grad_norm": 1.541211172637322, + "learning_rate": 5.19854334855173e-06, + "loss": 0.4525, + "step": 5166 + }, + { + "epoch": 0.5028710462287105, + "grad_norm": 1.479374705267467, + "learning_rate": 5.196968426572509e-06, + "loss": 0.4558, + "step": 5167 + }, + { + "epoch": 0.5029683698296837, + "grad_norm": 1.3299527326229879, + "learning_rate": 5.19539348502041e-06, + "loss": 0.3289, + "step": 5168 + }, + { + "epoch": 0.5030656934306569, + "grad_norm": 1.4899737328895861, + "learning_rate": 5.193818524051944e-06, + "loss": 0.3758, + "step": 5169 + }, + { + "epoch": 0.5031630170316301, + "grad_norm": 1.6762585082609824, + "learning_rate": 5.192243543823611e-06, + "loss": 0.4284, + "step": 5170 + }, + { + "epoch": 0.5032603406326034, + "grad_norm": 1.3559478880340818, + "learning_rate": 5.190668544491919e-06, + "loss": 0.2668, + "step": 5171 + }, + { + "epoch": 0.5033576642335766, + "grad_norm": 1.379702459753202, + "learning_rate": 5.1890935262133765e-06, + "loss": 0.3795, + "step": 5172 + }, + { + "epoch": 0.5034549878345499, + "grad_norm": 1.2992525941589073, + "learning_rate": 5.187518489144494e-06, + "loss": 0.2682, + "step": 5173 + }, + { + "epoch": 0.5035523114355231, + "grad_norm": 1.2960847952029224, + "learning_rate": 5.1859434334417845e-06, + "loss": 0.3201, + "step": 5174 + }, + { + "epoch": 0.5036496350364964, + "grad_norm": 1.5377677467251165, + "learning_rate": 5.184368359261761e-06, + "loss": 0.2732, + "step": 5175 + }, + { + "epoch": 0.5037469586374695, + "grad_norm": 1.2077602419583777, + "learning_rate": 5.182793266760942e-06, + "loss": 0.3052, + "step": 5176 + }, + { + "epoch": 0.5038442822384428, + "grad_norm": 1.3311778268368157, + "learning_rate": 5.181218156095842e-06, + "loss": 0.3701, + "step": 5177 + }, + { + "epoch": 0.503941605839416, + "grad_norm": 1.6803458409604715, + "learning_rate": 5.179643027422983e-06, + "loss": 0.6306, + "step": 5178 + }, + { + "epoch": 0.5040389294403893, + "grad_norm": 1.7291322114874463, + "learning_rate": 5.178067880898884e-06, + "loss": 0.4439, + "step": 5179 + }, + { + "epoch": 0.5041362530413626, + "grad_norm": 1.3614150776177938, + "learning_rate": 5.176492716680072e-06, + "loss": 0.3516, + "step": 5180 + }, + { + "epoch": 0.5042335766423358, + "grad_norm": 1.4597611204449916, + "learning_rate": 5.174917534923071e-06, + "loss": 0.5477, + "step": 5181 + }, + { + "epoch": 0.504330900243309, + "grad_norm": 1.192848258438179, + "learning_rate": 5.173342335784407e-06, + "loss": 0.3049, + "step": 5182 + }, + { + "epoch": 0.5044282238442822, + "grad_norm": 1.3529121407700522, + "learning_rate": 5.171767119420609e-06, + "loss": 0.3515, + "step": 5183 + }, + { + "epoch": 0.5045255474452555, + "grad_norm": 1.3189405276532165, + "learning_rate": 5.170191885988204e-06, + "loss": 0.303, + "step": 5184 + }, + { + "epoch": 0.5046228710462287, + "grad_norm": 1.3395134432503608, + "learning_rate": 5.168616635643728e-06, + "loss": 0.4379, + "step": 5185 + }, + { + "epoch": 0.504720194647202, + "grad_norm": 1.4687489448705902, + "learning_rate": 5.167041368543714e-06, + "loss": 0.3411, + "step": 5186 + }, + { + "epoch": 0.5048175182481752, + "grad_norm": 1.4648007806197731, + "learning_rate": 5.165466084844697e-06, + "loss": 0.4563, + "step": 5187 + }, + { + "epoch": 0.5049148418491484, + "grad_norm": 1.3408152447498458, + "learning_rate": 5.163890784703211e-06, + "loss": 0.3668, + "step": 5188 + }, + { + "epoch": 0.5050121654501216, + "grad_norm": 1.2218158502725498, + "learning_rate": 5.1623154682757985e-06, + "loss": 0.3419, + "step": 5189 + }, + { + "epoch": 0.5051094890510949, + "grad_norm": 1.6401215280304642, + "learning_rate": 5.160740135718998e-06, + "loss": 0.481, + "step": 5190 + }, + { + "epoch": 0.5052068126520681, + "grad_norm": 1.4872384811163053, + "learning_rate": 5.1591647871893525e-06, + "loss": 0.5492, + "step": 5191 + }, + { + "epoch": 0.5053041362530414, + "grad_norm": 1.4379726959575616, + "learning_rate": 5.157589422843405e-06, + "loss": 0.4766, + "step": 5192 + }, + { + "epoch": 0.5054014598540146, + "grad_norm": 1.5206530041669701, + "learning_rate": 5.156014042837696e-06, + "loss": 0.6581, + "step": 5193 + }, + { + "epoch": 0.5054987834549879, + "grad_norm": 1.395265871915643, + "learning_rate": 5.154438647328778e-06, + "loss": 0.4679, + "step": 5194 + }, + { + "epoch": 0.505596107055961, + "grad_norm": 1.684967401115602, + "learning_rate": 5.152863236473195e-06, + "loss": 0.5604, + "step": 5195 + }, + { + "epoch": 0.5056934306569343, + "grad_norm": 1.3288036632018836, + "learning_rate": 5.151287810427501e-06, + "loss": 0.3682, + "step": 5196 + }, + { + "epoch": 0.5057907542579075, + "grad_norm": 1.5564774986574346, + "learning_rate": 5.1497123693482435e-06, + "loss": 0.3276, + "step": 5197 + }, + { + "epoch": 0.5058880778588808, + "grad_norm": 1.4388292220880394, + "learning_rate": 5.148136913391976e-06, + "loss": 0.4782, + "step": 5198 + }, + { + "epoch": 0.505985401459854, + "grad_norm": 1.1216554729777923, + "learning_rate": 5.1465614427152495e-06, + "loss": 0.3055, + "step": 5199 + }, + { + "epoch": 0.5060827250608273, + "grad_norm": 1.3681833182810281, + "learning_rate": 5.144985957474625e-06, + "loss": 0.5088, + "step": 5200 + }, + { + "epoch": 0.5061800486618004, + "grad_norm": 1.4227728052800053, + "learning_rate": 5.1434104578266575e-06, + "loss": 0.49, + "step": 5201 + }, + { + "epoch": 0.5062773722627737, + "grad_norm": 1.5583998480624413, + "learning_rate": 5.1418349439279024e-06, + "loss": 0.458, + "step": 5202 + }, + { + "epoch": 0.506374695863747, + "grad_norm": 1.5438214613710497, + "learning_rate": 5.140259415934924e-06, + "loss": 0.4232, + "step": 5203 + }, + { + "epoch": 0.5064720194647202, + "grad_norm": 1.4731204164002378, + "learning_rate": 5.1386838740042786e-06, + "loss": 0.3355, + "step": 5204 + }, + { + "epoch": 0.5065693430656935, + "grad_norm": 1.4701803020269792, + "learning_rate": 5.137108318292533e-06, + "loss": 0.3414, + "step": 5205 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 1.5475226693657602, + "learning_rate": 5.135532748956249e-06, + "loss": 0.447, + "step": 5206 + }, + { + "epoch": 0.5067639902676399, + "grad_norm": 1.5093316050228651, + "learning_rate": 5.1339571661519934e-06, + "loss": 0.4464, + "step": 5207 + }, + { + "epoch": 0.5068613138686131, + "grad_norm": 1.471389509018518, + "learning_rate": 5.132381570036331e-06, + "loss": 0.4475, + "step": 5208 + }, + { + "epoch": 0.5069586374695864, + "grad_norm": 1.1213425416723295, + "learning_rate": 5.130805960765831e-06, + "loss": 0.2829, + "step": 5209 + }, + { + "epoch": 0.5070559610705596, + "grad_norm": 1.6683753632476268, + "learning_rate": 5.129230338497062e-06, + "loss": 0.7265, + "step": 5210 + }, + { + "epoch": 0.5071532846715329, + "grad_norm": 1.3873827996457901, + "learning_rate": 5.127654703386596e-06, + "loss": 0.3492, + "step": 5211 + }, + { + "epoch": 0.5072506082725061, + "grad_norm": 1.7396379641179833, + "learning_rate": 5.126079055591002e-06, + "loss": 0.4454, + "step": 5212 + }, + { + "epoch": 0.5073479318734794, + "grad_norm": 1.3220037880062712, + "learning_rate": 5.1245033952668556e-06, + "loss": 0.3256, + "step": 5213 + }, + { + "epoch": 0.5074452554744525, + "grad_norm": 1.6095446154646171, + "learning_rate": 5.122927722570731e-06, + "loss": 0.5942, + "step": 5214 + }, + { + "epoch": 0.5075425790754258, + "grad_norm": 1.3887406344231006, + "learning_rate": 5.121352037659201e-06, + "loss": 0.4056, + "step": 5215 + }, + { + "epoch": 0.507639902676399, + "grad_norm": 1.7774251730576711, + "learning_rate": 5.119776340688846e-06, + "loss": 0.3668, + "step": 5216 + }, + { + "epoch": 0.5077372262773723, + "grad_norm": 1.3880009700458447, + "learning_rate": 5.118200631816241e-06, + "loss": 0.4426, + "step": 5217 + }, + { + "epoch": 0.5078345498783455, + "grad_norm": 1.1618270402831925, + "learning_rate": 5.116624911197968e-06, + "loss": 0.3173, + "step": 5218 + }, + { + "epoch": 0.5079318734793188, + "grad_norm": 1.3052971608998365, + "learning_rate": 5.115049178990606e-06, + "loss": 0.3429, + "step": 5219 + }, + { + "epoch": 0.5080291970802919, + "grad_norm": 1.395739889101101, + "learning_rate": 5.113473435350736e-06, + "loss": 0.2611, + "step": 5220 + }, + { + "epoch": 0.5081265206812652, + "grad_norm": 1.4207574833436742, + "learning_rate": 5.11189768043494e-06, + "loss": 0.3232, + "step": 5221 + }, + { + "epoch": 0.5082238442822384, + "grad_norm": 1.5056864717126979, + "learning_rate": 5.110321914399803e-06, + "loss": 0.5251, + "step": 5222 + }, + { + "epoch": 0.5083211678832117, + "grad_norm": 1.3428681607214714, + "learning_rate": 5.108746137401911e-06, + "loss": 0.4092, + "step": 5223 + }, + { + "epoch": 0.5084184914841849, + "grad_norm": 1.4359802837452804, + "learning_rate": 5.107170349597847e-06, + "loss": 0.3664, + "step": 5224 + }, + { + "epoch": 0.5085158150851582, + "grad_norm": 1.1453923446073062, + "learning_rate": 5.105594551144201e-06, + "loss": 0.3425, + "step": 5225 + }, + { + "epoch": 0.5086131386861313, + "grad_norm": 1.378996935231743, + "learning_rate": 5.104018742197557e-06, + "loss": 0.2565, + "step": 5226 + }, + { + "epoch": 0.5087104622871046, + "grad_norm": 1.3853043771234994, + "learning_rate": 5.1024429229145086e-06, + "loss": 0.3338, + "step": 5227 + }, + { + "epoch": 0.5088077858880778, + "grad_norm": 1.0898316085303188, + "learning_rate": 5.1008670934516444e-06, + "loss": 0.2411, + "step": 5228 + }, + { + "epoch": 0.5089051094890511, + "grad_norm": 1.3490666624583345, + "learning_rate": 5.099291253965554e-06, + "loss": 0.3622, + "step": 5229 + }, + { + "epoch": 0.5090024330900244, + "grad_norm": 1.4649127587911646, + "learning_rate": 5.097715404612832e-06, + "loss": 0.5378, + "step": 5230 + }, + { + "epoch": 0.5090997566909976, + "grad_norm": 2.5116790720889153, + "learning_rate": 5.096139545550068e-06, + "loss": 0.4589, + "step": 5231 + }, + { + "epoch": 0.5091970802919707, + "grad_norm": 1.2142760492789682, + "learning_rate": 5.094563676933859e-06, + "loss": 0.1932, + "step": 5232 + }, + { + "epoch": 0.509294403892944, + "grad_norm": 1.485833425316814, + "learning_rate": 5.0929877989207995e-06, + "loss": 0.5478, + "step": 5233 + }, + { + "epoch": 0.5093917274939173, + "grad_norm": 1.717694044853761, + "learning_rate": 5.091411911667486e-06, + "loss": 0.5881, + "step": 5234 + }, + { + "epoch": 0.5094890510948905, + "grad_norm": 1.4710499226690918, + "learning_rate": 5.089836015330514e-06, + "loss": 0.497, + "step": 5235 + }, + { + "epoch": 0.5095863746958638, + "grad_norm": 1.5055591566004431, + "learning_rate": 5.088260110066483e-06, + "loss": 0.4424, + "step": 5236 + }, + { + "epoch": 0.509683698296837, + "grad_norm": 1.5383315311116867, + "learning_rate": 5.086684196031989e-06, + "loss": 0.4836, + "step": 5237 + }, + { + "epoch": 0.5097810218978103, + "grad_norm": 1.5059324412548862, + "learning_rate": 5.0851082733836336e-06, + "loss": 0.3933, + "step": 5238 + }, + { + "epoch": 0.5098783454987834, + "grad_norm": 1.3820898031766242, + "learning_rate": 5.083532342278018e-06, + "loss": 0.3332, + "step": 5239 + }, + { + "epoch": 0.5099756690997567, + "grad_norm": 1.3425848589171392, + "learning_rate": 5.081956402871741e-06, + "loss": 0.274, + "step": 5240 + }, + { + "epoch": 0.5100729927007299, + "grad_norm": 1.3989700922846977, + "learning_rate": 5.080380455321406e-06, + "loss": 0.5732, + "step": 5241 + }, + { + "epoch": 0.5101703163017032, + "grad_norm": 1.5860258932802054, + "learning_rate": 5.078804499783616e-06, + "loss": 0.6319, + "step": 5242 + }, + { + "epoch": 0.5102676399026764, + "grad_norm": 1.3338998346083977, + "learning_rate": 5.077228536414973e-06, + "loss": 0.3491, + "step": 5243 + }, + { + "epoch": 0.5103649635036497, + "grad_norm": 1.3993635042032706, + "learning_rate": 5.075652565372085e-06, + "loss": 0.3831, + "step": 5244 + }, + { + "epoch": 0.5104622871046228, + "grad_norm": 1.390036940804176, + "learning_rate": 5.074076586811554e-06, + "loss": 0.4699, + "step": 5245 + }, + { + "epoch": 0.5105596107055961, + "grad_norm": 1.4230846248188307, + "learning_rate": 5.072500600889987e-06, + "loss": 0.3224, + "step": 5246 + }, + { + "epoch": 0.5106569343065693, + "grad_norm": 1.3887220070504431, + "learning_rate": 5.0709246077639916e-06, + "loss": 0.3999, + "step": 5247 + }, + { + "epoch": 0.5107542579075426, + "grad_norm": 1.4176001371775404, + "learning_rate": 5.069348607590173e-06, + "loss": 0.4229, + "step": 5248 + }, + { + "epoch": 0.5108515815085158, + "grad_norm": 1.5911285682486571, + "learning_rate": 5.0677726005251415e-06, + "loss": 0.3247, + "step": 5249 + }, + { + "epoch": 0.5109489051094891, + "grad_norm": 1.4656677370001712, + "learning_rate": 5.066196586725506e-06, + "loss": 0.3794, + "step": 5250 + }, + { + "epoch": 0.5110462287104622, + "grad_norm": 1.3077387963312597, + "learning_rate": 5.064620566347873e-06, + "loss": 0.2733, + "step": 5251 + }, + { + "epoch": 0.5111435523114355, + "grad_norm": 1.5215572393596322, + "learning_rate": 5.063044539548856e-06, + "loss": 0.4483, + "step": 5252 + }, + { + "epoch": 0.5112408759124087, + "grad_norm": 1.6044973139991519, + "learning_rate": 5.061468506485062e-06, + "loss": 0.388, + "step": 5253 + }, + { + "epoch": 0.511338199513382, + "grad_norm": 1.5795254678004327, + "learning_rate": 5.059892467313108e-06, + "loss": 0.4996, + "step": 5254 + }, + { + "epoch": 0.5114355231143553, + "grad_norm": 1.3418824281029393, + "learning_rate": 5.058316422189601e-06, + "loss": 0.3284, + "step": 5255 + }, + { + "epoch": 0.5115328467153285, + "grad_norm": 1.3351414857676405, + "learning_rate": 5.056740371271156e-06, + "loss": 0.4204, + "step": 5256 + }, + { + "epoch": 0.5116301703163018, + "grad_norm": 1.1698063202662288, + "learning_rate": 5.055164314714386e-06, + "loss": 0.3215, + "step": 5257 + }, + { + "epoch": 0.5117274939172749, + "grad_norm": 2.2901542065268994, + "learning_rate": 5.053588252675901e-06, + "loss": 0.3884, + "step": 5258 + }, + { + "epoch": 0.5118248175182482, + "grad_norm": 1.6006747134471273, + "learning_rate": 5.052012185312322e-06, + "loss": 0.61, + "step": 5259 + }, + { + "epoch": 0.5119221411192214, + "grad_norm": 1.4627634968543248, + "learning_rate": 5.0504361127802585e-06, + "loss": 0.3961, + "step": 5260 + }, + { + "epoch": 0.5120194647201947, + "grad_norm": 1.6105826578136704, + "learning_rate": 5.048860035236328e-06, + "loss": 0.4894, + "step": 5261 + }, + { + "epoch": 0.5121167883211679, + "grad_norm": 1.4599600393235113, + "learning_rate": 5.047283952837146e-06, + "loss": 0.5297, + "step": 5262 + }, + { + "epoch": 0.5122141119221412, + "grad_norm": 1.361239327773337, + "learning_rate": 5.045707865739327e-06, + "loss": 0.362, + "step": 5263 + }, + { + "epoch": 0.5123114355231143, + "grad_norm": 1.4699823606821112, + "learning_rate": 5.044131774099489e-06, + "loss": 0.4075, + "step": 5264 + }, + { + "epoch": 0.5124087591240876, + "grad_norm": 1.3088432415790754, + "learning_rate": 5.042555678074251e-06, + "loss": 0.3839, + "step": 5265 + }, + { + "epoch": 0.5125060827250608, + "grad_norm": 1.624117318087578, + "learning_rate": 5.040979577820231e-06, + "loss": 0.5712, + "step": 5266 + }, + { + "epoch": 0.5126034063260341, + "grad_norm": 1.2961648122519063, + "learning_rate": 5.039403473494042e-06, + "loss": 0.362, + "step": 5267 + }, + { + "epoch": 0.5127007299270073, + "grad_norm": 1.4685680195502195, + "learning_rate": 5.037827365252306e-06, + "loss": 0.3314, + "step": 5268 + }, + { + "epoch": 0.5127980535279806, + "grad_norm": 1.2816964278880971, + "learning_rate": 5.036251253251641e-06, + "loss": 0.3747, + "step": 5269 + }, + { + "epoch": 0.5128953771289537, + "grad_norm": 1.5665013006200665, + "learning_rate": 5.034675137648669e-06, + "loss": 0.5275, + "step": 5270 + }, + { + "epoch": 0.512992700729927, + "grad_norm": 1.187991141093583, + "learning_rate": 5.0330990186000066e-06, + "loss": 0.2295, + "step": 5271 + }, + { + "epoch": 0.5130900243309002, + "grad_norm": 1.4630057642282757, + "learning_rate": 5.0315228962622745e-06, + "loss": 0.5126, + "step": 5272 + }, + { + "epoch": 0.5131873479318735, + "grad_norm": 1.1695969883568185, + "learning_rate": 5.029946770792091e-06, + "loss": 0.3545, + "step": 5273 + }, + { + "epoch": 0.5132846715328467, + "grad_norm": 1.5783866438813425, + "learning_rate": 5.02837064234608e-06, + "loss": 0.2771, + "step": 5274 + }, + { + "epoch": 0.51338199513382, + "grad_norm": 1.3160515372408526, + "learning_rate": 5.02679451108086e-06, + "loss": 0.3071, + "step": 5275 + }, + { + "epoch": 0.5134793187347931, + "grad_norm": 1.5085722437253624, + "learning_rate": 5.025218377153054e-06, + "loss": 0.4331, + "step": 5276 + }, + { + "epoch": 0.5135766423357664, + "grad_norm": 1.1762146515353866, + "learning_rate": 5.023642240719282e-06, + "loss": 0.2547, + "step": 5277 + }, + { + "epoch": 0.5136739659367396, + "grad_norm": 1.5173333362139103, + "learning_rate": 5.022066101936166e-06, + "loss": 0.4665, + "step": 5278 + }, + { + "epoch": 0.5137712895377129, + "grad_norm": 1.675836237647703, + "learning_rate": 5.020489960960327e-06, + "loss": 0.5712, + "step": 5279 + }, + { + "epoch": 0.5138686131386861, + "grad_norm": 1.6130314742137548, + "learning_rate": 5.018913817948388e-06, + "loss": 0.3821, + "step": 5280 + }, + { + "epoch": 0.5139659367396594, + "grad_norm": 1.3104199775852088, + "learning_rate": 5.017337673056972e-06, + "loss": 0.3385, + "step": 5281 + }, + { + "epoch": 0.5140632603406327, + "grad_norm": 1.4924240324281064, + "learning_rate": 5.015761526442701e-06, + "loss": 0.4171, + "step": 5282 + }, + { + "epoch": 0.5141605839416058, + "grad_norm": 1.3479665205558893, + "learning_rate": 5.0141853782621985e-06, + "loss": 0.2608, + "step": 5283 + }, + { + "epoch": 0.514257907542579, + "grad_norm": 1.4408433430141794, + "learning_rate": 5.012609228672084e-06, + "loss": 0.3956, + "step": 5284 + }, + { + "epoch": 0.5143552311435523, + "grad_norm": 1.7542815284847701, + "learning_rate": 5.011033077828983e-06, + "loss": 0.648, + "step": 5285 + }, + { + "epoch": 0.5144525547445256, + "grad_norm": 1.2458564747646057, + "learning_rate": 5.00945692588952e-06, + "loss": 0.2349, + "step": 5286 + }, + { + "epoch": 0.5145498783454988, + "grad_norm": 1.4077158176023123, + "learning_rate": 5.0078807730103156e-06, + "loss": 0.3964, + "step": 5287 + }, + { + "epoch": 0.5146472019464721, + "grad_norm": 1.0361485745222532, + "learning_rate": 5.006304619347994e-06, + "loss": 0.2808, + "step": 5288 + }, + { + "epoch": 0.5147445255474452, + "grad_norm": 1.4391256345470669, + "learning_rate": 5.004728465059178e-06, + "loss": 0.3177, + "step": 5289 + }, + { + "epoch": 0.5148418491484185, + "grad_norm": 1.4253934149846805, + "learning_rate": 5.003152310300491e-06, + "loss": 0.4867, + "step": 5290 + }, + { + "epoch": 0.5149391727493917, + "grad_norm": 1.3615984217344341, + "learning_rate": 5.001576155228557e-06, + "loss": 0.4332, + "step": 5291 + }, + { + "epoch": 0.515036496350365, + "grad_norm": 1.120942427893155, + "learning_rate": 5e-06, + "loss": 0.1725, + "step": 5292 + }, + { + "epoch": 0.5151338199513382, + "grad_norm": 1.177574032928437, + "learning_rate": 4.998423844771444e-06, + "loss": 0.251, + "step": 5293 + }, + { + "epoch": 0.5152311435523115, + "grad_norm": 1.1411912635988943, + "learning_rate": 4.996847689699511e-06, + "loss": 0.2571, + "step": 5294 + }, + { + "epoch": 0.5153284671532846, + "grad_norm": 1.2861568185105374, + "learning_rate": 4.995271534940825e-06, + "loss": 0.392, + "step": 5295 + }, + { + "epoch": 0.5154257907542579, + "grad_norm": 1.245600827102656, + "learning_rate": 4.993695380652008e-06, + "loss": 0.3169, + "step": 5296 + }, + { + "epoch": 0.5155231143552311, + "grad_norm": 1.2725138678853647, + "learning_rate": 4.992119226989685e-06, + "loss": 0.4123, + "step": 5297 + }, + { + "epoch": 0.5156204379562044, + "grad_norm": 1.1625577173979076, + "learning_rate": 4.990543074110483e-06, + "loss": 0.3327, + "step": 5298 + }, + { + "epoch": 0.5157177615571776, + "grad_norm": 1.207443759390374, + "learning_rate": 4.9889669221710186e-06, + "loss": 0.3293, + "step": 5299 + }, + { + "epoch": 0.5158150851581509, + "grad_norm": 1.0668765417891246, + "learning_rate": 4.987390771327917e-06, + "loss": 0.2386, + "step": 5300 + }, + { + "epoch": 0.5159124087591241, + "grad_norm": 1.3624658632814173, + "learning_rate": 4.985814621737803e-06, + "loss": 0.3544, + "step": 5301 + }, + { + "epoch": 0.5160097323600973, + "grad_norm": 1.5246947749129098, + "learning_rate": 4.9842384735573e-06, + "loss": 0.4873, + "step": 5302 + }, + { + "epoch": 0.5161070559610705, + "grad_norm": 1.5373783869834368, + "learning_rate": 4.9826623269430286e-06, + "loss": 0.4964, + "step": 5303 + }, + { + "epoch": 0.5162043795620438, + "grad_norm": 1.5948966893906018, + "learning_rate": 4.981086182051612e-06, + "loss": 0.4453, + "step": 5304 + }, + { + "epoch": 0.516301703163017, + "grad_norm": 1.637733600057433, + "learning_rate": 4.979510039039674e-06, + "loss": 0.4774, + "step": 5305 + }, + { + "epoch": 0.5163990267639903, + "grad_norm": 1.4623466672153929, + "learning_rate": 4.977933898063836e-06, + "loss": 0.4613, + "step": 5306 + }, + { + "epoch": 0.5164963503649636, + "grad_norm": 1.1544349445832902, + "learning_rate": 4.9763577592807195e-06, + "loss": 0.283, + "step": 5307 + }, + { + "epoch": 0.5165936739659367, + "grad_norm": 1.4818676300702187, + "learning_rate": 4.974781622846946e-06, + "loss": 0.4101, + "step": 5308 + }, + { + "epoch": 0.51669099756691, + "grad_norm": 1.277791589279903, + "learning_rate": 4.973205488919141e-06, + "loss": 0.346, + "step": 5309 + }, + { + "epoch": 0.5167883211678832, + "grad_norm": 1.2007044433173495, + "learning_rate": 4.971629357653922e-06, + "loss": 0.3009, + "step": 5310 + }, + { + "epoch": 0.5168856447688565, + "grad_norm": 1.4503422826410344, + "learning_rate": 4.97005322920791e-06, + "loss": 0.4332, + "step": 5311 + }, + { + "epoch": 0.5169829683698297, + "grad_norm": 1.3530624974287928, + "learning_rate": 4.968477103737728e-06, + "loss": 0.4242, + "step": 5312 + }, + { + "epoch": 0.517080291970803, + "grad_norm": 1.5703917526059643, + "learning_rate": 4.966900981399995e-06, + "loss": 0.5539, + "step": 5313 + }, + { + "epoch": 0.5171776155717761, + "grad_norm": 1.4475897916308973, + "learning_rate": 4.965324862351333e-06, + "loss": 0.3058, + "step": 5314 + }, + { + "epoch": 0.5172749391727494, + "grad_norm": 1.8442399618571903, + "learning_rate": 4.963748746748359e-06, + "loss": 0.4812, + "step": 5315 + }, + { + "epoch": 0.5173722627737226, + "grad_norm": 1.5981610160651096, + "learning_rate": 4.962172634747695e-06, + "loss": 0.4771, + "step": 5316 + }, + { + "epoch": 0.5174695863746959, + "grad_norm": 1.3855322261409106, + "learning_rate": 4.96059652650596e-06, + "loss": 0.4074, + "step": 5317 + }, + { + "epoch": 0.5175669099756691, + "grad_norm": 1.4602293858548268, + "learning_rate": 4.959020422179771e-06, + "loss": 0.4845, + "step": 5318 + }, + { + "epoch": 0.5176642335766424, + "grad_norm": 1.6579157531970967, + "learning_rate": 4.957444321925748e-06, + "loss": 0.4785, + "step": 5319 + }, + { + "epoch": 0.5177615571776155, + "grad_norm": 1.4080106098360512, + "learning_rate": 4.955868225900512e-06, + "loss": 0.5201, + "step": 5320 + }, + { + "epoch": 0.5178588807785888, + "grad_norm": 1.6072174721003103, + "learning_rate": 4.954292134260675e-06, + "loss": 0.4127, + "step": 5321 + }, + { + "epoch": 0.517956204379562, + "grad_norm": 1.6250051351612822, + "learning_rate": 4.952716047162855e-06, + "loss": 0.4988, + "step": 5322 + }, + { + "epoch": 0.5180535279805353, + "grad_norm": 1.3265372266035595, + "learning_rate": 4.951139964763675e-06, + "loss": 0.3624, + "step": 5323 + }, + { + "epoch": 0.5181508515815085, + "grad_norm": 1.5066553923654853, + "learning_rate": 4.949563887219744e-06, + "loss": 0.5188, + "step": 5324 + }, + { + "epoch": 0.5182481751824818, + "grad_norm": 1.2772262472533884, + "learning_rate": 4.94798781468768e-06, + "loss": 0.3861, + "step": 5325 + }, + { + "epoch": 0.518345498783455, + "grad_norm": 1.5033158369519337, + "learning_rate": 4.9464117473240995e-06, + "loss": 0.537, + "step": 5326 + }, + { + "epoch": 0.5184428223844282, + "grad_norm": 1.5627108622367212, + "learning_rate": 4.944835685285616e-06, + "loss": 0.3678, + "step": 5327 + }, + { + "epoch": 0.5185401459854014, + "grad_norm": 1.2039569108862966, + "learning_rate": 4.943259628728845e-06, + "loss": 0.3805, + "step": 5328 + }, + { + "epoch": 0.5186374695863747, + "grad_norm": 1.3601407054032584, + "learning_rate": 4.941683577810399e-06, + "loss": 0.294, + "step": 5329 + }, + { + "epoch": 0.5187347931873479, + "grad_norm": 1.742643219741679, + "learning_rate": 4.940107532686895e-06, + "loss": 0.555, + "step": 5330 + }, + { + "epoch": 0.5188321167883212, + "grad_norm": 1.4838949129832981, + "learning_rate": 4.9385314935149385e-06, + "loss": 0.4845, + "step": 5331 + }, + { + "epoch": 0.5189294403892944, + "grad_norm": 1.4315876658221396, + "learning_rate": 4.936955460451145e-06, + "loss": 0.3971, + "step": 5332 + }, + { + "epoch": 0.5190267639902676, + "grad_norm": 1.5188978586317299, + "learning_rate": 4.935379433652127e-06, + "loss": 0.4045, + "step": 5333 + }, + { + "epoch": 0.5191240875912408, + "grad_norm": 1.872810738707678, + "learning_rate": 4.933803413274497e-06, + "loss": 0.6562, + "step": 5334 + }, + { + "epoch": 0.5192214111922141, + "grad_norm": 1.3765965642622116, + "learning_rate": 4.93222739947486e-06, + "loss": 0.4553, + "step": 5335 + }, + { + "epoch": 0.5193187347931874, + "grad_norm": 1.4118604005258877, + "learning_rate": 4.9306513924098275e-06, + "loss": 0.323, + "step": 5336 + }, + { + "epoch": 0.5194160583941606, + "grad_norm": 1.4505421644339682, + "learning_rate": 4.929075392236009e-06, + "loss": 0.4844, + "step": 5337 + }, + { + "epoch": 0.5195133819951339, + "grad_norm": 1.478650186480109, + "learning_rate": 4.927499399110014e-06, + "loss": 0.365, + "step": 5338 + }, + { + "epoch": 0.519610705596107, + "grad_norm": 1.5534636314464154, + "learning_rate": 4.925923413188447e-06, + "loss": 0.4952, + "step": 5339 + }, + { + "epoch": 0.5197080291970803, + "grad_norm": 1.1642216823214826, + "learning_rate": 4.924347434627916e-06, + "loss": 0.263, + "step": 5340 + }, + { + "epoch": 0.5198053527980535, + "grad_norm": 1.4445920231002787, + "learning_rate": 4.922771463585029e-06, + "loss": 0.3051, + "step": 5341 + }, + { + "epoch": 0.5199026763990268, + "grad_norm": 1.2772889665060643, + "learning_rate": 4.921195500216386e-06, + "loss": 0.3481, + "step": 5342 + }, + { + "epoch": 0.52, + "grad_norm": 1.7273031864035941, + "learning_rate": 4.9196195446785946e-06, + "loss": 0.3866, + "step": 5343 + }, + { + "epoch": 0.5200973236009733, + "grad_norm": 1.4867375896340647, + "learning_rate": 4.91804359712826e-06, + "loss": 0.5459, + "step": 5344 + }, + { + "epoch": 0.5201946472019465, + "grad_norm": 1.2278391032105154, + "learning_rate": 4.916467657721985e-06, + "loss": 0.3528, + "step": 5345 + }, + { + "epoch": 0.5202919708029197, + "grad_norm": 1.3953934683380196, + "learning_rate": 4.914891726616367e-06, + "loss": 0.2001, + "step": 5346 + }, + { + "epoch": 0.5203892944038929, + "grad_norm": 1.1224301975356168, + "learning_rate": 4.913315803968012e-06, + "loss": 0.3356, + "step": 5347 + }, + { + "epoch": 0.5204866180048662, + "grad_norm": 1.701811887962682, + "learning_rate": 4.91173988993352e-06, + "loss": 0.6132, + "step": 5348 + }, + { + "epoch": 0.5205839416058394, + "grad_norm": 1.2358617400087601, + "learning_rate": 4.910163984669488e-06, + "loss": 0.324, + "step": 5349 + }, + { + "epoch": 0.5206812652068127, + "grad_norm": 1.2515284730093246, + "learning_rate": 4.908588088332515e-06, + "loss": 0.3994, + "step": 5350 + }, + { + "epoch": 0.5207785888077859, + "grad_norm": 1.4126383317035256, + "learning_rate": 4.907012201079201e-06, + "loss": 0.3223, + "step": 5351 + }, + { + "epoch": 0.5208759124087591, + "grad_norm": 1.3208070088520005, + "learning_rate": 4.905436323066143e-06, + "loss": 0.3576, + "step": 5352 + }, + { + "epoch": 0.5209732360097323, + "grad_norm": 1.297050280388292, + "learning_rate": 4.903860454449933e-06, + "loss": 0.3447, + "step": 5353 + }, + { + "epoch": 0.5210705596107056, + "grad_norm": 1.5683955548247523, + "learning_rate": 4.90228459538717e-06, + "loss": 0.3439, + "step": 5354 + }, + { + "epoch": 0.5211678832116788, + "grad_norm": 1.2908503301070766, + "learning_rate": 4.900708746034447e-06, + "loss": 0.3223, + "step": 5355 + }, + { + "epoch": 0.5212652068126521, + "grad_norm": 1.2697856941531227, + "learning_rate": 4.899132906548358e-06, + "loss": 0.3964, + "step": 5356 + }, + { + "epoch": 0.5213625304136253, + "grad_norm": 1.475725693714655, + "learning_rate": 4.897557077085493e-06, + "loss": 0.5808, + "step": 5357 + }, + { + "epoch": 0.5214598540145985, + "grad_norm": 1.5730752507111745, + "learning_rate": 4.895981257802444e-06, + "loss": 0.5628, + "step": 5358 + }, + { + "epoch": 0.5215571776155717, + "grad_norm": 1.2644855409084375, + "learning_rate": 4.894405448855802e-06, + "loss": 0.3988, + "step": 5359 + }, + { + "epoch": 0.521654501216545, + "grad_norm": 1.4471549173244473, + "learning_rate": 4.892829650402154e-06, + "loss": 0.4499, + "step": 5360 + }, + { + "epoch": 0.5217518248175183, + "grad_norm": 1.2611962054210362, + "learning_rate": 4.891253862598091e-06, + "loss": 0.2343, + "step": 5361 + }, + { + "epoch": 0.5218491484184915, + "grad_norm": 1.466687479015995, + "learning_rate": 4.889678085600197e-06, + "loss": 0.4877, + "step": 5362 + }, + { + "epoch": 0.5219464720194648, + "grad_norm": 1.3646762563889987, + "learning_rate": 4.888102319565062e-06, + "loss": 0.3101, + "step": 5363 + }, + { + "epoch": 0.522043795620438, + "grad_norm": 1.6519749006530624, + "learning_rate": 4.886526564649266e-06, + "loss": 0.5344, + "step": 5364 + }, + { + "epoch": 0.5221411192214112, + "grad_norm": 1.3831849539510426, + "learning_rate": 4.884950821009395e-06, + "loss": 0.4052, + "step": 5365 + }, + { + "epoch": 0.5222384428223844, + "grad_norm": 1.97221130791929, + "learning_rate": 4.883375088802035e-06, + "loss": 0.5541, + "step": 5366 + }, + { + "epoch": 0.5223357664233577, + "grad_norm": 1.1782514408166704, + "learning_rate": 4.88179936818376e-06, + "loss": 0.3286, + "step": 5367 + }, + { + "epoch": 0.5224330900243309, + "grad_norm": 1.5700750218829738, + "learning_rate": 4.8802236593111565e-06, + "loss": 0.5532, + "step": 5368 + }, + { + "epoch": 0.5225304136253042, + "grad_norm": 1.3709483273324035, + "learning_rate": 4.878647962340801e-06, + "loss": 0.3527, + "step": 5369 + }, + { + "epoch": 0.5226277372262774, + "grad_norm": 1.4098238143686732, + "learning_rate": 4.8770722774292725e-06, + "loss": 0.3371, + "step": 5370 + }, + { + "epoch": 0.5227250608272506, + "grad_norm": 1.290909432387948, + "learning_rate": 4.875496604733146e-06, + "loss": 0.2814, + "step": 5371 + }, + { + "epoch": 0.5228223844282238, + "grad_norm": 1.2700354411218717, + "learning_rate": 4.873920944408999e-06, + "loss": 0.3527, + "step": 5372 + }, + { + "epoch": 0.5229197080291971, + "grad_norm": 1.3275619210782128, + "learning_rate": 4.872345296613405e-06, + "loss": 0.2641, + "step": 5373 + }, + { + "epoch": 0.5230170316301703, + "grad_norm": 1.5426199603306376, + "learning_rate": 4.870769661502939e-06, + "loss": 0.4184, + "step": 5374 + }, + { + "epoch": 0.5231143552311436, + "grad_norm": 1.6311268113198076, + "learning_rate": 4.86919403923417e-06, + "loss": 0.5194, + "step": 5375 + }, + { + "epoch": 0.5232116788321168, + "grad_norm": 1.3170502216666653, + "learning_rate": 4.867618429963669e-06, + "loss": 0.297, + "step": 5376 + }, + { + "epoch": 0.52330900243309, + "grad_norm": 1.5149566120443692, + "learning_rate": 4.866042833848009e-06, + "loss": 0.3749, + "step": 5377 + }, + { + "epoch": 0.5234063260340632, + "grad_norm": 1.624425493066898, + "learning_rate": 4.864467251043752e-06, + "loss": 0.5824, + "step": 5378 + }, + { + "epoch": 0.5235036496350365, + "grad_norm": 1.4121059982238326, + "learning_rate": 4.8628916817074684e-06, + "loss": 0.3096, + "step": 5379 + }, + { + "epoch": 0.5236009732360097, + "grad_norm": 1.4549118049345602, + "learning_rate": 4.861316125995722e-06, + "loss": 0.2738, + "step": 5380 + }, + { + "epoch": 0.523698296836983, + "grad_norm": 1.5217440069013852, + "learning_rate": 4.859740584065079e-06, + "loss": 0.4939, + "step": 5381 + }, + { + "epoch": 0.5237956204379562, + "grad_norm": 1.3476948608174777, + "learning_rate": 4.858165056072099e-06, + "loss": 0.2559, + "step": 5382 + }, + { + "epoch": 0.5238929440389294, + "grad_norm": 1.478647767867753, + "learning_rate": 4.856589542173344e-06, + "loss": 0.3895, + "step": 5383 + }, + { + "epoch": 0.5239902676399026, + "grad_norm": 1.568105636726477, + "learning_rate": 4.8550140425253764e-06, + "loss": 0.6688, + "step": 5384 + }, + { + "epoch": 0.5240875912408759, + "grad_norm": 1.572433026802508, + "learning_rate": 4.853438557284751e-06, + "loss": 0.287, + "step": 5385 + }, + { + "epoch": 0.5241849148418491, + "grad_norm": 1.5713408030733105, + "learning_rate": 4.851863086608026e-06, + "loss": 0.4803, + "step": 5386 + }, + { + "epoch": 0.5242822384428224, + "grad_norm": 1.7900814170009662, + "learning_rate": 4.850287630651757e-06, + "loss": 0.4003, + "step": 5387 + }, + { + "epoch": 0.5243795620437957, + "grad_norm": 1.7120394010733622, + "learning_rate": 4.848712189572502e-06, + "loss": 0.4937, + "step": 5388 + }, + { + "epoch": 0.5244768856447689, + "grad_norm": 1.406520902183401, + "learning_rate": 4.8471367635268056e-06, + "loss": 0.4204, + "step": 5389 + }, + { + "epoch": 0.524574209245742, + "grad_norm": 1.3796929554815212, + "learning_rate": 4.845561352671224e-06, + "loss": 0.4107, + "step": 5390 + }, + { + "epoch": 0.5246715328467153, + "grad_norm": 1.6170913004250942, + "learning_rate": 4.843985957162304e-06, + "loss": 0.4272, + "step": 5391 + }, + { + "epoch": 0.5247688564476886, + "grad_norm": 1.3186027697776659, + "learning_rate": 4.842410577156599e-06, + "loss": 0.3675, + "step": 5392 + }, + { + "epoch": 0.5248661800486618, + "grad_norm": 1.3083922273699464, + "learning_rate": 4.840835212810649e-06, + "loss": 0.3653, + "step": 5393 + }, + { + "epoch": 0.5249635036496351, + "grad_norm": 2.6172644760426738, + "learning_rate": 4.839259864281002e-06, + "loss": 0.4221, + "step": 5394 + }, + { + "epoch": 0.5250608272506083, + "grad_norm": 1.5818666725738313, + "learning_rate": 4.837684531724202e-06, + "loss": 0.5843, + "step": 5395 + }, + { + "epoch": 0.5251581508515815, + "grad_norm": 1.6978305802627989, + "learning_rate": 4.8361092152967896e-06, + "loss": 0.5218, + "step": 5396 + }, + { + "epoch": 0.5252554744525547, + "grad_norm": 1.6514057355698206, + "learning_rate": 4.834533915155305e-06, + "loss": 0.6913, + "step": 5397 + }, + { + "epoch": 0.525352798053528, + "grad_norm": 1.6530126926514057, + "learning_rate": 4.832958631456286e-06, + "loss": 0.4403, + "step": 5398 + }, + { + "epoch": 0.5254501216545012, + "grad_norm": 1.3228907945561075, + "learning_rate": 4.831383364356274e-06, + "loss": 0.342, + "step": 5399 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 1.442083349631125, + "learning_rate": 4.829808114011798e-06, + "loss": 0.353, + "step": 5400 + }, + { + "epoch": 0.5256447688564477, + "grad_norm": 1.1397565906398535, + "learning_rate": 4.828232880579393e-06, + "loss": 0.2772, + "step": 5401 + }, + { + "epoch": 0.5257420924574209, + "grad_norm": 1.8186944146854849, + "learning_rate": 4.826657664215596e-06, + "loss": 0.3717, + "step": 5402 + }, + { + "epoch": 0.5258394160583941, + "grad_norm": 1.202086756198358, + "learning_rate": 4.825082465076931e-06, + "loss": 0.2336, + "step": 5403 + }, + { + "epoch": 0.5259367396593674, + "grad_norm": 1.1275223429933794, + "learning_rate": 4.8235072833199285e-06, + "loss": 0.3068, + "step": 5404 + }, + { + "epoch": 0.5260340632603406, + "grad_norm": 1.3474112025287421, + "learning_rate": 4.821932119101116e-06, + "loss": 0.3252, + "step": 5405 + }, + { + "epoch": 0.5261313868613139, + "grad_norm": 1.4563248381806024, + "learning_rate": 4.820356972577019e-06, + "loss": 0.5844, + "step": 5406 + }, + { + "epoch": 0.5262287104622871, + "grad_norm": 1.6810836312607915, + "learning_rate": 4.81878184390416e-06, + "loss": 0.3299, + "step": 5407 + }, + { + "epoch": 0.5263260340632604, + "grad_norm": 1.3337967881752477, + "learning_rate": 4.81720673323906e-06, + "loss": 0.2969, + "step": 5408 + }, + { + "epoch": 0.5264233576642335, + "grad_norm": 1.5825437116085013, + "learning_rate": 4.815631640738239e-06, + "loss": 0.6384, + "step": 5409 + }, + { + "epoch": 0.5265206812652068, + "grad_norm": 1.7317820919120244, + "learning_rate": 4.814056566558218e-06, + "loss": 0.6892, + "step": 5410 + }, + { + "epoch": 0.52661800486618, + "grad_norm": 1.327109363645959, + "learning_rate": 4.812481510855508e-06, + "loss": 0.2553, + "step": 5411 + }, + { + "epoch": 0.5267153284671533, + "grad_norm": 1.285187694779396, + "learning_rate": 4.8109064737866235e-06, + "loss": 0.3509, + "step": 5412 + }, + { + "epoch": 0.5268126520681266, + "grad_norm": 1.3226001941679035, + "learning_rate": 4.809331455508083e-06, + "loss": 0.4332, + "step": 5413 + }, + { + "epoch": 0.5269099756690998, + "grad_norm": 1.3383124584471806, + "learning_rate": 4.807756456176391e-06, + "loss": 0.3537, + "step": 5414 + }, + { + "epoch": 0.527007299270073, + "grad_norm": 1.5958030790776172, + "learning_rate": 4.806181475948057e-06, + "loss": 0.4861, + "step": 5415 + }, + { + "epoch": 0.5271046228710462, + "grad_norm": 1.4065320050083008, + "learning_rate": 4.8046065149795905e-06, + "loss": 0.4506, + "step": 5416 + }, + { + "epoch": 0.5272019464720195, + "grad_norm": 1.420411750842393, + "learning_rate": 4.803031573427495e-06, + "loss": 0.233, + "step": 5417 + }, + { + "epoch": 0.5272992700729927, + "grad_norm": 1.4294795317597138, + "learning_rate": 4.801456651448271e-06, + "loss": 0.4164, + "step": 5418 + }, + { + "epoch": 0.527396593673966, + "grad_norm": 1.362469100601773, + "learning_rate": 4.799881749198423e-06, + "loss": 0.3538, + "step": 5419 + }, + { + "epoch": 0.5274939172749392, + "grad_norm": 1.3971331603170367, + "learning_rate": 4.798306866834448e-06, + "loss": 0.3763, + "step": 5420 + }, + { + "epoch": 0.5275912408759124, + "grad_norm": 1.5699734298012762, + "learning_rate": 4.796732004512846e-06, + "loss": 0.3529, + "step": 5421 + }, + { + "epoch": 0.5276885644768856, + "grad_norm": 1.7138701783939359, + "learning_rate": 4.795157162390106e-06, + "loss": 0.4578, + "step": 5422 + }, + { + "epoch": 0.5277858880778589, + "grad_norm": 1.3247389770210467, + "learning_rate": 4.793582340622726e-06, + "loss": 0.3972, + "step": 5423 + }, + { + "epoch": 0.5278832116788321, + "grad_norm": 1.5437024653602935, + "learning_rate": 4.792007539367198e-06, + "loss": 0.5013, + "step": 5424 + }, + { + "epoch": 0.5279805352798054, + "grad_norm": 1.4497454321128689, + "learning_rate": 4.790432758780006e-06, + "loss": 0.4699, + "step": 5425 + }, + { + "epoch": 0.5280778588807786, + "grad_norm": 1.3974542837564343, + "learning_rate": 4.78885799901764e-06, + "loss": 0.4862, + "step": 5426 + }, + { + "epoch": 0.5281751824817518, + "grad_norm": 1.3731823524717879, + "learning_rate": 4.7872832602365845e-06, + "loss": 0.3659, + "step": 5427 + }, + { + "epoch": 0.528272506082725, + "grad_norm": 1.5664970390664423, + "learning_rate": 4.785708542593323e-06, + "loss": 0.2403, + "step": 5428 + }, + { + "epoch": 0.5283698296836983, + "grad_norm": 1.380386293082662, + "learning_rate": 4.784133846244334e-06, + "loss": 0.485, + "step": 5429 + }, + { + "epoch": 0.5284671532846715, + "grad_norm": 1.5356984818871535, + "learning_rate": 4.7825591713460985e-06, + "loss": 0.2544, + "step": 5430 + }, + { + "epoch": 0.5285644768856448, + "grad_norm": 1.4034684540370157, + "learning_rate": 4.780984518055093e-06, + "loss": 0.348, + "step": 5431 + }, + { + "epoch": 0.528661800486618, + "grad_norm": 1.3435931446878362, + "learning_rate": 4.779409886527787e-06, + "loss": 0.3918, + "step": 5432 + }, + { + "epoch": 0.5287591240875913, + "grad_norm": 1.7290638199826511, + "learning_rate": 4.777835276920658e-06, + "loss": 0.3752, + "step": 5433 + }, + { + "epoch": 0.5288564476885644, + "grad_norm": 1.3964564607404608, + "learning_rate": 4.776260689390174e-06, + "loss": 0.4607, + "step": 5434 + }, + { + "epoch": 0.5289537712895377, + "grad_norm": 1.5752706870334248, + "learning_rate": 4.774686124092805e-06, + "loss": 0.5425, + "step": 5435 + }, + { + "epoch": 0.5290510948905109, + "grad_norm": 1.6222659365687908, + "learning_rate": 4.773111581185011e-06, + "loss": 0.4261, + "step": 5436 + }, + { + "epoch": 0.5291484184914842, + "grad_norm": 1.5325806733097729, + "learning_rate": 4.77153706082326e-06, + "loss": 0.4287, + "step": 5437 + }, + { + "epoch": 0.5292457420924574, + "grad_norm": 1.4178414616055643, + "learning_rate": 4.769962563164012e-06, + "loss": 0.4252, + "step": 5438 + }, + { + "epoch": 0.5293430656934307, + "grad_norm": 1.219527363450225, + "learning_rate": 4.768388088363726e-06, + "loss": 0.3256, + "step": 5439 + }, + { + "epoch": 0.5294403892944038, + "grad_norm": 1.4044434705679236, + "learning_rate": 4.766813636578856e-06, + "loss": 0.3532, + "step": 5440 + }, + { + "epoch": 0.5295377128953771, + "grad_norm": 1.488609858401998, + "learning_rate": 4.765239207965859e-06, + "loss": 0.5043, + "step": 5441 + }, + { + "epoch": 0.5296350364963504, + "grad_norm": 1.5286609014215788, + "learning_rate": 4.763664802681188e-06, + "loss": 0.4806, + "step": 5442 + }, + { + "epoch": 0.5297323600973236, + "grad_norm": 1.2058920314310113, + "learning_rate": 4.762090420881289e-06, + "loss": 0.3372, + "step": 5443 + }, + { + "epoch": 0.5298296836982969, + "grad_norm": 1.4678890058409144, + "learning_rate": 4.760516062722611e-06, + "loss": 0.3401, + "step": 5444 + }, + { + "epoch": 0.5299270072992701, + "grad_norm": 1.2043465692225879, + "learning_rate": 4.758941728361599e-06, + "loss": 0.286, + "step": 5445 + }, + { + "epoch": 0.5300243309002433, + "grad_norm": 1.431438020574917, + "learning_rate": 4.757367417954699e-06, + "loss": 0.3746, + "step": 5446 + }, + { + "epoch": 0.5301216545012165, + "grad_norm": 0.9607556178702715, + "learning_rate": 4.7557931316583445e-06, + "loss": 0.2165, + "step": 5447 + }, + { + "epoch": 0.5302189781021898, + "grad_norm": 1.2786048007241864, + "learning_rate": 4.754218869628977e-06, + "loss": 0.3931, + "step": 5448 + }, + { + "epoch": 0.530316301703163, + "grad_norm": 1.4084953184852351, + "learning_rate": 4.752644632023032e-06, + "loss": 0.4777, + "step": 5449 + }, + { + "epoch": 0.5304136253041363, + "grad_norm": 1.5772624160633437, + "learning_rate": 4.751070418996941e-06, + "loss": 0.6249, + "step": 5450 + }, + { + "epoch": 0.5305109489051095, + "grad_norm": 1.5359217940776095, + "learning_rate": 4.749496230707135e-06, + "loss": 0.4116, + "step": 5451 + }, + { + "epoch": 0.5306082725060828, + "grad_norm": 1.4236847814504423, + "learning_rate": 4.747922067310044e-06, + "loss": 0.3391, + "step": 5452 + }, + { + "epoch": 0.5307055961070559, + "grad_norm": 1.3461221560216945, + "learning_rate": 4.746347928962092e-06, + "loss": 0.303, + "step": 5453 + }, + { + "epoch": 0.5308029197080292, + "grad_norm": 1.5420013871984513, + "learning_rate": 4.744773815819702e-06, + "loss": 0.4648, + "step": 5454 + }, + { + "epoch": 0.5309002433090024, + "grad_norm": 1.6829548130055092, + "learning_rate": 4.743199728039294e-06, + "loss": 0.5981, + "step": 5455 + }, + { + "epoch": 0.5309975669099757, + "grad_norm": 1.939319310177043, + "learning_rate": 4.741625665777287e-06, + "loss": 0.2136, + "step": 5456 + }, + { + "epoch": 0.5310948905109489, + "grad_norm": 1.3623424456432953, + "learning_rate": 4.740051629190099e-06, + "loss": 0.3761, + "step": 5457 + }, + { + "epoch": 0.5311922141119222, + "grad_norm": 1.6252547577919036, + "learning_rate": 4.738477618434139e-06, + "loss": 0.4033, + "step": 5458 + }, + { + "epoch": 0.5312895377128953, + "grad_norm": 1.3668336246823372, + "learning_rate": 4.736903633665817e-06, + "loss": 0.3116, + "step": 5459 + }, + { + "epoch": 0.5313868613138686, + "grad_norm": 1.1925691526183526, + "learning_rate": 4.735329675041545e-06, + "loss": 0.2819, + "step": 5460 + }, + { + "epoch": 0.5314841849148418, + "grad_norm": 1.1214266292808592, + "learning_rate": 4.733755742717724e-06, + "loss": 0.2096, + "step": 5461 + }, + { + "epoch": 0.5315815085158151, + "grad_norm": 1.381929708473621, + "learning_rate": 4.732181836850759e-06, + "loss": 0.3632, + "step": 5462 + }, + { + "epoch": 0.5316788321167883, + "grad_norm": 1.458382151558617, + "learning_rate": 4.730607957597049e-06, + "loss": 0.2973, + "step": 5463 + }, + { + "epoch": 0.5317761557177616, + "grad_norm": 1.2169151187468834, + "learning_rate": 4.729034105112994e-06, + "loss": 0.1986, + "step": 5464 + }, + { + "epoch": 0.5318734793187347, + "grad_norm": 1.3161584637902233, + "learning_rate": 4.727460279554984e-06, + "loss": 0.3633, + "step": 5465 + }, + { + "epoch": 0.531970802919708, + "grad_norm": 1.7241663812065842, + "learning_rate": 4.725886481079414e-06, + "loss": 0.3693, + "step": 5466 + }, + { + "epoch": 0.5320681265206813, + "grad_norm": 1.3926831562940198, + "learning_rate": 4.724312709842676e-06, + "loss": 0.5849, + "step": 5467 + }, + { + "epoch": 0.5321654501216545, + "grad_norm": 1.151762993800745, + "learning_rate": 4.72273896600115e-06, + "loss": 0.3172, + "step": 5468 + }, + { + "epoch": 0.5322627737226278, + "grad_norm": 1.6789544213557044, + "learning_rate": 4.721165249711223e-06, + "loss": 0.4716, + "step": 5469 + }, + { + "epoch": 0.532360097323601, + "grad_norm": 1.5509784870930794, + "learning_rate": 4.719591561129278e-06, + "loss": 0.604, + "step": 5470 + }, + { + "epoch": 0.5324574209245742, + "grad_norm": 1.5299002478998982, + "learning_rate": 4.7180179004116924e-06, + "loss": 0.4283, + "step": 5471 + }, + { + "epoch": 0.5325547445255474, + "grad_norm": 1.4384498774998542, + "learning_rate": 4.716444267714841e-06, + "loss": 0.3953, + "step": 5472 + }, + { + "epoch": 0.5326520681265207, + "grad_norm": 1.6558761174677732, + "learning_rate": 4.714870663195096e-06, + "loss": 0.3006, + "step": 5473 + }, + { + "epoch": 0.5327493917274939, + "grad_norm": 1.8723588779783473, + "learning_rate": 4.713297087008828e-06, + "loss": 0.5612, + "step": 5474 + }, + { + "epoch": 0.5328467153284672, + "grad_norm": 1.5086914821044224, + "learning_rate": 4.711723539312407e-06, + "loss": 0.3611, + "step": 5475 + }, + { + "epoch": 0.5329440389294404, + "grad_norm": 1.3672828859024702, + "learning_rate": 4.710150020262194e-06, + "loss": 0.3355, + "step": 5476 + }, + { + "epoch": 0.5330413625304137, + "grad_norm": 1.5872060278291604, + "learning_rate": 4.708576530014551e-06, + "loss": 0.551, + "step": 5477 + }, + { + "epoch": 0.5331386861313868, + "grad_norm": 1.2317312878100395, + "learning_rate": 4.707003068725839e-06, + "loss": 0.2379, + "step": 5478 + }, + { + "epoch": 0.5332360097323601, + "grad_norm": 1.5485872752470256, + "learning_rate": 4.705429636552411e-06, + "loss": 0.4116, + "step": 5479 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.394266055239885, + "learning_rate": 4.703856233650621e-06, + "loss": 0.3408, + "step": 5480 + }, + { + "epoch": 0.5334306569343066, + "grad_norm": 1.2707129040965355, + "learning_rate": 4.702282860176818e-06, + "loss": 0.3329, + "step": 5481 + }, + { + "epoch": 0.5335279805352798, + "grad_norm": 1.148637922794862, + "learning_rate": 4.7007095162873525e-06, + "loss": 0.2492, + "step": 5482 + }, + { + "epoch": 0.5336253041362531, + "grad_norm": 1.432055320891234, + "learning_rate": 4.699136202138565e-06, + "loss": 0.3254, + "step": 5483 + }, + { + "epoch": 0.5337226277372262, + "grad_norm": 1.5691038576110405, + "learning_rate": 4.697562917886798e-06, + "loss": 0.4628, + "step": 5484 + }, + { + "epoch": 0.5338199513381995, + "grad_norm": 1.4163562023075924, + "learning_rate": 4.69598966368839e-06, + "loss": 0.4073, + "step": 5485 + }, + { + "epoch": 0.5339172749391727, + "grad_norm": 1.290810065128507, + "learning_rate": 4.694416439699676e-06, + "loss": 0.3465, + "step": 5486 + }, + { + "epoch": 0.534014598540146, + "grad_norm": 1.4067443826338528, + "learning_rate": 4.692843246076988e-06, + "loss": 0.4324, + "step": 5487 + }, + { + "epoch": 0.5341119221411192, + "grad_norm": 1.6253002329552109, + "learning_rate": 4.691270082976655e-06, + "loss": 0.4271, + "step": 5488 + }, + { + "epoch": 0.5342092457420925, + "grad_norm": 1.6775288750200037, + "learning_rate": 4.689696950555006e-06, + "loss": 0.5078, + "step": 5489 + }, + { + "epoch": 0.5343065693430656, + "grad_norm": 1.2291968659823704, + "learning_rate": 4.6881238489683596e-06, + "loss": 0.2067, + "step": 5490 + }, + { + "epoch": 0.5344038929440389, + "grad_norm": 1.1572029723004311, + "learning_rate": 4.686550778373037e-06, + "loss": 0.3056, + "step": 5491 + }, + { + "epoch": 0.5345012165450121, + "grad_norm": 1.6140721486071878, + "learning_rate": 4.684977738925357e-06, + "loss": 0.294, + "step": 5492 + }, + { + "epoch": 0.5345985401459854, + "grad_norm": 1.314893492892741, + "learning_rate": 4.683404730781635e-06, + "loss": 0.2931, + "step": 5493 + }, + { + "epoch": 0.5346958637469587, + "grad_norm": 1.3510129395633992, + "learning_rate": 4.6818317540981775e-06, + "loss": 0.3709, + "step": 5494 + }, + { + "epoch": 0.5347931873479319, + "grad_norm": 1.740798327926576, + "learning_rate": 4.6802588090312935e-06, + "loss": 0.3371, + "step": 5495 + }, + { + "epoch": 0.5348905109489052, + "grad_norm": 1.1472362022276765, + "learning_rate": 4.6786858957372905e-06, + "loss": 0.2323, + "step": 5496 + }, + { + "epoch": 0.5349878345498783, + "grad_norm": 1.6487002773465846, + "learning_rate": 4.6771130143724654e-06, + "loss": 0.5127, + "step": 5497 + }, + { + "epoch": 0.5350851581508516, + "grad_norm": 1.1265346867571007, + "learning_rate": 4.675540165093119e-06, + "loss": 0.3499, + "step": 5498 + }, + { + "epoch": 0.5351824817518248, + "grad_norm": 1.526197622964153, + "learning_rate": 4.673967348055546e-06, + "loss": 0.4363, + "step": 5499 + }, + { + "epoch": 0.5352798053527981, + "grad_norm": 1.2805452967462896, + "learning_rate": 4.67239456341604e-06, + "loss": 0.4261, + "step": 5500 + }, + { + "epoch": 0.5353771289537713, + "grad_norm": 1.6046810398881801, + "learning_rate": 4.670821811330884e-06, + "loss": 0.4014, + "step": 5501 + }, + { + "epoch": 0.5354744525547446, + "grad_norm": 1.3047924342186952, + "learning_rate": 4.669249091956368e-06, + "loss": 0.2704, + "step": 5502 + }, + { + "epoch": 0.5355717761557177, + "grad_norm": 1.4224088735505336, + "learning_rate": 4.667676405448776e-06, + "loss": 0.4957, + "step": 5503 + }, + { + "epoch": 0.535669099756691, + "grad_norm": 1.258935225874021, + "learning_rate": 4.666103751964381e-06, + "loss": 0.368, + "step": 5504 + }, + { + "epoch": 0.5357664233576642, + "grad_norm": 1.4746017759013634, + "learning_rate": 4.664531131659461e-06, + "loss": 0.3029, + "step": 5505 + }, + { + "epoch": 0.5358637469586375, + "grad_norm": 2.2111545631424576, + "learning_rate": 4.66295854469029e-06, + "loss": 0.5238, + "step": 5506 + }, + { + "epoch": 0.5359610705596107, + "grad_norm": 1.2922254194717746, + "learning_rate": 4.661385991213135e-06, + "loss": 0.3183, + "step": 5507 + }, + { + "epoch": 0.536058394160584, + "grad_norm": 1.5073042887713033, + "learning_rate": 4.6598134713842625e-06, + "loss": 0.481, + "step": 5508 + }, + { + "epoch": 0.5361557177615571, + "grad_norm": 1.6172418999851652, + "learning_rate": 4.658240985359934e-06, + "loss": 0.4367, + "step": 5509 + }, + { + "epoch": 0.5362530413625304, + "grad_norm": 1.2326762425071527, + "learning_rate": 4.656668533296409e-06, + "loss": 0.2636, + "step": 5510 + }, + { + "epoch": 0.5363503649635036, + "grad_norm": 1.4249179682627615, + "learning_rate": 4.655096115349943e-06, + "loss": 0.3937, + "step": 5511 + }, + { + "epoch": 0.5364476885644769, + "grad_norm": 1.6515521234134023, + "learning_rate": 4.653523731676788e-06, + "loss": 0.7111, + "step": 5512 + }, + { + "epoch": 0.5365450121654501, + "grad_norm": 1.9900282583674413, + "learning_rate": 4.651951382433193e-06, + "loss": 0.3396, + "step": 5513 + }, + { + "epoch": 0.5366423357664234, + "grad_norm": 1.012284934196896, + "learning_rate": 4.650379067775404e-06, + "loss": 0.1941, + "step": 5514 + }, + { + "epoch": 0.5367396593673966, + "grad_norm": 1.5676311925959359, + "learning_rate": 4.64880678785966e-06, + "loss": 0.5501, + "step": 5515 + }, + { + "epoch": 0.5368369829683698, + "grad_norm": 1.439297257451148, + "learning_rate": 4.647234542842203e-06, + "loss": 0.4706, + "step": 5516 + }, + { + "epoch": 0.536934306569343, + "grad_norm": 1.6521861765513306, + "learning_rate": 4.645662332879264e-06, + "loss": 0.4775, + "step": 5517 + }, + { + "epoch": 0.5370316301703163, + "grad_norm": 1.4252575056075478, + "learning_rate": 4.644090158127079e-06, + "loss": 0.4707, + "step": 5518 + }, + { + "epoch": 0.5371289537712896, + "grad_norm": 3.029626785307384, + "learning_rate": 4.642518018741873e-06, + "loss": 0.2853, + "step": 5519 + }, + { + "epoch": 0.5372262773722628, + "grad_norm": 1.4336668406467705, + "learning_rate": 4.64094591487987e-06, + "loss": 0.4253, + "step": 5520 + }, + { + "epoch": 0.5373236009732361, + "grad_norm": 1.2542843827843542, + "learning_rate": 4.639373846697295e-06, + "loss": 0.4461, + "step": 5521 + }, + { + "epoch": 0.5374209245742092, + "grad_norm": 1.7471595600632224, + "learning_rate": 4.63780181435036e-06, + "loss": 0.5609, + "step": 5522 + }, + { + "epoch": 0.5375182481751825, + "grad_norm": 1.382828706980413, + "learning_rate": 4.636229817995281e-06, + "loss": 0.3993, + "step": 5523 + }, + { + "epoch": 0.5376155717761557, + "grad_norm": 1.1783662179749346, + "learning_rate": 4.63465785778827e-06, + "loss": 0.2375, + "step": 5524 + }, + { + "epoch": 0.537712895377129, + "grad_norm": 1.1098812184057025, + "learning_rate": 4.633085933885533e-06, + "loss": 0.2646, + "step": 5525 + }, + { + "epoch": 0.5378102189781022, + "grad_norm": 1.2965070823067013, + "learning_rate": 4.631514046443271e-06, + "loss": 0.2288, + "step": 5526 + }, + { + "epoch": 0.5379075425790755, + "grad_norm": 1.554738106215467, + "learning_rate": 4.6299421956176846e-06, + "loss": 0.475, + "step": 5527 + }, + { + "epoch": 0.5380048661800486, + "grad_norm": 1.7108794598772463, + "learning_rate": 4.62837038156497e-06, + "loss": 0.5758, + "step": 5528 + }, + { + "epoch": 0.5381021897810219, + "grad_norm": 1.4189529576262987, + "learning_rate": 4.626798604441319e-06, + "loss": 0.446, + "step": 5529 + }, + { + "epoch": 0.5381995133819951, + "grad_norm": 1.539521589404697, + "learning_rate": 4.625226864402919e-06, + "loss": 0.4016, + "step": 5530 + }, + { + "epoch": 0.5382968369829684, + "grad_norm": 1.6995404519914559, + "learning_rate": 4.623655161605957e-06, + "loss": 0.3043, + "step": 5531 + }, + { + "epoch": 0.5383941605839416, + "grad_norm": 1.805961456483182, + "learning_rate": 4.622083496206614e-06, + "loss": 0.4687, + "step": 5532 + }, + { + "epoch": 0.5384914841849149, + "grad_norm": 1.3127929269793055, + "learning_rate": 4.620511868361064e-06, + "loss": 0.4509, + "step": 5533 + }, + { + "epoch": 0.538588807785888, + "grad_norm": 1.2341439719027067, + "learning_rate": 4.618940278225484e-06, + "loss": 0.3663, + "step": 5534 + }, + { + "epoch": 0.5386861313868613, + "grad_norm": 1.8175411493609606, + "learning_rate": 4.617368725956043e-06, + "loss": 0.3676, + "step": 5535 + }, + { + "epoch": 0.5387834549878345, + "grad_norm": 2.0661946352391194, + "learning_rate": 4.615797211708908e-06, + "loss": 0.2821, + "step": 5536 + }, + { + "epoch": 0.5388807785888078, + "grad_norm": 1.2776192184330863, + "learning_rate": 4.614225735640238e-06, + "loss": 0.3573, + "step": 5537 + }, + { + "epoch": 0.538978102189781, + "grad_norm": 1.19012757771488, + "learning_rate": 4.612654297906194e-06, + "loss": 0.2752, + "step": 5538 + }, + { + "epoch": 0.5390754257907543, + "grad_norm": 1.4951729194125682, + "learning_rate": 4.611082898662932e-06, + "loss": 0.3265, + "step": 5539 + }, + { + "epoch": 0.5391727493917275, + "grad_norm": 1.7553617219847182, + "learning_rate": 4.6095115380666e-06, + "loss": 0.6098, + "step": 5540 + }, + { + "epoch": 0.5392700729927007, + "grad_norm": 1.9351011623385188, + "learning_rate": 4.607940216273347e-06, + "loss": 0.4677, + "step": 5541 + }, + { + "epoch": 0.5393673965936739, + "grad_norm": 1.6215327024782136, + "learning_rate": 4.606368933439315e-06, + "loss": 0.5683, + "step": 5542 + }, + { + "epoch": 0.5394647201946472, + "grad_norm": 1.6944511792130725, + "learning_rate": 4.604797689720645e-06, + "loss": 0.2309, + "step": 5543 + }, + { + "epoch": 0.5395620437956204, + "grad_norm": 1.2236426521264896, + "learning_rate": 4.603226485273471e-06, + "loss": 0.2641, + "step": 5544 + }, + { + "epoch": 0.5396593673965937, + "grad_norm": 1.5572973993338344, + "learning_rate": 4.601655320253925e-06, + "loss": 0.5321, + "step": 5545 + }, + { + "epoch": 0.539756690997567, + "grad_norm": 1.572086982766627, + "learning_rate": 4.600084194818134e-06, + "loss": 0.3651, + "step": 5546 + }, + { + "epoch": 0.5398540145985401, + "grad_norm": 1.4415785937152057, + "learning_rate": 4.598513109122226e-06, + "loss": 0.3424, + "step": 5547 + }, + { + "epoch": 0.5399513381995134, + "grad_norm": 1.1637778734255635, + "learning_rate": 4.596942063322314e-06, + "loss": 0.3374, + "step": 5548 + }, + { + "epoch": 0.5400486618004866, + "grad_norm": 1.324226581921668, + "learning_rate": 4.595371057574517e-06, + "loss": 0.3143, + "step": 5549 + }, + { + "epoch": 0.5401459854014599, + "grad_norm": 1.3513031639064905, + "learning_rate": 4.593800092034947e-06, + "loss": 0.5002, + "step": 5550 + }, + { + "epoch": 0.5402433090024331, + "grad_norm": 1.510265351531761, + "learning_rate": 4.5922291668597105e-06, + "loss": 0.4232, + "step": 5551 + }, + { + "epoch": 0.5403406326034064, + "grad_norm": 1.374401135510563, + "learning_rate": 4.590658282204913e-06, + "loss": 0.361, + "step": 5552 + }, + { + "epoch": 0.5404379562043795, + "grad_norm": 1.3851090235439831, + "learning_rate": 4.5890874382266535e-06, + "loss": 0.3712, + "step": 5553 + }, + { + "epoch": 0.5405352798053528, + "grad_norm": 1.3636680863042874, + "learning_rate": 4.58751663508103e-06, + "loss": 0.3843, + "step": 5554 + }, + { + "epoch": 0.540632603406326, + "grad_norm": 1.3296990435555314, + "learning_rate": 4.585945872924129e-06, + "loss": 0.4047, + "step": 5555 + }, + { + "epoch": 0.5407299270072993, + "grad_norm": 1.624264550675679, + "learning_rate": 4.584375151912043e-06, + "loss": 0.3867, + "step": 5556 + }, + { + "epoch": 0.5408272506082725, + "grad_norm": 1.3684504316145363, + "learning_rate": 4.5828044722008515e-06, + "loss": 0.3617, + "step": 5557 + }, + { + "epoch": 0.5409245742092458, + "grad_norm": 1.5271544668466714, + "learning_rate": 4.5812338339466395e-06, + "loss": 0.4888, + "step": 5558 + }, + { + "epoch": 0.541021897810219, + "grad_norm": 1.4265749041883962, + "learning_rate": 4.579663237305476e-06, + "loss": 0.4771, + "step": 5559 + }, + { + "epoch": 0.5411192214111922, + "grad_norm": 1.509210363535176, + "learning_rate": 4.578092682433435e-06, + "loss": 0.4779, + "step": 5560 + }, + { + "epoch": 0.5412165450121654, + "grad_norm": 1.5133305909360217, + "learning_rate": 4.576522169486586e-06, + "loss": 0.3973, + "step": 5561 + }, + { + "epoch": 0.5413138686131387, + "grad_norm": 1.6143705002164137, + "learning_rate": 4.574951698620987e-06, + "loss": 0.3723, + "step": 5562 + }, + { + "epoch": 0.5414111922141119, + "grad_norm": 1.2292198018803793, + "learning_rate": 4.5733812699927e-06, + "loss": 0.3499, + "step": 5563 + }, + { + "epoch": 0.5415085158150852, + "grad_norm": 1.3580361273205916, + "learning_rate": 4.571810883757777e-06, + "loss": 0.407, + "step": 5564 + }, + { + "epoch": 0.5416058394160584, + "grad_norm": 1.3493904786417004, + "learning_rate": 4.570240540072271e-06, + "loss": 0.3004, + "step": 5565 + }, + { + "epoch": 0.5417031630170316, + "grad_norm": 1.6057393015458046, + "learning_rate": 4.568670239092226e-06, + "loss": 0.5274, + "step": 5566 + }, + { + "epoch": 0.5418004866180048, + "grad_norm": 1.3129465161054514, + "learning_rate": 4.567099980973684e-06, + "loss": 0.2682, + "step": 5567 + }, + { + "epoch": 0.5418978102189781, + "grad_norm": 1.5646855137657936, + "learning_rate": 4.565529765872686e-06, + "loss": 0.4151, + "step": 5568 + }, + { + "epoch": 0.5419951338199513, + "grad_norm": 1.338838472621847, + "learning_rate": 4.56395959394526e-06, + "loss": 0.3384, + "step": 5569 + }, + { + "epoch": 0.5420924574209246, + "grad_norm": 1.2489369163341921, + "learning_rate": 4.562389465347435e-06, + "loss": 0.2837, + "step": 5570 + }, + { + "epoch": 0.5421897810218979, + "grad_norm": 1.6571482256041326, + "learning_rate": 4.56081938023524e-06, + "loss": 0.4298, + "step": 5571 + }, + { + "epoch": 0.542287104622871, + "grad_norm": 1.471288556912, + "learning_rate": 4.559249338764695e-06, + "loss": 0.4059, + "step": 5572 + }, + { + "epoch": 0.5423844282238443, + "grad_norm": 1.2544028941551706, + "learning_rate": 4.5576793410918115e-06, + "loss": 0.372, + "step": 5573 + }, + { + "epoch": 0.5424817518248175, + "grad_norm": 1.2899575103001732, + "learning_rate": 4.556109387372604e-06, + "loss": 0.3339, + "step": 5574 + }, + { + "epoch": 0.5425790754257908, + "grad_norm": 1.6117297972366638, + "learning_rate": 4.554539477763079e-06, + "loss": 0.3651, + "step": 5575 + }, + { + "epoch": 0.542676399026764, + "grad_norm": 1.3462531262771449, + "learning_rate": 4.552969612419242e-06, + "loss": 0.3098, + "step": 5576 + }, + { + "epoch": 0.5427737226277373, + "grad_norm": 1.2037265809376663, + "learning_rate": 4.551399791497087e-06, + "loss": 0.2987, + "step": 5577 + }, + { + "epoch": 0.5428710462287104, + "grad_norm": 1.3939366715986383, + "learning_rate": 4.549830015152612e-06, + "loss": 0.4183, + "step": 5578 + }, + { + "epoch": 0.5429683698296837, + "grad_norm": 1.513347290677701, + "learning_rate": 4.5482602835418065e-06, + "loss": 0.3991, + "step": 5579 + }, + { + "epoch": 0.5430656934306569, + "grad_norm": 1.1821112586133669, + "learning_rate": 4.546690596820652e-06, + "loss": 0.3178, + "step": 5580 + }, + { + "epoch": 0.5431630170316302, + "grad_norm": 1.2066819756592286, + "learning_rate": 4.54512095514513e-06, + "loss": 0.347, + "step": 5581 + }, + { + "epoch": 0.5432603406326034, + "grad_norm": 1.5202387165541005, + "learning_rate": 4.54355135867122e-06, + "loss": 0.4614, + "step": 5582 + }, + { + "epoch": 0.5433576642335767, + "grad_norm": 1.6204853432853796, + "learning_rate": 4.541981807554894e-06, + "loss": 0.413, + "step": 5583 + }, + { + "epoch": 0.5434549878345499, + "grad_norm": 1.3689442710094215, + "learning_rate": 4.540412301952116e-06, + "loss": 0.3677, + "step": 5584 + }, + { + "epoch": 0.5435523114355231, + "grad_norm": 1.219397739528809, + "learning_rate": 4.538842842018849e-06, + "loss": 0.2662, + "step": 5585 + }, + { + "epoch": 0.5436496350364963, + "grad_norm": 1.4157059534769942, + "learning_rate": 4.537273427911053e-06, + "loss": 0.3228, + "step": 5586 + }, + { + "epoch": 0.5437469586374696, + "grad_norm": 1.4212798665178918, + "learning_rate": 4.535704059784681e-06, + "loss": 0.4971, + "step": 5587 + }, + { + "epoch": 0.5438442822384428, + "grad_norm": 1.613847462306087, + "learning_rate": 4.534134737795682e-06, + "loss": 0.6941, + "step": 5588 + }, + { + "epoch": 0.5439416058394161, + "grad_norm": 1.272085771271991, + "learning_rate": 4.532565462099999e-06, + "loss": 0.2215, + "step": 5589 + }, + { + "epoch": 0.5440389294403893, + "grad_norm": 1.3523598672753734, + "learning_rate": 4.5309962328535765e-06, + "loss": 0.3506, + "step": 5590 + }, + { + "epoch": 0.5441362530413625, + "grad_norm": 1.78915091883293, + "learning_rate": 4.529427050212344e-06, + "loss": 0.2881, + "step": 5591 + }, + { + "epoch": 0.5442335766423357, + "grad_norm": 1.3070852210376076, + "learning_rate": 4.527857914332234e-06, + "loss": 0.3958, + "step": 5592 + }, + { + "epoch": 0.544330900243309, + "grad_norm": 1.2331174891417618, + "learning_rate": 4.526288825369175e-06, + "loss": 0.3897, + "step": 5593 + }, + { + "epoch": 0.5444282238442822, + "grad_norm": 1.449725615092237, + "learning_rate": 4.524719783479088e-06, + "loss": 0.309, + "step": 5594 + }, + { + "epoch": 0.5445255474452555, + "grad_norm": 1.2391040847938808, + "learning_rate": 4.523150788817886e-06, + "loss": 0.3107, + "step": 5595 + }, + { + "epoch": 0.5446228710462288, + "grad_norm": 1.6561577969203813, + "learning_rate": 4.521581841541483e-06, + "loss": 0.4173, + "step": 5596 + }, + { + "epoch": 0.5447201946472019, + "grad_norm": 1.3282576550823684, + "learning_rate": 4.5200129418057885e-06, + "loss": 0.3216, + "step": 5597 + }, + { + "epoch": 0.5448175182481751, + "grad_norm": 1.4024081414615832, + "learning_rate": 4.518444089766701e-06, + "loss": 0.3722, + "step": 5598 + }, + { + "epoch": 0.5449148418491484, + "grad_norm": 1.5089106538535892, + "learning_rate": 4.516875285580121e-06, + "loss": 0.303, + "step": 5599 + }, + { + "epoch": 0.5450121654501217, + "grad_norm": 1.5031317760291008, + "learning_rate": 4.51530652940194e-06, + "loss": 0.4373, + "step": 5600 + }, + { + "epoch": 0.5451094890510949, + "grad_norm": 1.471008431203656, + "learning_rate": 4.513737821388049e-06, + "loss": 0.4556, + "step": 5601 + }, + { + "epoch": 0.5452068126520682, + "grad_norm": 1.7413080844105027, + "learning_rate": 4.512169161694328e-06, + "loss": 0.5666, + "step": 5602 + }, + { + "epoch": 0.5453041362530414, + "grad_norm": 1.8071416376670888, + "learning_rate": 4.510600550476657e-06, + "loss": 0.3715, + "step": 5603 + }, + { + "epoch": 0.5454014598540146, + "grad_norm": 1.476639760735993, + "learning_rate": 4.509031987890913e-06, + "loss": 0.3323, + "step": 5604 + }, + { + "epoch": 0.5454987834549878, + "grad_norm": 1.3999815360622823, + "learning_rate": 4.507463474092959e-06, + "loss": 0.3506, + "step": 5605 + }, + { + "epoch": 0.5455961070559611, + "grad_norm": 1.4655325528983436, + "learning_rate": 4.505895009238663e-06, + "loss": 0.4597, + "step": 5606 + }, + { + "epoch": 0.5456934306569343, + "grad_norm": 1.1327268370762924, + "learning_rate": 4.504326593483883e-06, + "loss": 0.3451, + "step": 5607 + }, + { + "epoch": 0.5457907542579076, + "grad_norm": 1.2823893435741631, + "learning_rate": 4.502758226984477e-06, + "loss": 0.2785, + "step": 5608 + }, + { + "epoch": 0.5458880778588808, + "grad_norm": 1.5753262318211483, + "learning_rate": 4.501189909896289e-06, + "loss": 0.269, + "step": 5609 + }, + { + "epoch": 0.545985401459854, + "grad_norm": 1.5009827573371675, + "learning_rate": 4.499621642375166e-06, + "loss": 0.4056, + "step": 5610 + }, + { + "epoch": 0.5460827250608272, + "grad_norm": 1.4636375241999275, + "learning_rate": 4.498053424576949e-06, + "loss": 0.451, + "step": 5611 + }, + { + "epoch": 0.5461800486618005, + "grad_norm": 1.5065599684505087, + "learning_rate": 4.496485256657472e-06, + "loss": 0.4837, + "step": 5612 + }, + { + "epoch": 0.5462773722627737, + "grad_norm": 1.291365998662591, + "learning_rate": 4.4949171387725636e-06, + "loss": 0.3273, + "step": 5613 + }, + { + "epoch": 0.546374695863747, + "grad_norm": 1.5794146716287192, + "learning_rate": 4.4933490710780495e-06, + "loss": 0.4363, + "step": 5614 + }, + { + "epoch": 0.5464720194647202, + "grad_norm": 1.5379662651856578, + "learning_rate": 4.491781053729752e-06, + "loss": 0.5449, + "step": 5615 + }, + { + "epoch": 0.5465693430656934, + "grad_norm": 1.322945266574682, + "learning_rate": 4.490213086883482e-06, + "loss": 0.4107, + "step": 5616 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 1.3543729282674888, + "learning_rate": 4.48864517069505e-06, + "loss": 0.4165, + "step": 5617 + }, + { + "epoch": 0.5467639902676399, + "grad_norm": 1.5852531121498197, + "learning_rate": 4.487077305320261e-06, + "loss": 0.6354, + "step": 5618 + }, + { + "epoch": 0.5468613138686131, + "grad_norm": 1.3876975460155319, + "learning_rate": 4.4855094909149175e-06, + "loss": 0.2162, + "step": 5619 + }, + { + "epoch": 0.5469586374695864, + "grad_norm": 1.3842422077220742, + "learning_rate": 4.483941727634811e-06, + "loss": 0.4646, + "step": 5620 + }, + { + "epoch": 0.5470559610705596, + "grad_norm": 1.628949720502462, + "learning_rate": 4.482374015635733e-06, + "loss": 0.4539, + "step": 5621 + }, + { + "epoch": 0.5471532846715328, + "grad_norm": 1.3342073178854172, + "learning_rate": 4.480806355073467e-06, + "loss": 0.3692, + "step": 5622 + }, + { + "epoch": 0.547250608272506, + "grad_norm": 1.5354680722593141, + "learning_rate": 4.479238746103792e-06, + "loss": 0.2665, + "step": 5623 + }, + { + "epoch": 0.5473479318734793, + "grad_norm": 1.7451242234355575, + "learning_rate": 4.477671188882483e-06, + "loss": 0.6151, + "step": 5624 + }, + { + "epoch": 0.5474452554744526, + "grad_norm": 1.6591621564148866, + "learning_rate": 4.476103683565309e-06, + "loss": 0.4289, + "step": 5625 + }, + { + "epoch": 0.5475425790754258, + "grad_norm": 1.594493169110207, + "learning_rate": 4.474536230308036e-06, + "loss": 0.5853, + "step": 5626 + }, + { + "epoch": 0.5476399026763991, + "grad_norm": 1.2386168016252284, + "learning_rate": 4.472968829266419e-06, + "loss": 0.2942, + "step": 5627 + }, + { + "epoch": 0.5477372262773723, + "grad_norm": 1.4275181054672363, + "learning_rate": 4.4714014805962125e-06, + "loss": 0.2697, + "step": 5628 + }, + { + "epoch": 0.5478345498783455, + "grad_norm": 1.1916440632594898, + "learning_rate": 4.4698341844531655e-06, + "loss": 0.2913, + "step": 5629 + }, + { + "epoch": 0.5479318734793187, + "grad_norm": 1.6471889884187896, + "learning_rate": 4.468266940993025e-06, + "loss": 0.4558, + "step": 5630 + }, + { + "epoch": 0.548029197080292, + "grad_norm": 1.377839018544238, + "learning_rate": 4.466699750371522e-06, + "loss": 0.3219, + "step": 5631 + }, + { + "epoch": 0.5481265206812652, + "grad_norm": 1.5019494739979196, + "learning_rate": 4.465132612744394e-06, + "loss": 0.3668, + "step": 5632 + }, + { + "epoch": 0.5482238442822385, + "grad_norm": 1.3771030641989979, + "learning_rate": 4.463565528267367e-06, + "loss": 0.2507, + "step": 5633 + }, + { + "epoch": 0.5483211678832117, + "grad_norm": 1.8499678170781386, + "learning_rate": 4.4619984970961626e-06, + "loss": 0.5311, + "step": 5634 + }, + { + "epoch": 0.5484184914841849, + "grad_norm": 1.363181736328119, + "learning_rate": 4.460431519386498e-06, + "loss": 0.3994, + "step": 5635 + }, + { + "epoch": 0.5485158150851581, + "grad_norm": 1.4284270520142166, + "learning_rate": 4.458864595294085e-06, + "loss": 0.3709, + "step": 5636 + }, + { + "epoch": 0.5486131386861314, + "grad_norm": 1.2894122025064025, + "learning_rate": 4.457297724974632e-06, + "loss": 0.4439, + "step": 5637 + }, + { + "epoch": 0.5487104622871046, + "grad_norm": 1.3245364457901871, + "learning_rate": 4.4557309085838355e-06, + "loss": 0.278, + "step": 5638 + }, + { + "epoch": 0.5488077858880779, + "grad_norm": 1.306145681203802, + "learning_rate": 4.454164146277393e-06, + "loss": 0.3399, + "step": 5639 + }, + { + "epoch": 0.5489051094890511, + "grad_norm": 1.641617393076927, + "learning_rate": 4.452597438210996e-06, + "loss": 0.5549, + "step": 5640 + }, + { + "epoch": 0.5490024330900243, + "grad_norm": 1.6762169758319294, + "learning_rate": 4.451030784540327e-06, + "loss": 0.5193, + "step": 5641 + }, + { + "epoch": 0.5490997566909975, + "grad_norm": 1.7392269005403265, + "learning_rate": 4.449464185421066e-06, + "loss": 0.4333, + "step": 5642 + }, + { + "epoch": 0.5491970802919708, + "grad_norm": 1.223662331142104, + "learning_rate": 4.4478976410088875e-06, + "loss": 0.3168, + "step": 5643 + }, + { + "epoch": 0.549294403892944, + "grad_norm": 1.1527568529264938, + "learning_rate": 4.446331151459461e-06, + "loss": 0.2816, + "step": 5644 + }, + { + "epoch": 0.5493917274939173, + "grad_norm": 1.4589827408583553, + "learning_rate": 4.444764716928448e-06, + "loss": 0.4807, + "step": 5645 + }, + { + "epoch": 0.5494890510948905, + "grad_norm": 1.3650449695096383, + "learning_rate": 4.443198337571505e-06, + "loss": 0.4112, + "step": 5646 + }, + { + "epoch": 0.5495863746958638, + "grad_norm": 1.2069856209139433, + "learning_rate": 4.4416320135442855e-06, + "loss": 0.3413, + "step": 5647 + }, + { + "epoch": 0.5496836982968369, + "grad_norm": 1.2385661763085989, + "learning_rate": 4.440065745002438e-06, + "loss": 0.2793, + "step": 5648 + }, + { + "epoch": 0.5497810218978102, + "grad_norm": 1.2572932148786662, + "learning_rate": 4.4384995321016e-06, + "loss": 0.2908, + "step": 5649 + }, + { + "epoch": 0.5498783454987834, + "grad_norm": 1.3903538300228837, + "learning_rate": 4.436933374997408e-06, + "loss": 0.4652, + "step": 5650 + }, + { + "epoch": 0.5499756690997567, + "grad_norm": 1.2046055786935932, + "learning_rate": 4.435367273845496e-06, + "loss": 0.3401, + "step": 5651 + }, + { + "epoch": 0.55007299270073, + "grad_norm": 1.4198268349598033, + "learning_rate": 4.433801228801482e-06, + "loss": 0.297, + "step": 5652 + }, + { + "epoch": 0.5501703163017032, + "grad_norm": 4.186875709571215, + "learning_rate": 4.432235240020988e-06, + "loss": 0.373, + "step": 5653 + }, + { + "epoch": 0.5502676399026764, + "grad_norm": 1.1066768232323798, + "learning_rate": 4.430669307659627e-06, + "loss": 0.2626, + "step": 5654 + }, + { + "epoch": 0.5503649635036496, + "grad_norm": 1.6041719345732885, + "learning_rate": 4.429103431873009e-06, + "loss": 0.3763, + "step": 5655 + }, + { + "epoch": 0.5504622871046229, + "grad_norm": 1.4456818804536653, + "learning_rate": 4.427537612816732e-06, + "loss": 0.486, + "step": 5656 + }, + { + "epoch": 0.5505596107055961, + "grad_norm": 1.5069550759467354, + "learning_rate": 4.425971850646394e-06, + "loss": 0.4548, + "step": 5657 + }, + { + "epoch": 0.5506569343065694, + "grad_norm": 1.5218902555793785, + "learning_rate": 4.424406145517589e-06, + "loss": 0.4255, + "step": 5658 + }, + { + "epoch": 0.5507542579075426, + "grad_norm": 1.3616780032585183, + "learning_rate": 4.422840497585896e-06, + "loss": 0.2637, + "step": 5659 + }, + { + "epoch": 0.5508515815085158, + "grad_norm": 1.1705809505584408, + "learning_rate": 4.4212749070068974e-06, + "loss": 0.2304, + "step": 5660 + }, + { + "epoch": 0.550948905109489, + "grad_norm": 1.5206775411300353, + "learning_rate": 4.419709373936167e-06, + "loss": 0.4589, + "step": 5661 + }, + { + "epoch": 0.5510462287104623, + "grad_norm": 1.529486906246868, + "learning_rate": 4.418143898529276e-06, + "loss": 0.5809, + "step": 5662 + }, + { + "epoch": 0.5511435523114355, + "grad_norm": 1.4965602057951768, + "learning_rate": 4.416578480941781e-06, + "loss": 0.4344, + "step": 5663 + }, + { + "epoch": 0.5512408759124088, + "grad_norm": 1.627661066713738, + "learning_rate": 4.4150131213292406e-06, + "loss": 0.5971, + "step": 5664 + }, + { + "epoch": 0.551338199513382, + "grad_norm": 1.5204052605710328, + "learning_rate": 4.4134478198472065e-06, + "loss": 0.2793, + "step": 5665 + }, + { + "epoch": 0.5514355231143553, + "grad_norm": 1.6608910942592672, + "learning_rate": 4.411882576651224e-06, + "loss": 0.4815, + "step": 5666 + }, + { + "epoch": 0.5515328467153284, + "grad_norm": 1.5777835287666877, + "learning_rate": 4.410317391896829e-06, + "loss": 0.5266, + "step": 5667 + }, + { + "epoch": 0.5516301703163017, + "grad_norm": 1.316546333624811, + "learning_rate": 4.408752265739559e-06, + "loss": 0.3166, + "step": 5668 + }, + { + "epoch": 0.5517274939172749, + "grad_norm": 1.5371700987399954, + "learning_rate": 4.407187198334941e-06, + "loss": 0.4064, + "step": 5669 + }, + { + "epoch": 0.5518248175182482, + "grad_norm": 1.616571428829941, + "learning_rate": 4.405622189838492e-06, + "loss": 0.5294, + "step": 5670 + }, + { + "epoch": 0.5519221411192214, + "grad_norm": 1.7448740545910149, + "learning_rate": 4.404057240405733e-06, + "loss": 0.5254, + "step": 5671 + }, + { + "epoch": 0.5520194647201947, + "grad_norm": 1.6516182391127119, + "learning_rate": 4.4024923501921725e-06, + "loss": 0.3904, + "step": 5672 + }, + { + "epoch": 0.5521167883211678, + "grad_norm": 1.2729215922575123, + "learning_rate": 4.400927519353316e-06, + "loss": 0.3724, + "step": 5673 + }, + { + "epoch": 0.5522141119221411, + "grad_norm": 1.424783017553896, + "learning_rate": 4.399362748044658e-06, + "loss": 0.3236, + "step": 5674 + }, + { + "epoch": 0.5523114355231143, + "grad_norm": 1.4235511374310892, + "learning_rate": 4.397798036421693e-06, + "loss": 0.3861, + "step": 5675 + }, + { + "epoch": 0.5524087591240876, + "grad_norm": 1.1966914780753308, + "learning_rate": 4.3962333846399075e-06, + "loss": 0.3466, + "step": 5676 + }, + { + "epoch": 0.5525060827250609, + "grad_norm": 1.311271001249418, + "learning_rate": 4.394668792854782e-06, + "loss": 0.4016, + "step": 5677 + }, + { + "epoch": 0.5526034063260341, + "grad_norm": 1.2245364616140033, + "learning_rate": 4.393104261221791e-06, + "loss": 0.307, + "step": 5678 + }, + { + "epoch": 0.5527007299270073, + "grad_norm": 1.1602939305667588, + "learning_rate": 4.391539789896401e-06, + "loss": 0.2741, + "step": 5679 + }, + { + "epoch": 0.5527980535279805, + "grad_norm": 1.6516764851396022, + "learning_rate": 4.389975379034078e-06, + "loss": 0.4003, + "step": 5680 + }, + { + "epoch": 0.5528953771289538, + "grad_norm": 1.2223406009470155, + "learning_rate": 4.388411028790276e-06, + "loss": 0.2746, + "step": 5681 + }, + { + "epoch": 0.552992700729927, + "grad_norm": 1.3704512522620642, + "learning_rate": 4.386846739320445e-06, + "loss": 0.4675, + "step": 5682 + }, + { + "epoch": 0.5530900243309003, + "grad_norm": 1.4224145724853485, + "learning_rate": 4.38528251078003e-06, + "loss": 0.479, + "step": 5683 + }, + { + "epoch": 0.5531873479318735, + "grad_norm": 1.4287947905854521, + "learning_rate": 4.383718343324473e-06, + "loss": 0.442, + "step": 5684 + }, + { + "epoch": 0.5532846715328467, + "grad_norm": 1.2537548413473198, + "learning_rate": 4.3821542371092e-06, + "loss": 0.3575, + "step": 5685 + }, + { + "epoch": 0.5533819951338199, + "grad_norm": 1.3718420524377242, + "learning_rate": 4.380590192289641e-06, + "loss": 0.3794, + "step": 5686 + }, + { + "epoch": 0.5534793187347932, + "grad_norm": 1.3868837070406654, + "learning_rate": 4.379026209021216e-06, + "loss": 0.4507, + "step": 5687 + }, + { + "epoch": 0.5535766423357664, + "grad_norm": 1.2436446274636699, + "learning_rate": 4.377462287459338e-06, + "loss": 0.3465, + "step": 5688 + }, + { + "epoch": 0.5536739659367397, + "grad_norm": 1.3879155971385935, + "learning_rate": 4.3758984277594135e-06, + "loss": 0.4479, + "step": 5689 + }, + { + "epoch": 0.5537712895377129, + "grad_norm": 1.5156770677569695, + "learning_rate": 4.374334630076847e-06, + "loss": 0.451, + "step": 5690 + }, + { + "epoch": 0.5538686131386862, + "grad_norm": 1.382281443891787, + "learning_rate": 4.372770894567033e-06, + "loss": 0.3563, + "step": 5691 + }, + { + "epoch": 0.5539659367396593, + "grad_norm": 1.339720873300227, + "learning_rate": 4.371207221385361e-06, + "loss": 0.3686, + "step": 5692 + }, + { + "epoch": 0.5540632603406326, + "grad_norm": 1.3006558286211083, + "learning_rate": 4.369643610687213e-06, + "loss": 0.3267, + "step": 5693 + }, + { + "epoch": 0.5541605839416058, + "grad_norm": 1.5130281519316149, + "learning_rate": 4.368080062627967e-06, + "loss": 0.5552, + "step": 5694 + }, + { + "epoch": 0.5542579075425791, + "grad_norm": 1.5043682231045181, + "learning_rate": 4.366516577362996e-06, + "loss": 0.6171, + "step": 5695 + }, + { + "epoch": 0.5543552311435523, + "grad_norm": 1.7431379385735046, + "learning_rate": 4.36495315504766e-06, + "loss": 0.367, + "step": 5696 + }, + { + "epoch": 0.5544525547445256, + "grad_norm": 1.4365088321402324, + "learning_rate": 4.363389795837319e-06, + "loss": 0.3972, + "step": 5697 + }, + { + "epoch": 0.5545498783454987, + "grad_norm": 1.4406249867602698, + "learning_rate": 4.361826499887326e-06, + "loss": 0.4694, + "step": 5698 + }, + { + "epoch": 0.554647201946472, + "grad_norm": 4.114315181959679, + "learning_rate": 4.360263267353026e-06, + "loss": 0.5357, + "step": 5699 + }, + { + "epoch": 0.5547445255474452, + "grad_norm": 1.5396913899178413, + "learning_rate": 4.358700098389757e-06, + "loss": 0.5587, + "step": 5700 + }, + { + "epoch": 0.5548418491484185, + "grad_norm": 1.373870776059156, + "learning_rate": 4.357136993152854e-06, + "loss": 0.2794, + "step": 5701 + }, + { + "epoch": 0.5549391727493918, + "grad_norm": 1.1133884846757365, + "learning_rate": 4.3555739517976445e-06, + "loss": 0.2657, + "step": 5702 + }, + { + "epoch": 0.555036496350365, + "grad_norm": 1.5146218352253658, + "learning_rate": 4.3540109744794464e-06, + "loss": 0.4651, + "step": 5703 + }, + { + "epoch": 0.5551338199513381, + "grad_norm": 1.4914633706505942, + "learning_rate": 4.352448061353574e-06, + "loss": 0.3688, + "step": 5704 + }, + { + "epoch": 0.5552311435523114, + "grad_norm": 1.1492070325180173, + "learning_rate": 4.350885212575339e-06, + "loss": 0.3122, + "step": 5705 + }, + { + "epoch": 0.5553284671532847, + "grad_norm": 1.6119735627290095, + "learning_rate": 4.3493224283000365e-06, + "loss": 0.4979, + "step": 5706 + }, + { + "epoch": 0.5554257907542579, + "grad_norm": 1.6176815291929274, + "learning_rate": 4.3477597086829644e-06, + "loss": 0.4076, + "step": 5707 + }, + { + "epoch": 0.5555231143552312, + "grad_norm": 1.541033726575006, + "learning_rate": 4.346197053879411e-06, + "loss": 0.5452, + "step": 5708 + }, + { + "epoch": 0.5556204379562044, + "grad_norm": 1.6140867720751837, + "learning_rate": 4.344634464044659e-06, + "loss": 0.5637, + "step": 5709 + }, + { + "epoch": 0.5557177615571777, + "grad_norm": 1.372663767342997, + "learning_rate": 4.3430719393339825e-06, + "loss": 0.2497, + "step": 5710 + }, + { + "epoch": 0.5558150851581508, + "grad_norm": 1.6676721242118582, + "learning_rate": 4.341509479902652e-06, + "loss": 0.3345, + "step": 5711 + }, + { + "epoch": 0.5559124087591241, + "grad_norm": 1.5318986725427477, + "learning_rate": 4.339947085905928e-06, + "loss": 0.4867, + "step": 5712 + }, + { + "epoch": 0.5560097323600973, + "grad_norm": 1.4807254296988144, + "learning_rate": 4.338384757499069e-06, + "loss": 0.4826, + "step": 5713 + }, + { + "epoch": 0.5561070559610706, + "grad_norm": 1.3413221916117712, + "learning_rate": 4.336822494837322e-06, + "loss": 0.2589, + "step": 5714 + }, + { + "epoch": 0.5562043795620438, + "grad_norm": 1.3675349638686607, + "learning_rate": 4.335260298075932e-06, + "loss": 0.476, + "step": 5715 + }, + { + "epoch": 0.5563017031630171, + "grad_norm": 1.6791691889716327, + "learning_rate": 4.333698167370136e-06, + "loss": 0.4672, + "step": 5716 + }, + { + "epoch": 0.5563990267639902, + "grad_norm": 1.7601989822178437, + "learning_rate": 4.3321361028751615e-06, + "loss": 0.4119, + "step": 5717 + }, + { + "epoch": 0.5564963503649635, + "grad_norm": 1.5395900383761547, + "learning_rate": 4.330574104746232e-06, + "loss": 0.3063, + "step": 5718 + }, + { + "epoch": 0.5565936739659367, + "grad_norm": 1.6715153102236342, + "learning_rate": 4.329012173138565e-06, + "loss": 0.5143, + "step": 5719 + }, + { + "epoch": 0.55669099756691, + "grad_norm": 1.4393361016871615, + "learning_rate": 4.327450308207373e-06, + "loss": 0.4984, + "step": 5720 + }, + { + "epoch": 0.5567883211678832, + "grad_norm": 1.4389287234077905, + "learning_rate": 4.3258885101078565e-06, + "loss": 0.36, + "step": 5721 + }, + { + "epoch": 0.5568856447688565, + "grad_norm": 1.451516522520128, + "learning_rate": 4.324326778995212e-06, + "loss": 0.3245, + "step": 5722 + }, + { + "epoch": 0.5569829683698296, + "grad_norm": 1.5139970983782318, + "learning_rate": 4.322765115024633e-06, + "loss": 0.4673, + "step": 5723 + }, + { + "epoch": 0.5570802919708029, + "grad_norm": 1.6551885982590882, + "learning_rate": 4.321203518351298e-06, + "loss": 0.5166, + "step": 5724 + }, + { + "epoch": 0.5571776155717761, + "grad_norm": 1.293506061205028, + "learning_rate": 4.319641989130388e-06, + "loss": 0.3144, + "step": 5725 + }, + { + "epoch": 0.5572749391727494, + "grad_norm": 1.480498703625906, + "learning_rate": 4.318080527517071e-06, + "loss": 0.4621, + "step": 5726 + }, + { + "epoch": 0.5573722627737226, + "grad_norm": 1.802412969469596, + "learning_rate": 4.316519133666513e-06, + "loss": 0.5655, + "step": 5727 + }, + { + "epoch": 0.5574695863746959, + "grad_norm": 1.4758025136034716, + "learning_rate": 4.314957807733867e-06, + "loss": 0.4271, + "step": 5728 + }, + { + "epoch": 0.557566909975669, + "grad_norm": 1.3462634864634564, + "learning_rate": 4.313396549874284e-06, + "loss": 0.3833, + "step": 5729 + }, + { + "epoch": 0.5576642335766423, + "grad_norm": 1.4167591326587325, + "learning_rate": 4.311835360242908e-06, + "loss": 0.5547, + "step": 5730 + }, + { + "epoch": 0.5577615571776156, + "grad_norm": 1.4905562346969488, + "learning_rate": 4.310274238994879e-06, + "loss": 0.3975, + "step": 5731 + }, + { + "epoch": 0.5578588807785888, + "grad_norm": 1.35949627565503, + "learning_rate": 4.30871318628532e-06, + "loss": 0.4013, + "step": 5732 + }, + { + "epoch": 0.5579562043795621, + "grad_norm": 1.475535363492599, + "learning_rate": 4.307152202269356e-06, + "loss": 0.3366, + "step": 5733 + }, + { + "epoch": 0.5580535279805353, + "grad_norm": 1.409351827426732, + "learning_rate": 4.305591287102105e-06, + "loss": 0.4553, + "step": 5734 + }, + { + "epoch": 0.5581508515815086, + "grad_norm": 1.5223924231145742, + "learning_rate": 4.3040304409386735e-06, + "loss": 0.389, + "step": 5735 + }, + { + "epoch": 0.5582481751824817, + "grad_norm": 1.5817444341183888, + "learning_rate": 4.302469663934164e-06, + "loss": 0.544, + "step": 5736 + }, + { + "epoch": 0.558345498783455, + "grad_norm": 1.2791959533486645, + "learning_rate": 4.300908956243674e-06, + "loss": 0.1936, + "step": 5737 + }, + { + "epoch": 0.5584428223844282, + "grad_norm": 1.6055462672321668, + "learning_rate": 4.299348318022293e-06, + "loss": 0.4708, + "step": 5738 + }, + { + "epoch": 0.5585401459854015, + "grad_norm": 2.723561556469742, + "learning_rate": 4.297787749425096e-06, + "loss": 0.4138, + "step": 5739 + }, + { + "epoch": 0.5586374695863747, + "grad_norm": 1.5183098206163341, + "learning_rate": 4.296227250607163e-06, + "loss": 0.4572, + "step": 5740 + }, + { + "epoch": 0.558734793187348, + "grad_norm": 1.3705952925621963, + "learning_rate": 4.294666821723564e-06, + "loss": 0.2, + "step": 5741 + }, + { + "epoch": 0.5588321167883211, + "grad_norm": 1.3938022278720688, + "learning_rate": 4.293106462929353e-06, + "loss": 0.4189, + "step": 5742 + }, + { + "epoch": 0.5589294403892944, + "grad_norm": 1.573159138093502, + "learning_rate": 4.291546174379588e-06, + "loss": 0.4792, + "step": 5743 + }, + { + "epoch": 0.5590267639902676, + "grad_norm": 1.5477733122965147, + "learning_rate": 4.289985956229315e-06, + "loss": 0.381, + "step": 5744 + }, + { + "epoch": 0.5591240875912409, + "grad_norm": 1.604271658231785, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.3831, + "step": 5745 + }, + { + "epoch": 0.5592214111922141, + "grad_norm": 1.7620343181545848, + "learning_rate": 4.2868657317474e-06, + "loss": 0.4775, + "step": 5746 + }, + { + "epoch": 0.5593187347931874, + "grad_norm": 1.6098626148059358, + "learning_rate": 4.285305725725814e-06, + "loss": 0.6441, + "step": 5747 + }, + { + "epoch": 0.5594160583941605, + "grad_norm": 2.5639184886931563, + "learning_rate": 4.283745790723837e-06, + "loss": 0.3698, + "step": 5748 + }, + { + "epoch": 0.5595133819951338, + "grad_norm": 1.8228770515133712, + "learning_rate": 4.282185926896483e-06, + "loss": 0.7033, + "step": 5749 + }, + { + "epoch": 0.559610705596107, + "grad_norm": 1.592250193071945, + "learning_rate": 4.280626134398753e-06, + "loss": 0.3942, + "step": 5750 + }, + { + "epoch": 0.5597080291970803, + "grad_norm": 1.349043245723801, + "learning_rate": 4.279066413385646e-06, + "loss": 0.307, + "step": 5751 + }, + { + "epoch": 0.5598053527980535, + "grad_norm": 1.371459433976103, + "learning_rate": 4.2775067640121554e-06, + "loss": 0.345, + "step": 5752 + }, + { + "epoch": 0.5599026763990268, + "grad_norm": 1.4470123192896125, + "learning_rate": 4.27594718643326e-06, + "loss": 0.4186, + "step": 5753 + }, + { + "epoch": 0.56, + "grad_norm": 1.3477849484233957, + "learning_rate": 4.274387680803936e-06, + "loss": 0.3387, + "step": 5754 + }, + { + "epoch": 0.5600973236009732, + "grad_norm": 1.4259890888219338, + "learning_rate": 4.272828247279156e-06, + "loss": 0.4646, + "step": 5755 + }, + { + "epoch": 0.5601946472019464, + "grad_norm": 1.1946278524912584, + "learning_rate": 4.27126888601388e-06, + "loss": 0.367, + "step": 5756 + }, + { + "epoch": 0.5602919708029197, + "grad_norm": 1.5421013434092867, + "learning_rate": 4.269709597163062e-06, + "loss": 0.4504, + "step": 5757 + }, + { + "epoch": 0.560389294403893, + "grad_norm": 1.9498639157078792, + "learning_rate": 4.26815038088165e-06, + "loss": 0.4335, + "step": 5758 + }, + { + "epoch": 0.5604866180048662, + "grad_norm": 2.0494519966647915, + "learning_rate": 4.2665912373245875e-06, + "loss": 0.5742, + "step": 5759 + }, + { + "epoch": 0.5605839416058395, + "grad_norm": 1.5747561614186576, + "learning_rate": 4.265032166646801e-06, + "loss": 0.3735, + "step": 5760 + }, + { + "epoch": 0.5606812652068126, + "grad_norm": 1.843850149247229, + "learning_rate": 4.26347316900322e-06, + "loss": 0.3419, + "step": 5761 + }, + { + "epoch": 0.5607785888077859, + "grad_norm": 1.4477279686493996, + "learning_rate": 4.261914244548764e-06, + "loss": 0.332, + "step": 5762 + }, + { + "epoch": 0.5608759124087591, + "grad_norm": 1.6852735737655822, + "learning_rate": 4.260355393438345e-06, + "loss": 0.4122, + "step": 5763 + }, + { + "epoch": 0.5609732360097324, + "grad_norm": 1.4724437448995873, + "learning_rate": 4.2587966158268624e-06, + "loss": 0.2869, + "step": 5764 + }, + { + "epoch": 0.5610705596107056, + "grad_norm": 1.640785790693152, + "learning_rate": 4.2572379118692155e-06, + "loss": 0.5461, + "step": 5765 + }, + { + "epoch": 0.5611678832116789, + "grad_norm": 1.3247785707104316, + "learning_rate": 4.255679281720295e-06, + "loss": 0.2946, + "step": 5766 + }, + { + "epoch": 0.561265206812652, + "grad_norm": 1.3947349848303983, + "learning_rate": 4.254120725534983e-06, + "loss": 0.3114, + "step": 5767 + }, + { + "epoch": 0.5613625304136253, + "grad_norm": 1.4054907974625854, + "learning_rate": 4.25256224346815e-06, + "loss": 0.4066, + "step": 5768 + }, + { + "epoch": 0.5614598540145985, + "grad_norm": 1.539561776443452, + "learning_rate": 4.251003835674668e-06, + "loss": 0.3611, + "step": 5769 + }, + { + "epoch": 0.5615571776155718, + "grad_norm": 1.4914655286978868, + "learning_rate": 4.249445502309395e-06, + "loss": 0.5131, + "step": 5770 + }, + { + "epoch": 0.561654501216545, + "grad_norm": 1.519511166319719, + "learning_rate": 4.247887243527184e-06, + "loss": 0.5641, + "step": 5771 + }, + { + "epoch": 0.5617518248175183, + "grad_norm": 1.8221036424020411, + "learning_rate": 4.246329059482879e-06, + "loss": 0.5552, + "step": 5772 + }, + { + "epoch": 0.5618491484184915, + "grad_norm": 1.6851180891545114, + "learning_rate": 4.24477095033132e-06, + "loss": 0.3155, + "step": 5773 + }, + { + "epoch": 0.5619464720194647, + "grad_norm": 1.3545287112899682, + "learning_rate": 4.243212916227336e-06, + "loss": 0.3855, + "step": 5774 + }, + { + "epoch": 0.5620437956204379, + "grad_norm": 1.58335034043397, + "learning_rate": 4.241654957325749e-06, + "loss": 0.4907, + "step": 5775 + }, + { + "epoch": 0.5621411192214112, + "grad_norm": 1.5281306822501477, + "learning_rate": 4.240097073781374e-06, + "loss": 0.5293, + "step": 5776 + }, + { + "epoch": 0.5622384428223844, + "grad_norm": 1.4214115435503911, + "learning_rate": 4.238539265749022e-06, + "loss": 0.4531, + "step": 5777 + }, + { + "epoch": 0.5623357664233577, + "grad_norm": 1.3606956468499203, + "learning_rate": 4.236981533383489e-06, + "loss": 0.3181, + "step": 5778 + }, + { + "epoch": 0.562433090024331, + "grad_norm": 1.4931280557146895, + "learning_rate": 4.2354238768395705e-06, + "loss": 0.4433, + "step": 5779 + }, + { + "epoch": 0.5625304136253041, + "grad_norm": 1.5924777768678957, + "learning_rate": 4.233866296272052e-06, + "loss": 0.2889, + "step": 5780 + }, + { + "epoch": 0.5626277372262773, + "grad_norm": 1.3373551013926965, + "learning_rate": 4.23230879183571e-06, + "loss": 0.339, + "step": 5781 + }, + { + "epoch": 0.5627250608272506, + "grad_norm": 1.3025930805591237, + "learning_rate": 4.230751363685316e-06, + "loss": 0.3216, + "step": 5782 + }, + { + "epoch": 0.5628223844282239, + "grad_norm": 1.301444677764997, + "learning_rate": 4.22919401197563e-06, + "loss": 0.3536, + "step": 5783 + }, + { + "epoch": 0.5629197080291971, + "grad_norm": 1.3653717626770583, + "learning_rate": 4.22763673686141e-06, + "loss": 0.3509, + "step": 5784 + }, + { + "epoch": 0.5630170316301704, + "grad_norm": 1.183773971282679, + "learning_rate": 4.226079538497404e-06, + "loss": 0.2994, + "step": 5785 + }, + { + "epoch": 0.5631143552311435, + "grad_norm": 1.1602299214100964, + "learning_rate": 4.224522417038348e-06, + "loss": 0.337, + "step": 5786 + }, + { + "epoch": 0.5632116788321168, + "grad_norm": 2.2904940870900288, + "learning_rate": 4.2229653726389765e-06, + "loss": 0.4567, + "step": 5787 + }, + { + "epoch": 0.56330900243309, + "grad_norm": 1.487082411549415, + "learning_rate": 4.221408405454014e-06, + "loss": 0.3703, + "step": 5788 + }, + { + "epoch": 0.5634063260340633, + "grad_norm": 1.257244811281916, + "learning_rate": 4.219851515638175e-06, + "loss": 0.3922, + "step": 5789 + }, + { + "epoch": 0.5635036496350365, + "grad_norm": 1.6782680457611208, + "learning_rate": 4.218294703346171e-06, + "loss": 0.5096, + "step": 5790 + }, + { + "epoch": 0.5636009732360098, + "grad_norm": 1.4268072747672031, + "learning_rate": 4.216737968732703e-06, + "loss": 0.4082, + "step": 5791 + }, + { + "epoch": 0.5636982968369829, + "grad_norm": 1.611062507700133, + "learning_rate": 4.215181311952466e-06, + "loss": 0.4716, + "step": 5792 + }, + { + "epoch": 0.5637956204379562, + "grad_norm": 1.41265833144514, + "learning_rate": 4.213624733160143e-06, + "loss": 0.3368, + "step": 5793 + }, + { + "epoch": 0.5638929440389294, + "grad_norm": 1.5428232631419951, + "learning_rate": 4.212068232510413e-06, + "loss": 0.3018, + "step": 5794 + }, + { + "epoch": 0.5639902676399027, + "grad_norm": 1.373505560498114, + "learning_rate": 4.2105118101579505e-06, + "loss": 0.176, + "step": 5795 + }, + { + "epoch": 0.5640875912408759, + "grad_norm": 1.2307753408032756, + "learning_rate": 4.2089554662574115e-06, + "loss": 0.201, + "step": 5796 + }, + { + "epoch": 0.5641849148418492, + "grad_norm": 1.587634977474687, + "learning_rate": 4.207399200963454e-06, + "loss": 0.5812, + "step": 5797 + }, + { + "epoch": 0.5642822384428224, + "grad_norm": 1.2387652008017036, + "learning_rate": 4.205843014430724e-06, + "loss": 0.2995, + "step": 5798 + }, + { + "epoch": 0.5643795620437956, + "grad_norm": 1.6778596380542672, + "learning_rate": 4.204286906813865e-06, + "loss": 0.3894, + "step": 5799 + }, + { + "epoch": 0.5644768856447688, + "grad_norm": 1.2434548684379014, + "learning_rate": 4.202730878267503e-06, + "loss": 0.3497, + "step": 5800 + }, + { + "epoch": 0.5645742092457421, + "grad_norm": 1.7892108172776426, + "learning_rate": 4.201174928946265e-06, + "loss": 0.5834, + "step": 5801 + }, + { + "epoch": 0.5646715328467153, + "grad_norm": 1.5107863752021933, + "learning_rate": 4.199619059004764e-06, + "loss": 0.4172, + "step": 5802 + }, + { + "epoch": 0.5647688564476886, + "grad_norm": 1.670276055504903, + "learning_rate": 4.19806326859761e-06, + "loss": 0.5403, + "step": 5803 + }, + { + "epoch": 0.5648661800486618, + "grad_norm": 1.6019603700248415, + "learning_rate": 4.196507557879401e-06, + "loss": 0.461, + "step": 5804 + }, + { + "epoch": 0.564963503649635, + "grad_norm": 1.8552966871289693, + "learning_rate": 4.19495192700473e-06, + "loss": 0.308, + "step": 5805 + }, + { + "epoch": 0.5650608272506082, + "grad_norm": 1.2212880434574651, + "learning_rate": 4.193396376128183e-06, + "loss": 0.3277, + "step": 5806 + }, + { + "epoch": 0.5651581508515815, + "grad_norm": 1.3939012659208478, + "learning_rate": 4.191840905404332e-06, + "loss": 0.4034, + "step": 5807 + }, + { + "epoch": 0.5652554744525548, + "grad_norm": 1.5378765048385783, + "learning_rate": 4.190285514987746e-06, + "loss": 0.4876, + "step": 5808 + }, + { + "epoch": 0.565352798053528, + "grad_norm": 1.3633671162406111, + "learning_rate": 4.1887302050329864e-06, + "loss": 0.3709, + "step": 5809 + }, + { + "epoch": 0.5654501216545013, + "grad_norm": 1.564768784356169, + "learning_rate": 4.1871749756946075e-06, + "loss": 0.4103, + "step": 5810 + }, + { + "epoch": 0.5655474452554744, + "grad_norm": 1.724405105952962, + "learning_rate": 4.185619827127148e-06, + "loss": 0.4854, + "step": 5811 + }, + { + "epoch": 0.5656447688564477, + "grad_norm": 1.622295917436671, + "learning_rate": 4.184064759485148e-06, + "loss": 0.4564, + "step": 5812 + }, + { + "epoch": 0.5657420924574209, + "grad_norm": 1.5517152524744877, + "learning_rate": 4.182509772923134e-06, + "loss": 0.5225, + "step": 5813 + }, + { + "epoch": 0.5658394160583942, + "grad_norm": 1.4068333581415946, + "learning_rate": 4.180954867595628e-06, + "loss": 0.4325, + "step": 5814 + }, + { + "epoch": 0.5659367396593674, + "grad_norm": 1.6735239186488111, + "learning_rate": 4.179400043657138e-06, + "loss": 0.3124, + "step": 5815 + }, + { + "epoch": 0.5660340632603407, + "grad_norm": 1.5777685688239573, + "learning_rate": 4.17784530126217e-06, + "loss": 0.5863, + "step": 5816 + }, + { + "epoch": 0.5661313868613139, + "grad_norm": 1.5423998592768402, + "learning_rate": 4.176290640565223e-06, + "loss": 0.3238, + "step": 5817 + }, + { + "epoch": 0.5662287104622871, + "grad_norm": 1.2853432377266125, + "learning_rate": 4.174736061720778e-06, + "loss": 0.3231, + "step": 5818 + }, + { + "epoch": 0.5663260340632603, + "grad_norm": 1.3506108279634295, + "learning_rate": 4.173181564883318e-06, + "loss": 0.3853, + "step": 5819 + }, + { + "epoch": 0.5664233576642336, + "grad_norm": 1.4787948123078365, + "learning_rate": 4.171627150207314e-06, + "loss": 0.4626, + "step": 5820 + }, + { + "epoch": 0.5665206812652068, + "grad_norm": 1.2974132661734263, + "learning_rate": 4.170072817847232e-06, + "loss": 0.3034, + "step": 5821 + }, + { + "epoch": 0.5666180048661801, + "grad_norm": 1.6572259151028126, + "learning_rate": 4.1685185679575226e-06, + "loss": 0.4438, + "step": 5822 + }, + { + "epoch": 0.5667153284671533, + "grad_norm": 1.47164631421599, + "learning_rate": 4.166964400692633e-06, + "loss": 0.4198, + "step": 5823 + }, + { + "epoch": 0.5668126520681265, + "grad_norm": 1.7131371141330365, + "learning_rate": 4.165410316207004e-06, + "loss": 0.7682, + "step": 5824 + }, + { + "epoch": 0.5669099756690997, + "grad_norm": 1.2962205453467226, + "learning_rate": 4.1638563146550646e-06, + "loss": 0.324, + "step": 5825 + }, + { + "epoch": 0.567007299270073, + "grad_norm": 1.4390408094927323, + "learning_rate": 4.162302396191237e-06, + "loss": 0.4436, + "step": 5826 + }, + { + "epoch": 0.5671046228710462, + "grad_norm": 1.3518474476934619, + "learning_rate": 4.160748560969935e-06, + "loss": 0.3033, + "step": 5827 + }, + { + "epoch": 0.5672019464720195, + "grad_norm": 4.787110968736985, + "learning_rate": 4.159194809145567e-06, + "loss": 0.2463, + "step": 5828 + }, + { + "epoch": 0.5672992700729927, + "grad_norm": 1.525896562397849, + "learning_rate": 4.157641140872524e-06, + "loss": 0.5168, + "step": 5829 + }, + { + "epoch": 0.5673965936739659, + "grad_norm": 1.6175950868182933, + "learning_rate": 4.1560875563052e-06, + "loss": 0.617, + "step": 5830 + }, + { + "epoch": 0.5674939172749391, + "grad_norm": 1.1931895932856638, + "learning_rate": 4.154534055597973e-06, + "loss": 0.2237, + "step": 5831 + }, + { + "epoch": 0.5675912408759124, + "grad_norm": 1.448923661863635, + "learning_rate": 4.15298063890522e-06, + "loss": 0.4892, + "step": 5832 + }, + { + "epoch": 0.5676885644768856, + "grad_norm": 1.617135751974373, + "learning_rate": 4.151427306381298e-06, + "loss": 0.3665, + "step": 5833 + }, + { + "epoch": 0.5677858880778589, + "grad_norm": 1.4221255741874326, + "learning_rate": 4.1498740581805675e-06, + "loss": 0.4273, + "step": 5834 + }, + { + "epoch": 0.5678832116788322, + "grad_norm": 1.0648209522518866, + "learning_rate": 4.148320894457375e-06, + "loss": 0.2857, + "step": 5835 + }, + { + "epoch": 0.5679805352798053, + "grad_norm": 1.4399789913287415, + "learning_rate": 4.146767815366058e-06, + "loss": 0.4965, + "step": 5836 + }, + { + "epoch": 0.5680778588807786, + "grad_norm": 1.4435146387650877, + "learning_rate": 4.1452148210609466e-06, + "loss": 0.4407, + "step": 5837 + }, + { + "epoch": 0.5681751824817518, + "grad_norm": 1.327625898939338, + "learning_rate": 4.143661911696365e-06, + "loss": 0.3226, + "step": 5838 + }, + { + "epoch": 0.5682725060827251, + "grad_norm": 1.4223642031676123, + "learning_rate": 4.142109087426625e-06, + "loss": 0.3701, + "step": 5839 + }, + { + "epoch": 0.5683698296836983, + "grad_norm": 1.4250017935758312, + "learning_rate": 4.140556348406033e-06, + "loss": 0.4687, + "step": 5840 + }, + { + "epoch": 0.5684671532846716, + "grad_norm": 3.7430897940959205, + "learning_rate": 4.139003694788885e-06, + "loss": 0.3839, + "step": 5841 + }, + { + "epoch": 0.5685644768856448, + "grad_norm": 1.5338848566461847, + "learning_rate": 4.13745112672947e-06, + "loss": 0.4931, + "step": 5842 + }, + { + "epoch": 0.568661800486618, + "grad_norm": 1.7129285540671277, + "learning_rate": 4.135898644382065e-06, + "loss": 0.3602, + "step": 5843 + }, + { + "epoch": 0.5687591240875912, + "grad_norm": 1.5385589550427017, + "learning_rate": 4.1343462479009425e-06, + "loss": 0.3521, + "step": 5844 + }, + { + "epoch": 0.5688564476885645, + "grad_norm": 1.3906904055183722, + "learning_rate": 4.132793937440366e-06, + "loss": 0.5322, + "step": 5845 + }, + { + "epoch": 0.5689537712895377, + "grad_norm": 1.5661330139359453, + "learning_rate": 4.13124171315459e-06, + "loss": 0.443, + "step": 5846 + }, + { + "epoch": 0.569051094890511, + "grad_norm": 1.5404268210814747, + "learning_rate": 4.129689575197857e-06, + "loss": 0.308, + "step": 5847 + }, + { + "epoch": 0.5691484184914842, + "grad_norm": 1.274700451238236, + "learning_rate": 4.128137523724407e-06, + "loss": 0.3413, + "step": 5848 + }, + { + "epoch": 0.5692457420924574, + "grad_norm": 1.5479710866706966, + "learning_rate": 4.126585558888466e-06, + "loss": 0.6331, + "step": 5849 + }, + { + "epoch": 0.5693430656934306, + "grad_norm": 1.3218775395217923, + "learning_rate": 4.125033680844257e-06, + "loss": 0.2476, + "step": 5850 + }, + { + "epoch": 0.5694403892944039, + "grad_norm": 1.6071205273848088, + "learning_rate": 4.123481889745987e-06, + "loss": 0.3284, + "step": 5851 + }, + { + "epoch": 0.5695377128953771, + "grad_norm": 1.93464499516687, + "learning_rate": 4.1219301857478615e-06, + "loss": 0.4431, + "step": 5852 + }, + { + "epoch": 0.5696350364963504, + "grad_norm": 1.4423423146713994, + "learning_rate": 4.120378569004074e-06, + "loss": 0.3544, + "step": 5853 + }, + { + "epoch": 0.5697323600973236, + "grad_norm": 1.5059937888393515, + "learning_rate": 4.118827039668808e-06, + "loss": 0.4748, + "step": 5854 + }, + { + "epoch": 0.5698296836982968, + "grad_norm": 1.5952685732122698, + "learning_rate": 4.1172755978962395e-06, + "loss": 0.5302, + "step": 5855 + }, + { + "epoch": 0.56992700729927, + "grad_norm": 1.8488532094666221, + "learning_rate": 4.115724243840537e-06, + "loss": 0.4407, + "step": 5856 + }, + { + "epoch": 0.5700243309002433, + "grad_norm": 1.3853091530444268, + "learning_rate": 4.114172977655863e-06, + "loss": 0.2285, + "step": 5857 + }, + { + "epoch": 0.5701216545012165, + "grad_norm": 1.6154002020331932, + "learning_rate": 4.112621799496362e-06, + "loss": 0.3314, + "step": 5858 + }, + { + "epoch": 0.5702189781021898, + "grad_norm": 1.5521941006794486, + "learning_rate": 4.111070709516178e-06, + "loss": 0.5284, + "step": 5859 + }, + { + "epoch": 0.570316301703163, + "grad_norm": 1.2797231182502546, + "learning_rate": 4.109519707869447e-06, + "loss": 0.2959, + "step": 5860 + }, + { + "epoch": 0.5704136253041363, + "grad_norm": 2.052441780296125, + "learning_rate": 4.107968794710287e-06, + "loss": 0.4323, + "step": 5861 + }, + { + "epoch": 0.5705109489051094, + "grad_norm": 1.573755693129952, + "learning_rate": 4.106417970192817e-06, + "loss": 0.3962, + "step": 5862 + }, + { + "epoch": 0.5706082725060827, + "grad_norm": 1.4515060101738473, + "learning_rate": 4.1048672344711416e-06, + "loss": 0.306, + "step": 5863 + }, + { + "epoch": 0.570705596107056, + "grad_norm": 2.3333979552266806, + "learning_rate": 4.103316587699362e-06, + "loss": 0.5012, + "step": 5864 + }, + { + "epoch": 0.5708029197080292, + "grad_norm": 1.3698113250288773, + "learning_rate": 4.101766030031562e-06, + "loss": 0.2574, + "step": 5865 + }, + { + "epoch": 0.5709002433090025, + "grad_norm": 1.5962661627636017, + "learning_rate": 4.100215561621824e-06, + "loss": 0.5743, + "step": 5866 + }, + { + "epoch": 0.5709975669099757, + "grad_norm": 1.1767731091783282, + "learning_rate": 4.098665182624219e-06, + "loss": 0.2687, + "step": 5867 + }, + { + "epoch": 0.5710948905109489, + "grad_norm": 1.481903266910654, + "learning_rate": 4.09711489319281e-06, + "loss": 0.371, + "step": 5868 + }, + { + "epoch": 0.5711922141119221, + "grad_norm": 3.0869616390473187, + "learning_rate": 4.095564693481647e-06, + "loss": 0.4937, + "step": 5869 + }, + { + "epoch": 0.5712895377128954, + "grad_norm": 1.4882391976250102, + "learning_rate": 4.094014583644776e-06, + "loss": 0.4624, + "step": 5870 + }, + { + "epoch": 0.5713868613138686, + "grad_norm": 1.5597033076707199, + "learning_rate": 4.092464563836235e-06, + "loss": 0.2965, + "step": 5871 + }, + { + "epoch": 0.5714841849148419, + "grad_norm": 1.4837817160889117, + "learning_rate": 4.090914634210047e-06, + "loss": 0.4667, + "step": 5872 + }, + { + "epoch": 0.5715815085158151, + "grad_norm": 1.3993305785679218, + "learning_rate": 4.0893647949202295e-06, + "loss": 0.2788, + "step": 5873 + }, + { + "epoch": 0.5716788321167883, + "grad_norm": 1.6431800927654712, + "learning_rate": 4.087815046120793e-06, + "loss": 0.4118, + "step": 5874 + }, + { + "epoch": 0.5717761557177615, + "grad_norm": 1.5376803780790709, + "learning_rate": 4.086265387965738e-06, + "loss": 0.379, + "step": 5875 + }, + { + "epoch": 0.5718734793187348, + "grad_norm": 1.2067547824117613, + "learning_rate": 4.08471582060905e-06, + "loss": 0.2887, + "step": 5876 + }, + { + "epoch": 0.571970802919708, + "grad_norm": 1.6017665826050145, + "learning_rate": 4.083166344204714e-06, + "loss": 0.2924, + "step": 5877 + }, + { + "epoch": 0.5720681265206813, + "grad_norm": 1.3717343825307584, + "learning_rate": 4.081616958906704e-06, + "loss": 0.3096, + "step": 5878 + }, + { + "epoch": 0.5721654501216545, + "grad_norm": 1.5016467274235754, + "learning_rate": 4.0800676648689784e-06, + "loss": 0.4965, + "step": 5879 + }, + { + "epoch": 0.5722627737226277, + "grad_norm": 1.5870899842392396, + "learning_rate": 4.078518462245496e-06, + "loss": 0.5304, + "step": 5880 + }, + { + "epoch": 0.5723600973236009, + "grad_norm": 1.6020729554050612, + "learning_rate": 4.076969351190199e-06, + "loss": 0.4312, + "step": 5881 + }, + { + "epoch": 0.5724574209245742, + "grad_norm": 1.802059687935232, + "learning_rate": 4.075420331857027e-06, + "loss": 0.5438, + "step": 5882 + }, + { + "epoch": 0.5725547445255474, + "grad_norm": 1.4712870174602888, + "learning_rate": 4.073871404399904e-06, + "loss": 0.2845, + "step": 5883 + }, + { + "epoch": 0.5726520681265207, + "grad_norm": 1.6255044887625074, + "learning_rate": 4.072322568972748e-06, + "loss": 0.5035, + "step": 5884 + }, + { + "epoch": 0.572749391727494, + "grad_norm": 1.3042007726205176, + "learning_rate": 4.0707738257294685e-06, + "loss": 0.3706, + "step": 5885 + }, + { + "epoch": 0.5728467153284672, + "grad_norm": 1.4403083569133328, + "learning_rate": 4.069225174823968e-06, + "loss": 0.4408, + "step": 5886 + }, + { + "epoch": 0.5729440389294403, + "grad_norm": 1.4172122503136095, + "learning_rate": 4.067676616410131e-06, + "loss": 0.3956, + "step": 5887 + }, + { + "epoch": 0.5730413625304136, + "grad_norm": 1.570813454958392, + "learning_rate": 4.0661281506418415e-06, + "loss": 0.3844, + "step": 5888 + }, + { + "epoch": 0.5731386861313869, + "grad_norm": 2.69485000844076, + "learning_rate": 4.064579777672974e-06, + "loss": 0.5282, + "step": 5889 + }, + { + "epoch": 0.5732360097323601, + "grad_norm": 2.729473669436288, + "learning_rate": 4.0630314976573875e-06, + "loss": 0.3562, + "step": 5890 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 1.4667635531646994, + "learning_rate": 4.061483310748936e-06, + "loss": 0.4355, + "step": 5891 + }, + { + "epoch": 0.5734306569343066, + "grad_norm": 1.640858224912779, + "learning_rate": 4.059935217101466e-06, + "loss": 0.2991, + "step": 5892 + }, + { + "epoch": 0.5735279805352798, + "grad_norm": 1.6782606585064863, + "learning_rate": 4.058387216868812e-06, + "loss": 0.5343, + "step": 5893 + }, + { + "epoch": 0.573625304136253, + "grad_norm": 1.3295772466437539, + "learning_rate": 4.056839310204798e-06, + "loss": 0.3044, + "step": 5894 + }, + { + "epoch": 0.5737226277372263, + "grad_norm": 1.5424168625843726, + "learning_rate": 4.055291497263241e-06, + "loss": 0.3207, + "step": 5895 + }, + { + "epoch": 0.5738199513381995, + "grad_norm": 1.5329511107321385, + "learning_rate": 4.053743778197951e-06, + "loss": 0.4069, + "step": 5896 + }, + { + "epoch": 0.5739172749391728, + "grad_norm": 1.2981917808667458, + "learning_rate": 4.052196153162721e-06, + "loss": 0.3858, + "step": 5897 + }, + { + "epoch": 0.574014598540146, + "grad_norm": 1.4777369596744594, + "learning_rate": 4.0506486223113416e-06, + "loss": 0.3397, + "step": 5898 + }, + { + "epoch": 0.5741119221411192, + "grad_norm": 1.4437053095518653, + "learning_rate": 4.049101185797592e-06, + "loss": 0.2613, + "step": 5899 + }, + { + "epoch": 0.5742092457420924, + "grad_norm": 1.440015250893471, + "learning_rate": 4.047553843775245e-06, + "loss": 0.3958, + "step": 5900 + }, + { + "epoch": 0.5743065693430657, + "grad_norm": 1.661304462765742, + "learning_rate": 4.046006596398055e-06, + "loss": 0.4794, + "step": 5901 + }, + { + "epoch": 0.5744038929440389, + "grad_norm": 1.4906714762034023, + "learning_rate": 4.044459443819777e-06, + "loss": 0.4079, + "step": 5902 + }, + { + "epoch": 0.5745012165450122, + "grad_norm": 1.7062130152009058, + "learning_rate": 4.042912386194151e-06, + "loss": 0.2706, + "step": 5903 + }, + { + "epoch": 0.5745985401459854, + "grad_norm": 1.497257427508622, + "learning_rate": 4.04136542367491e-06, + "loss": 0.2953, + "step": 5904 + }, + { + "epoch": 0.5746958637469587, + "grad_norm": 1.5618541145531117, + "learning_rate": 4.039818556415775e-06, + "loss": 0.2384, + "step": 5905 + }, + { + "epoch": 0.5747931873479318, + "grad_norm": 1.4904780157464865, + "learning_rate": 4.038271784570461e-06, + "loss": 0.4474, + "step": 5906 + }, + { + "epoch": 0.5748905109489051, + "grad_norm": 1.5448198634442605, + "learning_rate": 4.036725108292673e-06, + "loss": 0.2991, + "step": 5907 + }, + { + "epoch": 0.5749878345498783, + "grad_norm": 1.0511404224851633, + "learning_rate": 4.035178527736099e-06, + "loss": 0.2233, + "step": 5908 + }, + { + "epoch": 0.5750851581508516, + "grad_norm": 1.9371543083664262, + "learning_rate": 4.033632043054429e-06, + "loss": 0.4486, + "step": 5909 + }, + { + "epoch": 0.5751824817518248, + "grad_norm": 1.297375410121607, + "learning_rate": 4.032085654401337e-06, + "loss": 0.4129, + "step": 5910 + }, + { + "epoch": 0.5752798053527981, + "grad_norm": 2.079135046994335, + "learning_rate": 4.030539361930491e-06, + "loss": 0.3684, + "step": 5911 + }, + { + "epoch": 0.5753771289537712, + "grad_norm": 1.0717900790281243, + "learning_rate": 4.028993165795541e-06, + "loss": 0.2739, + "step": 5912 + }, + { + "epoch": 0.5754744525547445, + "grad_norm": 1.2387767967459138, + "learning_rate": 4.027447066150138e-06, + "loss": 0.3119, + "step": 5913 + }, + { + "epoch": 0.5755717761557178, + "grad_norm": 1.3658545399373896, + "learning_rate": 4.02590106314792e-06, + "loss": 0.4284, + "step": 5914 + }, + { + "epoch": 0.575669099756691, + "grad_norm": 1.4542222341215179, + "learning_rate": 4.02435515694251e-06, + "loss": 0.3987, + "step": 5915 + }, + { + "epoch": 0.5757664233576643, + "grad_norm": 1.356249933198848, + "learning_rate": 4.022809347687527e-06, + "loss": 0.3397, + "step": 5916 + }, + { + "epoch": 0.5758637469586375, + "grad_norm": 1.3049778415362594, + "learning_rate": 4.021263635536581e-06, + "loss": 0.2848, + "step": 5917 + }, + { + "epoch": 0.5759610705596107, + "grad_norm": 1.331455273987577, + "learning_rate": 4.019718020643269e-06, + "loss": 0.3336, + "step": 5918 + }, + { + "epoch": 0.5760583941605839, + "grad_norm": 1.307637289290154, + "learning_rate": 4.018172503161179e-06, + "loss": 0.3612, + "step": 5919 + }, + { + "epoch": 0.5761557177615572, + "grad_norm": 1.7867104340406879, + "learning_rate": 4.016627083243891e-06, + "loss": 0.2943, + "step": 5920 + }, + { + "epoch": 0.5762530413625304, + "grad_norm": 1.360497329252258, + "learning_rate": 4.015081761044975e-06, + "loss": 0.3606, + "step": 5921 + }, + { + "epoch": 0.5763503649635037, + "grad_norm": 1.1556745592201176, + "learning_rate": 4.013536536717991e-06, + "loss": 0.283, + "step": 5922 + }, + { + "epoch": 0.5764476885644769, + "grad_norm": 1.6250093899969087, + "learning_rate": 4.011991410416486e-06, + "loss": 0.5778, + "step": 5923 + }, + { + "epoch": 0.5765450121654502, + "grad_norm": 1.6953126406339947, + "learning_rate": 4.010446382294001e-06, + "loss": 0.4984, + "step": 5924 + }, + { + "epoch": 0.5766423357664233, + "grad_norm": 1.4142486281163247, + "learning_rate": 4.008901452504069e-06, + "loss": 0.2631, + "step": 5925 + }, + { + "epoch": 0.5767396593673966, + "grad_norm": 1.402890112232536, + "learning_rate": 4.007356621200208e-06, + "loss": 0.3787, + "step": 5926 + }, + { + "epoch": 0.5768369829683698, + "grad_norm": 1.3405076764750274, + "learning_rate": 4.005811888535929e-06, + "loss": 0.3728, + "step": 5927 + }, + { + "epoch": 0.5769343065693431, + "grad_norm": 1.2939337503168566, + "learning_rate": 4.0042672546647345e-06, + "loss": 0.3303, + "step": 5928 + }, + { + "epoch": 0.5770316301703163, + "grad_norm": 1.4749454746836559, + "learning_rate": 4.002722719740115e-06, + "loss": 0.5413, + "step": 5929 + }, + { + "epoch": 0.5771289537712896, + "grad_norm": 1.4847428518554533, + "learning_rate": 4.001178283915552e-06, + "loss": 0.3488, + "step": 5930 + }, + { + "epoch": 0.5772262773722627, + "grad_norm": 1.3332206360555243, + "learning_rate": 3.999633947344516e-06, + "loss": 0.3981, + "step": 5931 + }, + { + "epoch": 0.577323600973236, + "grad_norm": 1.4944277418937246, + "learning_rate": 3.99808971018047e-06, + "loss": 0.408, + "step": 5932 + }, + { + "epoch": 0.5774209245742092, + "grad_norm": 1.737536187103843, + "learning_rate": 3.996545572576866e-06, + "loss": 0.5298, + "step": 5933 + }, + { + "epoch": 0.5775182481751825, + "grad_norm": 1.2474936630760807, + "learning_rate": 3.995001534687145e-06, + "loss": 0.3373, + "step": 5934 + }, + { + "epoch": 0.5776155717761557, + "grad_norm": 1.4019318297079624, + "learning_rate": 3.993457596664738e-06, + "loss": 0.4404, + "step": 5935 + }, + { + "epoch": 0.577712895377129, + "grad_norm": 1.5277272399095878, + "learning_rate": 3.99191375866307e-06, + "loss": 0.3877, + "step": 5936 + }, + { + "epoch": 0.5778102189781021, + "grad_norm": 1.3728494732881964, + "learning_rate": 3.99037002083555e-06, + "loss": 0.4021, + "step": 5937 + }, + { + "epoch": 0.5779075425790754, + "grad_norm": 1.4679863750542683, + "learning_rate": 3.988826383335582e-06, + "loss": 0.2306, + "step": 5938 + }, + { + "epoch": 0.5780048661800486, + "grad_norm": 1.4183002010977648, + "learning_rate": 3.987282846316557e-06, + "loss": 0.3651, + "step": 5939 + }, + { + "epoch": 0.5781021897810219, + "grad_norm": 1.368718973443921, + "learning_rate": 3.98573940993186e-06, + "loss": 0.2663, + "step": 5940 + }, + { + "epoch": 0.5781995133819952, + "grad_norm": 1.424469979748807, + "learning_rate": 3.98419607433486e-06, + "loss": 0.4032, + "step": 5941 + }, + { + "epoch": 0.5782968369829684, + "grad_norm": 1.5021555829848496, + "learning_rate": 3.98265283967892e-06, + "loss": 0.3981, + "step": 5942 + }, + { + "epoch": 0.5783941605839416, + "grad_norm": 1.1988530300284257, + "learning_rate": 3.9811097061173955e-06, + "loss": 0.274, + "step": 5943 + }, + { + "epoch": 0.5784914841849148, + "grad_norm": 1.4405791434646016, + "learning_rate": 3.979566673803623e-06, + "loss": 0.3331, + "step": 5944 + }, + { + "epoch": 0.5785888077858881, + "grad_norm": 1.534921565086639, + "learning_rate": 3.978023742890937e-06, + "loss": 0.4452, + "step": 5945 + }, + { + "epoch": 0.5786861313868613, + "grad_norm": 1.4450376507034977, + "learning_rate": 3.9764809135326606e-06, + "loss": 0.4099, + "step": 5946 + }, + { + "epoch": 0.5787834549878346, + "grad_norm": 1.5192232187486505, + "learning_rate": 3.974938185882106e-06, + "loss": 0.3869, + "step": 5947 + }, + { + "epoch": 0.5788807785888078, + "grad_norm": 1.962930100077256, + "learning_rate": 3.973395560092572e-06, + "loss": 0.3893, + "step": 5948 + }, + { + "epoch": 0.5789781021897811, + "grad_norm": 1.5090216497507596, + "learning_rate": 3.971853036317353e-06, + "loss": 0.2462, + "step": 5949 + }, + { + "epoch": 0.5790754257907542, + "grad_norm": 1.4732758612376924, + "learning_rate": 3.970310614709729e-06, + "loss": 0.4167, + "step": 5950 + }, + { + "epoch": 0.5791727493917275, + "grad_norm": 1.5161312083783107, + "learning_rate": 3.968768295422974e-06, + "loss": 0.4823, + "step": 5951 + }, + { + "epoch": 0.5792700729927007, + "grad_norm": 1.271524314428882, + "learning_rate": 3.967226078610346e-06, + "loss": 0.2618, + "step": 5952 + }, + { + "epoch": 0.579367396593674, + "grad_norm": 1.6686446453083665, + "learning_rate": 3.965683964425098e-06, + "loss": 0.5373, + "step": 5953 + }, + { + "epoch": 0.5794647201946472, + "grad_norm": 1.5677241158773498, + "learning_rate": 3.964141953020472e-06, + "loss": 0.4947, + "step": 5954 + }, + { + "epoch": 0.5795620437956205, + "grad_norm": 1.6303707281605235, + "learning_rate": 3.962600044549694e-06, + "loss": 0.3536, + "step": 5955 + }, + { + "epoch": 0.5796593673965936, + "grad_norm": 1.390322405864935, + "learning_rate": 3.961058239165987e-06, + "loss": 0.2983, + "step": 5956 + }, + { + "epoch": 0.5797566909975669, + "grad_norm": 1.2547340868974297, + "learning_rate": 3.959516537022561e-06, + "loss": 0.4017, + "step": 5957 + }, + { + "epoch": 0.5798540145985401, + "grad_norm": 1.6559553064129353, + "learning_rate": 3.95797493827262e-06, + "loss": 0.3667, + "step": 5958 + }, + { + "epoch": 0.5799513381995134, + "grad_norm": 1.400516009603213, + "learning_rate": 3.956433443069346e-06, + "loss": 0.2541, + "step": 5959 + }, + { + "epoch": 0.5800486618004866, + "grad_norm": 1.4053238255928961, + "learning_rate": 3.954892051565923e-06, + "loss": 0.4111, + "step": 5960 + }, + { + "epoch": 0.5801459854014599, + "grad_norm": 1.5918312490297373, + "learning_rate": 3.953350763915521e-06, + "loss": 0.4221, + "step": 5961 + }, + { + "epoch": 0.580243309002433, + "grad_norm": 1.68289191183515, + "learning_rate": 3.951809580271295e-06, + "loss": 0.4138, + "step": 5962 + }, + { + "epoch": 0.5803406326034063, + "grad_norm": 1.72947481772404, + "learning_rate": 3.950268500786396e-06, + "loss": 0.5751, + "step": 5963 + }, + { + "epoch": 0.5804379562043795, + "grad_norm": 1.3780465620559466, + "learning_rate": 3.948727525613961e-06, + "loss": 0.37, + "step": 5964 + }, + { + "epoch": 0.5805352798053528, + "grad_norm": 1.4712553368068928, + "learning_rate": 3.94718665490712e-06, + "loss": 0.2758, + "step": 5965 + }, + { + "epoch": 0.580632603406326, + "grad_norm": 1.5602113365415762, + "learning_rate": 3.9456458888189856e-06, + "loss": 0.3545, + "step": 5966 + }, + { + "epoch": 0.5807299270072993, + "grad_norm": 1.508331841877293, + "learning_rate": 3.944105227502667e-06, + "loss": 0.3542, + "step": 5967 + }, + { + "epoch": 0.5808272506082726, + "grad_norm": 1.584067157882251, + "learning_rate": 3.942564671111262e-06, + "loss": 0.6656, + "step": 5968 + }, + { + "epoch": 0.5809245742092457, + "grad_norm": 1.4773530004606112, + "learning_rate": 3.9410242197978575e-06, + "loss": 0.2512, + "step": 5969 + }, + { + "epoch": 0.581021897810219, + "grad_norm": 1.4681060678066031, + "learning_rate": 3.939483873715525e-06, + "loss": 0.4127, + "step": 5970 + }, + { + "epoch": 0.5811192214111922, + "grad_norm": 1.5003292993845707, + "learning_rate": 3.937943633017331e-06, + "loss": 0.4188, + "step": 5971 + }, + { + "epoch": 0.5812165450121655, + "grad_norm": 1.4828753475139602, + "learning_rate": 3.936403497856333e-06, + "loss": 0.3884, + "step": 5972 + }, + { + "epoch": 0.5813138686131387, + "grad_norm": 1.4017949212477416, + "learning_rate": 3.934863468385572e-06, + "loss": 0.18, + "step": 5973 + }, + { + "epoch": 0.581411192214112, + "grad_norm": 1.5505106423217885, + "learning_rate": 3.933323544758083e-06, + "loss": 0.348, + "step": 5974 + }, + { + "epoch": 0.5815085158150851, + "grad_norm": 1.7306909698333623, + "learning_rate": 3.931783727126888e-06, + "loss": 0.4749, + "step": 5975 + }, + { + "epoch": 0.5816058394160584, + "grad_norm": 1.3779307256596018, + "learning_rate": 3.930244015645004e-06, + "loss": 0.383, + "step": 5976 + }, + { + "epoch": 0.5817031630170316, + "grad_norm": 1.3276773932659995, + "learning_rate": 3.928704410465426e-06, + "loss": 0.4589, + "step": 5977 + }, + { + "epoch": 0.5818004866180049, + "grad_norm": 1.5639466264695725, + "learning_rate": 3.92716491174115e-06, + "loss": 0.3932, + "step": 5978 + }, + { + "epoch": 0.5818978102189781, + "grad_norm": 1.552109156997534, + "learning_rate": 3.925625519625159e-06, + "loss": 0.4782, + "step": 5979 + }, + { + "epoch": 0.5819951338199514, + "grad_norm": 1.3282942720139936, + "learning_rate": 3.924086234270417e-06, + "loss": 0.469, + "step": 5980 + }, + { + "epoch": 0.5820924574209245, + "grad_norm": 1.24846177823519, + "learning_rate": 3.922547055829888e-06, + "loss": 0.2714, + "step": 5981 + }, + { + "epoch": 0.5821897810218978, + "grad_norm": 1.7363894336977386, + "learning_rate": 3.921007984456521e-06, + "loss": 0.6387, + "step": 5982 + }, + { + "epoch": 0.582287104622871, + "grad_norm": 1.4351834414527354, + "learning_rate": 3.919469020303254e-06, + "loss": 0.5076, + "step": 5983 + }, + { + "epoch": 0.5823844282238443, + "grad_norm": 1.2278370643463414, + "learning_rate": 3.917930163523014e-06, + "loss": 0.2925, + "step": 5984 + }, + { + "epoch": 0.5824817518248175, + "grad_norm": 1.596615328643663, + "learning_rate": 3.9163914142687185e-06, + "loss": 0.4553, + "step": 5985 + }, + { + "epoch": 0.5825790754257908, + "grad_norm": 1.8893893909568775, + "learning_rate": 3.914852772693274e-06, + "loss": 0.584, + "step": 5986 + }, + { + "epoch": 0.5826763990267639, + "grad_norm": 1.5446687628466054, + "learning_rate": 3.913314238949579e-06, + "loss": 0.3296, + "step": 5987 + }, + { + "epoch": 0.5827737226277372, + "grad_norm": 1.4060653565303132, + "learning_rate": 3.911775813190512e-06, + "loss": 0.5253, + "step": 5988 + }, + { + "epoch": 0.5828710462287104, + "grad_norm": 1.8736522281069823, + "learning_rate": 3.910237495568953e-06, + "loss": 0.3444, + "step": 5989 + }, + { + "epoch": 0.5829683698296837, + "grad_norm": 1.6316058797426887, + "learning_rate": 3.908699286237766e-06, + "loss": 0.4836, + "step": 5990 + }, + { + "epoch": 0.583065693430657, + "grad_norm": 1.6721928374594346, + "learning_rate": 3.9071611853498e-06, + "loss": 0.5202, + "step": 5991 + }, + { + "epoch": 0.5831630170316302, + "grad_norm": 1.5119042886433613, + "learning_rate": 3.905623193057898e-06, + "loss": 0.5214, + "step": 5992 + }, + { + "epoch": 0.5832603406326035, + "grad_norm": 1.5551785407648202, + "learning_rate": 3.904085309514892e-06, + "loss": 0.3067, + "step": 5993 + }, + { + "epoch": 0.5833576642335766, + "grad_norm": 3.193003671416416, + "learning_rate": 3.9025475348736045e-06, + "loss": 0.367, + "step": 5994 + }, + { + "epoch": 0.5834549878345499, + "grad_norm": 1.5549766999979842, + "learning_rate": 3.901009869286841e-06, + "loss": 0.439, + "step": 5995 + }, + { + "epoch": 0.5835523114355231, + "grad_norm": 1.690967247355317, + "learning_rate": 3.899472312907402e-06, + "loss": 0.4892, + "step": 5996 + }, + { + "epoch": 0.5836496350364964, + "grad_norm": 1.4513771456172895, + "learning_rate": 3.897934865888079e-06, + "loss": 0.4441, + "step": 5997 + }, + { + "epoch": 0.5837469586374696, + "grad_norm": 1.373683125444904, + "learning_rate": 3.896397528381642e-06, + "loss": 0.4698, + "step": 5998 + }, + { + "epoch": 0.5838442822384429, + "grad_norm": 2.093406649093295, + "learning_rate": 3.894860300540861e-06, + "loss": 0.6103, + "step": 5999 + }, + { + "epoch": 0.583941605839416, + "grad_norm": 1.5935136208360565, + "learning_rate": 3.893323182518492e-06, + "loss": 0.4388, + "step": 6000 + }, + { + "epoch": 0.5840389294403893, + "grad_norm": 1.3272675382885624, + "learning_rate": 3.891786174467281e-06, + "loss": 0.352, + "step": 6001 + }, + { + "epoch": 0.5841362530413625, + "grad_norm": 1.159572570089231, + "learning_rate": 3.8902492765399565e-06, + "loss": 0.261, + "step": 6002 + }, + { + "epoch": 0.5842335766423358, + "grad_norm": 1.6999281531466546, + "learning_rate": 3.888712488889243e-06, + "loss": 0.4206, + "step": 6003 + }, + { + "epoch": 0.584330900243309, + "grad_norm": 2.632293650046838, + "learning_rate": 3.8871758116678536e-06, + "loss": 0.3876, + "step": 6004 + }, + { + "epoch": 0.5844282238442823, + "grad_norm": 1.2638049755722311, + "learning_rate": 3.885639245028489e-06, + "loss": 0.3541, + "step": 6005 + }, + { + "epoch": 0.5845255474452554, + "grad_norm": 1.5066151979539468, + "learning_rate": 3.884102789123835e-06, + "loss": 0.4295, + "step": 6006 + }, + { + "epoch": 0.5846228710462287, + "grad_norm": 1.5095710579673731, + "learning_rate": 3.882566444106573e-06, + "loss": 0.392, + "step": 6007 + }, + { + "epoch": 0.5847201946472019, + "grad_norm": 1.7154230690330807, + "learning_rate": 3.881030210129373e-06, + "loss": 0.5574, + "step": 6008 + }, + { + "epoch": 0.5848175182481752, + "grad_norm": 1.436552647774059, + "learning_rate": 3.8794940873448865e-06, + "loss": 0.4207, + "step": 6009 + }, + { + "epoch": 0.5849148418491484, + "grad_norm": 1.5310450046883748, + "learning_rate": 3.877958075905761e-06, + "loss": 0.4533, + "step": 6010 + }, + { + "epoch": 0.5850121654501217, + "grad_norm": 1.4218251984646402, + "learning_rate": 3.876422175964632e-06, + "loss": 0.4838, + "step": 6011 + }, + { + "epoch": 0.5851094890510949, + "grad_norm": 1.6683972275141359, + "learning_rate": 3.874886387674124e-06, + "loss": 0.4184, + "step": 6012 + }, + { + "epoch": 0.5852068126520681, + "grad_norm": 1.5049663027922862, + "learning_rate": 3.873350711186845e-06, + "loss": 0.4361, + "step": 6013 + }, + { + "epoch": 0.5853041362530413, + "grad_norm": 1.4013598717687963, + "learning_rate": 3.871815146655398e-06, + "loss": 0.3404, + "step": 6014 + }, + { + "epoch": 0.5854014598540146, + "grad_norm": 1.6941280886111705, + "learning_rate": 3.870279694232374e-06, + "loss": 0.4737, + "step": 6015 + }, + { + "epoch": 0.5854987834549878, + "grad_norm": 1.6982774253729849, + "learning_rate": 3.868744354070351e-06, + "loss": 0.5216, + "step": 6016 + }, + { + "epoch": 0.5855961070559611, + "grad_norm": 1.3167767102494852, + "learning_rate": 3.8672091263218965e-06, + "loss": 0.3244, + "step": 6017 + }, + { + "epoch": 0.5856934306569344, + "grad_norm": 1.0807635320998275, + "learning_rate": 3.865674011139567e-06, + "loss": 0.2452, + "step": 6018 + }, + { + "epoch": 0.5857907542579075, + "grad_norm": 1.467439269542744, + "learning_rate": 3.8641390086759095e-06, + "loss": 0.2768, + "step": 6019 + }, + { + "epoch": 0.5858880778588808, + "grad_norm": 1.5529447209135872, + "learning_rate": 3.862604119083456e-06, + "loss": 0.3467, + "step": 6020 + }, + { + "epoch": 0.585985401459854, + "grad_norm": 1.3118746493894737, + "learning_rate": 3.86106934251473e-06, + "loss": 0.3916, + "step": 6021 + }, + { + "epoch": 0.5860827250608273, + "grad_norm": 1.594330476253586, + "learning_rate": 3.859534679122244e-06, + "loss": 0.4206, + "step": 6022 + }, + { + "epoch": 0.5861800486618005, + "grad_norm": 1.6629647946097874, + "learning_rate": 3.8580001290585004e-06, + "loss": 0.5061, + "step": 6023 + }, + { + "epoch": 0.5862773722627738, + "grad_norm": 1.5447139865364772, + "learning_rate": 3.8564656924759824e-06, + "loss": 0.3372, + "step": 6024 + }, + { + "epoch": 0.5863746958637469, + "grad_norm": 1.4120860750800777, + "learning_rate": 3.854931369527172e-06, + "loss": 0.3227, + "step": 6025 + }, + { + "epoch": 0.5864720194647202, + "grad_norm": 1.4444649350514327, + "learning_rate": 3.853397160364537e-06, + "loss": 0.3111, + "step": 6026 + }, + { + "epoch": 0.5865693430656934, + "grad_norm": 2.2801677279097525, + "learning_rate": 3.851863065140528e-06, + "loss": 0.354, + "step": 6027 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.1520906849374228, + "learning_rate": 3.850329084007594e-06, + "loss": 0.2854, + "step": 6028 + }, + { + "epoch": 0.5867639902676399, + "grad_norm": 1.6821634186635506, + "learning_rate": 3.8487952171181656e-06, + "loss": 0.5804, + "step": 6029 + }, + { + "epoch": 0.5868613138686132, + "grad_norm": 1.7129198548140288, + "learning_rate": 3.8472614646246635e-06, + "loss": 0.3781, + "step": 6030 + }, + { + "epoch": 0.5869586374695863, + "grad_norm": 1.3054619049860754, + "learning_rate": 3.8457278266794985e-06, + "loss": 0.3634, + "step": 6031 + }, + { + "epoch": 0.5870559610705596, + "grad_norm": 1.5114654407645212, + "learning_rate": 3.844194303435068e-06, + "loss": 0.4674, + "step": 6032 + }, + { + "epoch": 0.5871532846715328, + "grad_norm": 1.463934043696164, + "learning_rate": 3.842660895043763e-06, + "loss": 0.3581, + "step": 6033 + }, + { + "epoch": 0.5872506082725061, + "grad_norm": 1.5736029771843554, + "learning_rate": 3.841127601657952e-06, + "loss": 0.489, + "step": 6034 + }, + { + "epoch": 0.5873479318734793, + "grad_norm": 1.5881308436070423, + "learning_rate": 3.839594423430006e-06, + "loss": 0.4743, + "step": 6035 + }, + { + "epoch": 0.5874452554744526, + "grad_norm": 1.2930329643968506, + "learning_rate": 3.838061360512273e-06, + "loss": 0.3213, + "step": 6036 + }, + { + "epoch": 0.5875425790754258, + "grad_norm": 1.5166564263357916, + "learning_rate": 3.8365284130571e-06, + "loss": 0.3018, + "step": 6037 + }, + { + "epoch": 0.587639902676399, + "grad_norm": 1.6269815540921038, + "learning_rate": 3.834995581216812e-06, + "loss": 0.437, + "step": 6038 + }, + { + "epoch": 0.5877372262773722, + "grad_norm": 1.526687051617541, + "learning_rate": 3.833462865143729e-06, + "loss": 0.4139, + "step": 6039 + }, + { + "epoch": 0.5878345498783455, + "grad_norm": 1.6329604086436111, + "learning_rate": 3.831930264990159e-06, + "loss": 0.4825, + "step": 6040 + }, + { + "epoch": 0.5879318734793187, + "grad_norm": 1.4586937958416275, + "learning_rate": 3.830397780908396e-06, + "loss": 0.4184, + "step": 6041 + }, + { + "epoch": 0.588029197080292, + "grad_norm": 1.0949958789869823, + "learning_rate": 3.828865413050724e-06, + "loss": 0.2606, + "step": 6042 + }, + { + "epoch": 0.5881265206812653, + "grad_norm": 1.706951916420232, + "learning_rate": 3.827333161569416e-06, + "loss": 0.5538, + "step": 6043 + }, + { + "epoch": 0.5882238442822384, + "grad_norm": 1.2414312399767005, + "learning_rate": 3.825801026616735e-06, + "loss": 0.276, + "step": 6044 + }, + { + "epoch": 0.5883211678832116, + "grad_norm": 1.4168850038610261, + "learning_rate": 3.824269008344925e-06, + "loss": 0.3251, + "step": 6045 + }, + { + "epoch": 0.5884184914841849, + "grad_norm": 1.6149522749518992, + "learning_rate": 3.822737106906226e-06, + "loss": 0.4471, + "step": 6046 + }, + { + "epoch": 0.5885158150851582, + "grad_norm": 1.6257326968494004, + "learning_rate": 3.821205322452863e-06, + "loss": 0.5379, + "step": 6047 + }, + { + "epoch": 0.5886131386861314, + "grad_norm": 1.6789960990710493, + "learning_rate": 3.819673655137056e-06, + "loss": 0.5337, + "step": 6048 + }, + { + "epoch": 0.5887104622871047, + "grad_norm": 1.5794864485093818, + "learning_rate": 3.818142105111e-06, + "loss": 0.4465, + "step": 6049 + }, + { + "epoch": 0.5888077858880778, + "grad_norm": 1.3444717597988916, + "learning_rate": 3.816610672526891e-06, + "loss": 0.4042, + "step": 6050 + }, + { + "epoch": 0.5889051094890511, + "grad_norm": 1.554026093999093, + "learning_rate": 3.815079357536907e-06, + "loss": 0.3479, + "step": 6051 + }, + { + "epoch": 0.5890024330900243, + "grad_norm": 1.5256975673496376, + "learning_rate": 3.813548160293214e-06, + "loss": 0.363, + "step": 6052 + }, + { + "epoch": 0.5890997566909976, + "grad_norm": 1.5990062130729201, + "learning_rate": 3.8120170809479703e-06, + "loss": 0.517, + "step": 6053 + }, + { + "epoch": 0.5891970802919708, + "grad_norm": 1.3862958236289114, + "learning_rate": 3.810486119653319e-06, + "loss": 0.3521, + "step": 6054 + }, + { + "epoch": 0.5892944038929441, + "grad_norm": 1.5595567066074474, + "learning_rate": 3.808955276561396e-06, + "loss": 0.4872, + "step": 6055 + }, + { + "epoch": 0.5893917274939173, + "grad_norm": 1.4942331721746605, + "learning_rate": 3.807424551824316e-06, + "loss": 0.4232, + "step": 6056 + }, + { + "epoch": 0.5894890510948905, + "grad_norm": 1.6271760016251753, + "learning_rate": 3.805893945594191e-06, + "loss": 0.1589, + "step": 6057 + }, + { + "epoch": 0.5895863746958637, + "grad_norm": 1.259542850639245, + "learning_rate": 3.804363458023119e-06, + "loss": 0.3037, + "step": 6058 + }, + { + "epoch": 0.589683698296837, + "grad_norm": 2.0986186116245333, + "learning_rate": 3.8028330892631883e-06, + "loss": 0.4256, + "step": 6059 + }, + { + "epoch": 0.5897810218978102, + "grad_norm": 1.1096117008912445, + "learning_rate": 3.8013028394664663e-06, + "loss": 0.221, + "step": 6060 + }, + { + "epoch": 0.5898783454987835, + "grad_norm": 1.5376999143562857, + "learning_rate": 3.7997727087850184e-06, + "loss": 0.4521, + "step": 6061 + }, + { + "epoch": 0.5899756690997567, + "grad_norm": 1.4971816288421493, + "learning_rate": 3.7982426973708947e-06, + "loss": 0.3751, + "step": 6062 + }, + { + "epoch": 0.5900729927007299, + "grad_norm": 1.4982052238160424, + "learning_rate": 3.796712805376132e-06, + "loss": 0.3096, + "step": 6063 + }, + { + "epoch": 0.5901703163017031, + "grad_norm": 1.7366305968528648, + "learning_rate": 3.795183032952758e-06, + "loss": 0.3385, + "step": 6064 + }, + { + "epoch": 0.5902676399026764, + "grad_norm": 1.4992425118571515, + "learning_rate": 3.793653380252786e-06, + "loss": 0.3124, + "step": 6065 + }, + { + "epoch": 0.5903649635036496, + "grad_norm": 1.7940707674219727, + "learning_rate": 3.7921238474282208e-06, + "loss": 0.5323, + "step": 6066 + }, + { + "epoch": 0.5904622871046229, + "grad_norm": 1.3390490786081235, + "learning_rate": 3.7905944346310485e-06, + "loss": 0.3371, + "step": 6067 + }, + { + "epoch": 0.5905596107055961, + "grad_norm": 1.3833066843905326, + "learning_rate": 3.7890651420132517e-06, + "loss": 0.3587, + "step": 6068 + }, + { + "epoch": 0.5906569343065693, + "grad_norm": 1.3927231294487479, + "learning_rate": 3.7875359697267967e-06, + "loss": 0.3751, + "step": 6069 + }, + { + "epoch": 0.5907542579075425, + "grad_norm": 1.7130057319059748, + "learning_rate": 3.7860069179236393e-06, + "loss": 0.5281, + "step": 6070 + }, + { + "epoch": 0.5908515815085158, + "grad_norm": 1.3074475179026124, + "learning_rate": 3.784477986755718e-06, + "loss": 0.2435, + "step": 6071 + }, + { + "epoch": 0.590948905109489, + "grad_norm": 1.5623722024152904, + "learning_rate": 3.7829491763749666e-06, + "loss": 0.4698, + "step": 6072 + }, + { + "epoch": 0.5910462287104623, + "grad_norm": 1.3230466803794698, + "learning_rate": 3.781420486933305e-06, + "loss": 0.3305, + "step": 6073 + }, + { + "epoch": 0.5911435523114356, + "grad_norm": 1.5272031753513022, + "learning_rate": 3.7798919185826364e-06, + "loss": 0.503, + "step": 6074 + }, + { + "epoch": 0.5912408759124088, + "grad_norm": 1.5784992491871666, + "learning_rate": 3.7783634714748592e-06, + "loss": 0.5169, + "step": 6075 + }, + { + "epoch": 0.591338199513382, + "grad_norm": 1.5153655783840159, + "learning_rate": 3.776835145761854e-06, + "loss": 0.3641, + "step": 6076 + }, + { + "epoch": 0.5914355231143552, + "grad_norm": 1.7743615923163676, + "learning_rate": 3.7753069415954936e-06, + "loss": 0.3779, + "step": 6077 + }, + { + "epoch": 0.5915328467153285, + "grad_norm": 1.2694613862988158, + "learning_rate": 3.7737788591276337e-06, + "loss": 0.2303, + "step": 6078 + }, + { + "epoch": 0.5916301703163017, + "grad_norm": 1.663601878714137, + "learning_rate": 3.7722508985101225e-06, + "loss": 0.3174, + "step": 6079 + }, + { + "epoch": 0.591727493917275, + "grad_norm": 1.7689112235953703, + "learning_rate": 3.7707230598947964e-06, + "loss": 0.5926, + "step": 6080 + }, + { + "epoch": 0.5918248175182482, + "grad_norm": 1.4135407386201395, + "learning_rate": 3.769195343433473e-06, + "loss": 0.4129, + "step": 6081 + }, + { + "epoch": 0.5919221411192214, + "grad_norm": 1.5315687665205964, + "learning_rate": 3.767667749277965e-06, + "loss": 0.4421, + "step": 6082 + }, + { + "epoch": 0.5920194647201946, + "grad_norm": 1.5304124838362247, + "learning_rate": 3.7661402775800703e-06, + "loss": 0.4755, + "step": 6083 + }, + { + "epoch": 0.5921167883211679, + "grad_norm": 1.5824857082595416, + "learning_rate": 3.7646129284915754e-06, + "loss": 0.4987, + "step": 6084 + }, + { + "epoch": 0.5922141119221411, + "grad_norm": 1.4836716441134765, + "learning_rate": 3.763085702164252e-06, + "loss": 0.3379, + "step": 6085 + }, + { + "epoch": 0.5923114355231144, + "grad_norm": 1.480465529143924, + "learning_rate": 3.7615585987498627e-06, + "loss": 0.395, + "step": 6086 + }, + { + "epoch": 0.5924087591240876, + "grad_norm": 1.3348532398149553, + "learning_rate": 3.760031618400157e-06, + "loss": 0.1832, + "step": 6087 + }, + { + "epoch": 0.5925060827250608, + "grad_norm": 1.23992401805474, + "learning_rate": 3.7585047612668725e-06, + "loss": 0.2975, + "step": 6088 + }, + { + "epoch": 0.592603406326034, + "grad_norm": 1.438004612465625, + "learning_rate": 3.7569780275017313e-06, + "loss": 0.4668, + "step": 6089 + }, + { + "epoch": 0.5927007299270073, + "grad_norm": 1.2096369496620847, + "learning_rate": 3.7554514172564483e-06, + "loss": 0.2963, + "step": 6090 + }, + { + "epoch": 0.5927980535279805, + "grad_norm": 1.505986189790302, + "learning_rate": 3.753924930682725e-06, + "loss": 0.3663, + "step": 6091 + }, + { + "epoch": 0.5928953771289538, + "grad_norm": 1.5892521837839675, + "learning_rate": 3.752398567932245e-06, + "loss": 0.4481, + "step": 6092 + }, + { + "epoch": 0.592992700729927, + "grad_norm": 1.15344950754204, + "learning_rate": 3.7508723291566857e-06, + "loss": 0.2643, + "step": 6093 + }, + { + "epoch": 0.5930900243309002, + "grad_norm": 1.5285078134783145, + "learning_rate": 3.7493462145077107e-06, + "loss": 0.5585, + "step": 6094 + }, + { + "epoch": 0.5931873479318734, + "grad_norm": 1.1830124639401085, + "learning_rate": 3.7478202241369733e-06, + "loss": 0.2962, + "step": 6095 + }, + { + "epoch": 0.5932846715328467, + "grad_norm": 1.7168858909747537, + "learning_rate": 3.7462943581961077e-06, + "loss": 0.5145, + "step": 6096 + }, + { + "epoch": 0.59338199513382, + "grad_norm": 1.662499495197879, + "learning_rate": 3.7447686168367426e-06, + "loss": 0.5683, + "step": 6097 + }, + { + "epoch": 0.5934793187347932, + "grad_norm": 1.1240873343487263, + "learning_rate": 3.743243000210493e-06, + "loss": 0.2461, + "step": 6098 + }, + { + "epoch": 0.5935766423357665, + "grad_norm": 1.2828864834786387, + "learning_rate": 3.7417175084689573e-06, + "loss": 0.31, + "step": 6099 + }, + { + "epoch": 0.5936739659367397, + "grad_norm": 1.48494597044505, + "learning_rate": 3.7401921417637264e-06, + "loss": 0.3407, + "step": 6100 + }, + { + "epoch": 0.5937712895377129, + "grad_norm": 1.658622194496362, + "learning_rate": 3.738666900246377e-06, + "loss": 0.4679, + "step": 6101 + }, + { + "epoch": 0.5938686131386861, + "grad_norm": 1.2506596255334679, + "learning_rate": 3.7371417840684756e-06, + "loss": 0.3717, + "step": 6102 + }, + { + "epoch": 0.5939659367396594, + "grad_norm": 1.5335644727449282, + "learning_rate": 3.7356167933815677e-06, + "loss": 0.4834, + "step": 6103 + }, + { + "epoch": 0.5940632603406326, + "grad_norm": 1.414037124659017, + "learning_rate": 3.7340919283371974e-06, + "loss": 0.3945, + "step": 6104 + }, + { + "epoch": 0.5941605839416059, + "grad_norm": 1.7992199399598825, + "learning_rate": 3.73256718908689e-06, + "loss": 0.3953, + "step": 6105 + }, + { + "epoch": 0.5942579075425791, + "grad_norm": 1.3429696234522137, + "learning_rate": 3.731042575782161e-06, + "loss": 0.3966, + "step": 6106 + }, + { + "epoch": 0.5943552311435523, + "grad_norm": 1.7734539014419153, + "learning_rate": 3.72951808857451e-06, + "loss": 0.2219, + "step": 6107 + }, + { + "epoch": 0.5944525547445255, + "grad_norm": 1.862840260396579, + "learning_rate": 3.727993727615428e-06, + "loss": 0.2874, + "step": 6108 + }, + { + "epoch": 0.5945498783454988, + "grad_norm": 1.7581295829466408, + "learning_rate": 3.7264694930563916e-06, + "loss": 0.2755, + "step": 6109 + }, + { + "epoch": 0.594647201946472, + "grad_norm": 1.3464145855901808, + "learning_rate": 3.7249453850488635e-06, + "loss": 0.3241, + "step": 6110 + }, + { + "epoch": 0.5947445255474453, + "grad_norm": 1.5880395648168935, + "learning_rate": 3.7234214037442964e-06, + "loss": 0.5298, + "step": 6111 + }, + { + "epoch": 0.5948418491484185, + "grad_norm": 1.5626657016629657, + "learning_rate": 3.721897549294129e-06, + "loss": 0.3321, + "step": 6112 + }, + { + "epoch": 0.5949391727493917, + "grad_norm": 1.2336231307915122, + "learning_rate": 3.72037382184979e-06, + "loss": 0.3928, + "step": 6113 + }, + { + "epoch": 0.5950364963503649, + "grad_norm": 1.0442703640271636, + "learning_rate": 3.7188502215626876e-06, + "loss": 0.1625, + "step": 6114 + }, + { + "epoch": 0.5951338199513382, + "grad_norm": 1.2218465348071996, + "learning_rate": 3.7173267485842274e-06, + "loss": 0.3233, + "step": 6115 + }, + { + "epoch": 0.5952311435523114, + "grad_norm": 1.6977370543544572, + "learning_rate": 3.7158034030657973e-06, + "loss": 0.3452, + "step": 6116 + }, + { + "epoch": 0.5953284671532847, + "grad_norm": 1.3864970285168516, + "learning_rate": 3.714280185158771e-06, + "loss": 0.4554, + "step": 6117 + }, + { + "epoch": 0.5954257907542579, + "grad_norm": 1.502417682290556, + "learning_rate": 3.7127570950145132e-06, + "loss": 0.4773, + "step": 6118 + }, + { + "epoch": 0.5955231143552312, + "grad_norm": 1.370124844062712, + "learning_rate": 3.7112341327843744e-06, + "loss": 0.3619, + "step": 6119 + }, + { + "epoch": 0.5956204379562043, + "grad_norm": 1.2822973576080363, + "learning_rate": 3.7097112986196926e-06, + "loss": 0.3291, + "step": 6120 + }, + { + "epoch": 0.5957177615571776, + "grad_norm": 1.4169549181274235, + "learning_rate": 3.7081885926717908e-06, + "loss": 0.365, + "step": 6121 + }, + { + "epoch": 0.5958150851581508, + "grad_norm": 1.5093831790678773, + "learning_rate": 3.706666015091983e-06, + "loss": 0.4236, + "step": 6122 + }, + { + "epoch": 0.5959124087591241, + "grad_norm": 1.5976497584726386, + "learning_rate": 3.7051435660315682e-06, + "loss": 0.4744, + "step": 6123 + }, + { + "epoch": 0.5960097323600974, + "grad_norm": 1.1751598350897283, + "learning_rate": 3.7036212456418353e-06, + "loss": 0.2396, + "step": 6124 + }, + { + "epoch": 0.5961070559610706, + "grad_norm": 1.4597928872241048, + "learning_rate": 3.7020990540740542e-06, + "loss": 0.3955, + "step": 6125 + }, + { + "epoch": 0.5962043795620438, + "grad_norm": 1.5809562138786184, + "learning_rate": 3.7005769914794866e-06, + "loss": 0.2964, + "step": 6126 + }, + { + "epoch": 0.596301703163017, + "grad_norm": 2.128641822561641, + "learning_rate": 3.699055058009385e-06, + "loss": 0.3346, + "step": 6127 + }, + { + "epoch": 0.5963990267639903, + "grad_norm": 1.8437655173473075, + "learning_rate": 3.69753325381498e-06, + "loss": 0.3568, + "step": 6128 + }, + { + "epoch": 0.5964963503649635, + "grad_norm": 1.56867706007675, + "learning_rate": 3.696011579047496e-06, + "loss": 0.3537, + "step": 6129 + }, + { + "epoch": 0.5965936739659368, + "grad_norm": 1.3632014122787737, + "learning_rate": 3.6944900338581423e-06, + "loss": 0.4356, + "step": 6130 + }, + { + "epoch": 0.59669099756691, + "grad_norm": 1.4999014504166313, + "learning_rate": 3.6929686183981185e-06, + "loss": 0.3298, + "step": 6131 + }, + { + "epoch": 0.5967883211678832, + "grad_norm": 1.3682686155179566, + "learning_rate": 3.6914473328186045e-06, + "loss": 0.5047, + "step": 6132 + }, + { + "epoch": 0.5968856447688564, + "grad_norm": 1.365093218446463, + "learning_rate": 3.689926177270774e-06, + "loss": 0.2212, + "step": 6133 + }, + { + "epoch": 0.5969829683698297, + "grad_norm": 1.3199006290312039, + "learning_rate": 3.688405151905786e-06, + "loss": 0.3044, + "step": 6134 + }, + { + "epoch": 0.5970802919708029, + "grad_norm": 1.2308004749555046, + "learning_rate": 3.6868842568747833e-06, + "loss": 0.3186, + "step": 6135 + }, + { + "epoch": 0.5971776155717762, + "grad_norm": 1.513789294210874, + "learning_rate": 3.6853634923288966e-06, + "loss": 0.3573, + "step": 6136 + }, + { + "epoch": 0.5972749391727494, + "grad_norm": 1.347018237363608, + "learning_rate": 3.683842858419249e-06, + "loss": 0.2027, + "step": 6137 + }, + { + "epoch": 0.5973722627737226, + "grad_norm": 1.5573097236462499, + "learning_rate": 3.6823223552969483e-06, + "loss": 0.3346, + "step": 6138 + }, + { + "epoch": 0.5974695863746958, + "grad_norm": 1.2942317084290391, + "learning_rate": 3.6808019831130824e-06, + "loss": 0.2606, + "step": 6139 + }, + { + "epoch": 0.5975669099756691, + "grad_norm": 1.4921158378928883, + "learning_rate": 3.679281742018735e-06, + "loss": 0.2087, + "step": 6140 + }, + { + "epoch": 0.5976642335766423, + "grad_norm": 1.4239374649911147, + "learning_rate": 3.6777616321649723e-06, + "loss": 0.4631, + "step": 6141 + }, + { + "epoch": 0.5977615571776156, + "grad_norm": 1.6193438173543353, + "learning_rate": 3.67624165370285e-06, + "loss": 0.4785, + "step": 6142 + }, + { + "epoch": 0.5978588807785888, + "grad_norm": 1.6575105296817452, + "learning_rate": 3.6747218067834066e-06, + "loss": 0.3292, + "step": 6143 + }, + { + "epoch": 0.5979562043795621, + "grad_norm": 2.141420314092702, + "learning_rate": 3.673202091557673e-06, + "loss": 0.4711, + "step": 6144 + }, + { + "epoch": 0.5980535279805352, + "grad_norm": 1.3956600290387677, + "learning_rate": 3.671682508176664e-06, + "loss": 0.3087, + "step": 6145 + }, + { + "epoch": 0.5981508515815085, + "grad_norm": 1.567137816108438, + "learning_rate": 3.670163056791378e-06, + "loss": 0.3109, + "step": 6146 + }, + { + "epoch": 0.5982481751824817, + "grad_norm": 1.3736168233308903, + "learning_rate": 3.6686437375528072e-06, + "loss": 0.368, + "step": 6147 + }, + { + "epoch": 0.598345498783455, + "grad_norm": 1.8295882148419824, + "learning_rate": 3.667124550611927e-06, + "loss": 0.3267, + "step": 6148 + }, + { + "epoch": 0.5984428223844283, + "grad_norm": 1.4698423016876485, + "learning_rate": 3.665605496119701e-06, + "loss": 0.3553, + "step": 6149 + }, + { + "epoch": 0.5985401459854015, + "grad_norm": 2.5955540461387048, + "learning_rate": 3.6640865742270756e-06, + "loss": 0.4544, + "step": 6150 + }, + { + "epoch": 0.5986374695863746, + "grad_norm": 1.4561759340499938, + "learning_rate": 3.6625677850849877e-06, + "loss": 0.4342, + "step": 6151 + }, + { + "epoch": 0.5987347931873479, + "grad_norm": 1.4778311794275778, + "learning_rate": 3.6610491288443628e-06, + "loss": 0.3157, + "step": 6152 + }, + { + "epoch": 0.5988321167883212, + "grad_norm": 1.3844987318832855, + "learning_rate": 3.6595306056561077e-06, + "loss": 0.3598, + "step": 6153 + }, + { + "epoch": 0.5989294403892944, + "grad_norm": 1.5713692520761118, + "learning_rate": 3.6580122156711194e-06, + "loss": 0.5598, + "step": 6154 + }, + { + "epoch": 0.5990267639902677, + "grad_norm": 1.6172904014099247, + "learning_rate": 3.656493959040283e-06, + "loss": 0.3726, + "step": 6155 + }, + { + "epoch": 0.5991240875912409, + "grad_norm": 1.3148188316082874, + "learning_rate": 3.6549758359144693e-06, + "loss": 0.4045, + "step": 6156 + }, + { + "epoch": 0.5992214111922141, + "grad_norm": 1.4638324959927835, + "learning_rate": 3.6534578464445314e-06, + "loss": 0.3574, + "step": 6157 + }, + { + "epoch": 0.5993187347931873, + "grad_norm": 1.7060596485644455, + "learning_rate": 3.6519399907813148e-06, + "loss": 0.5914, + "step": 6158 + }, + { + "epoch": 0.5994160583941606, + "grad_norm": 1.2134973356911638, + "learning_rate": 3.6504222690756502e-06, + "loss": 0.296, + "step": 6159 + }, + { + "epoch": 0.5995133819951338, + "grad_norm": 1.5012102013489095, + "learning_rate": 3.648904681478357e-06, + "loss": 0.4598, + "step": 6160 + }, + { + "epoch": 0.5996107055961071, + "grad_norm": 1.668964169363922, + "learning_rate": 3.647387228140233e-06, + "loss": 0.6214, + "step": 6161 + }, + { + "epoch": 0.5997080291970803, + "grad_norm": 1.3209128735938092, + "learning_rate": 3.6458699092120718e-06, + "loss": 0.3505, + "step": 6162 + }, + { + "epoch": 0.5998053527980536, + "grad_norm": 1.432673353720695, + "learning_rate": 3.644352724844651e-06, + "loss": 0.3196, + "step": 6163 + }, + { + "epoch": 0.5999026763990267, + "grad_norm": 1.4673750914877397, + "learning_rate": 3.642835675188733e-06, + "loss": 0.4652, + "step": 6164 + }, + { + "epoch": 0.6, + "grad_norm": 1.11179399698129, + "learning_rate": 3.6413187603950672e-06, + "loss": 0.1711, + "step": 6165 + }, + { + "epoch": 0.6000973236009732, + "grad_norm": 1.1337512631814095, + "learning_rate": 3.6398019806143914e-06, + "loss": 0.2413, + "step": 6166 + }, + { + "epoch": 0.6001946472019465, + "grad_norm": 1.5778537932666719, + "learning_rate": 3.6382853359974302e-06, + "loss": 0.3535, + "step": 6167 + }, + { + "epoch": 0.6002919708029197, + "grad_norm": 1.306135276611625, + "learning_rate": 3.636768826694891e-06, + "loss": 0.3538, + "step": 6168 + }, + { + "epoch": 0.600389294403893, + "grad_norm": 0.9829387783804582, + "learning_rate": 3.635252452857471e-06, + "loss": 0.1732, + "step": 6169 + }, + { + "epoch": 0.6004866180048661, + "grad_norm": 1.4643040184046618, + "learning_rate": 3.633736214635856e-06, + "loss": 0.3299, + "step": 6170 + }, + { + "epoch": 0.6005839416058394, + "grad_norm": 1.6056317991452056, + "learning_rate": 3.632220112180711e-06, + "loss": 0.3551, + "step": 6171 + }, + { + "epoch": 0.6006812652068126, + "grad_norm": 1.4285296978986193, + "learning_rate": 3.6307041456426946e-06, + "loss": 0.3768, + "step": 6172 + }, + { + "epoch": 0.6007785888077859, + "grad_norm": 1.3940771736718718, + "learning_rate": 3.6291883151724483e-06, + "loss": 0.3949, + "step": 6173 + }, + { + "epoch": 0.6008759124087591, + "grad_norm": 1.588750139742169, + "learning_rate": 3.6276726209206026e-06, + "loss": 0.4221, + "step": 6174 + }, + { + "epoch": 0.6009732360097324, + "grad_norm": 1.8091632119521663, + "learning_rate": 3.6261570630377716e-06, + "loss": 0.2268, + "step": 6175 + }, + { + "epoch": 0.6010705596107055, + "grad_norm": 1.574034422123617, + "learning_rate": 3.6246416416745577e-06, + "loss": 0.482, + "step": 6176 + }, + { + "epoch": 0.6011678832116788, + "grad_norm": 1.4551104036373603, + "learning_rate": 3.623126356981549e-06, + "loss": 0.3554, + "step": 6177 + }, + { + "epoch": 0.601265206812652, + "grad_norm": 1.7832562040720823, + "learning_rate": 3.6216112091093215e-06, + "loss": 0.4079, + "step": 6178 + }, + { + "epoch": 0.6013625304136253, + "grad_norm": 1.482659563351729, + "learning_rate": 3.620096198208434e-06, + "loss": 0.4811, + "step": 6179 + }, + { + "epoch": 0.6014598540145986, + "grad_norm": 1.4021447340664892, + "learning_rate": 3.6185813244294355e-06, + "loss": 0.3706, + "step": 6180 + }, + { + "epoch": 0.6015571776155718, + "grad_norm": 1.4073653566318252, + "learning_rate": 3.617066587922863e-06, + "loss": 0.4801, + "step": 6181 + }, + { + "epoch": 0.601654501216545, + "grad_norm": 1.2371213275800568, + "learning_rate": 3.6155519888392306e-06, + "loss": 0.336, + "step": 6182 + }, + { + "epoch": 0.6017518248175182, + "grad_norm": 1.5131655529186325, + "learning_rate": 3.614037527329048e-06, + "loss": 0.3914, + "step": 6183 + }, + { + "epoch": 0.6018491484184915, + "grad_norm": 1.5176163794999507, + "learning_rate": 3.612523203542808e-06, + "loss": 0.478, + "step": 6184 + }, + { + "epoch": 0.6019464720194647, + "grad_norm": 1.3559598884923307, + "learning_rate": 3.611009017630992e-06, + "loss": 0.4438, + "step": 6185 + }, + { + "epoch": 0.602043795620438, + "grad_norm": 1.451247941047726, + "learning_rate": 3.609494969744062e-06, + "loss": 0.4221, + "step": 6186 + }, + { + "epoch": 0.6021411192214112, + "grad_norm": 1.4754502078128098, + "learning_rate": 3.6079810600324718e-06, + "loss": 0.2953, + "step": 6187 + }, + { + "epoch": 0.6022384428223845, + "grad_norm": 3.091204125557194, + "learning_rate": 3.606467288646659e-06, + "loss": 0.2391, + "step": 6188 + }, + { + "epoch": 0.6023357664233576, + "grad_norm": 1.2406028380444387, + "learning_rate": 3.6049536557370494e-06, + "loss": 0.2906, + "step": 6189 + }, + { + "epoch": 0.6024330900243309, + "grad_norm": 1.2436665300893865, + "learning_rate": 3.6034401614540516e-06, + "loss": 0.3346, + "step": 6190 + }, + { + "epoch": 0.6025304136253041, + "grad_norm": 1.5299762176709786, + "learning_rate": 3.601926805948063e-06, + "loss": 0.5753, + "step": 6191 + }, + { + "epoch": 0.6026277372262774, + "grad_norm": 1.2260847679908822, + "learning_rate": 3.6004135893694698e-06, + "loss": 0.2925, + "step": 6192 + }, + { + "epoch": 0.6027250608272506, + "grad_norm": 1.2302483255258292, + "learning_rate": 3.598900511868636e-06, + "loss": 0.2663, + "step": 6193 + }, + { + "epoch": 0.6028223844282239, + "grad_norm": 1.5455862514453014, + "learning_rate": 3.5973875735959196e-06, + "loss": 0.323, + "step": 6194 + }, + { + "epoch": 0.602919708029197, + "grad_norm": 1.3196179972942772, + "learning_rate": 3.595874774701661e-06, + "loss": 0.3211, + "step": 6195 + }, + { + "epoch": 0.6030170316301703, + "grad_norm": 1.4122169210969915, + "learning_rate": 3.5943621153361918e-06, + "loss": 0.4268, + "step": 6196 + }, + { + "epoch": 0.6031143552311435, + "grad_norm": 1.7083449557149384, + "learning_rate": 3.592849595649822e-06, + "loss": 0.7361, + "step": 6197 + }, + { + "epoch": 0.6032116788321168, + "grad_norm": 1.3428967450485307, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.2302, + "step": 6198 + }, + { + "epoch": 0.60330900243309, + "grad_norm": 1.076313874833649, + "learning_rate": 3.58982497591557e-06, + "loss": 0.2554, + "step": 6199 + }, + { + "epoch": 0.6034063260340633, + "grad_norm": 1.650622542298931, + "learning_rate": 3.5883128761682454e-06, + "loss": 0.5339, + "step": 6200 + }, + { + "epoch": 0.6035036496350364, + "grad_norm": 1.3880897299705472, + "learning_rate": 3.5868009167011388e-06, + "loss": 0.3218, + "step": 6201 + }, + { + "epoch": 0.6036009732360097, + "grad_norm": 1.5121167666846589, + "learning_rate": 3.5852890976644935e-06, + "loss": 0.4513, + "step": 6202 + }, + { + "epoch": 0.603698296836983, + "grad_norm": 1.4633278288915132, + "learning_rate": 3.583777419208542e-06, + "loss": 0.5499, + "step": 6203 + }, + { + "epoch": 0.6037956204379562, + "grad_norm": 1.4117203172549408, + "learning_rate": 3.5822658814834964e-06, + "loss": 0.3729, + "step": 6204 + }, + { + "epoch": 0.6038929440389295, + "grad_norm": 1.4710303926524353, + "learning_rate": 3.5807544846395613e-06, + "loss": 0.4537, + "step": 6205 + }, + { + "epoch": 0.6039902676399027, + "grad_norm": 1.4998656848535865, + "learning_rate": 3.579243228826926e-06, + "loss": 0.3768, + "step": 6206 + }, + { + "epoch": 0.604087591240876, + "grad_norm": 1.3493881817200788, + "learning_rate": 3.5777321141957666e-06, + "loss": 0.2518, + "step": 6207 + }, + { + "epoch": 0.6041849148418491, + "grad_norm": 1.4274298347024477, + "learning_rate": 3.576221140896239e-06, + "loss": 0.3759, + "step": 6208 + }, + { + "epoch": 0.6042822384428224, + "grad_norm": 1.4704630175742153, + "learning_rate": 3.574710309078492e-06, + "loss": 0.4538, + "step": 6209 + }, + { + "epoch": 0.6043795620437956, + "grad_norm": 1.4107483019875193, + "learning_rate": 3.5731996188926584e-06, + "loss": 0.3414, + "step": 6210 + }, + { + "epoch": 0.6044768856447689, + "grad_norm": 1.470750866199934, + "learning_rate": 3.571689070488854e-06, + "loss": 0.4459, + "step": 6211 + }, + { + "epoch": 0.6045742092457421, + "grad_norm": 1.3873175819009333, + "learning_rate": 3.5701786640171853e-06, + "loss": 0.2683, + "step": 6212 + }, + { + "epoch": 0.6046715328467154, + "grad_norm": 2.2515251500713487, + "learning_rate": 3.5686683996277417e-06, + "loss": 0.5125, + "step": 6213 + }, + { + "epoch": 0.6047688564476885, + "grad_norm": 1.6249584045955299, + "learning_rate": 3.567158277470601e-06, + "loss": 0.5648, + "step": 6214 + }, + { + "epoch": 0.6048661800486618, + "grad_norm": 1.3437310594510203, + "learning_rate": 3.5656482976958206e-06, + "loss": 0.3504, + "step": 6215 + }, + { + "epoch": 0.604963503649635, + "grad_norm": 1.311697033082678, + "learning_rate": 3.5641384604534503e-06, + "loss": 0.3048, + "step": 6216 + }, + { + "epoch": 0.6050608272506083, + "grad_norm": 1.668230880401363, + "learning_rate": 3.5626287658935254e-06, + "loss": 0.516, + "step": 6217 + }, + { + "epoch": 0.6051581508515815, + "grad_norm": 1.4794528666391957, + "learning_rate": 3.561119214166062e-06, + "loss": 0.4903, + "step": 6218 + }, + { + "epoch": 0.6052554744525548, + "grad_norm": 1.4771961285666158, + "learning_rate": 3.559609805421067e-06, + "loss": 0.287, + "step": 6219 + }, + { + "epoch": 0.6053527980535279, + "grad_norm": 1.328846724139663, + "learning_rate": 3.5581005398085313e-06, + "loss": 0.38, + "step": 6220 + }, + { + "epoch": 0.6054501216545012, + "grad_norm": 1.2685308182535933, + "learning_rate": 3.5565914174784322e-06, + "loss": 0.3403, + "step": 6221 + }, + { + "epoch": 0.6055474452554744, + "grad_norm": 1.3275299534408687, + "learning_rate": 3.5550824385807293e-06, + "loss": 0.4229, + "step": 6222 + }, + { + "epoch": 0.6056447688564477, + "grad_norm": 1.6401471123191445, + "learning_rate": 3.5535736032653735e-06, + "loss": 0.4136, + "step": 6223 + }, + { + "epoch": 0.6057420924574209, + "grad_norm": 1.5123942440671032, + "learning_rate": 3.552064911682297e-06, + "loss": 0.5184, + "step": 6224 + }, + { + "epoch": 0.6058394160583942, + "grad_norm": 1.5317638858385545, + "learning_rate": 3.550556363981422e-06, + "loss": 0.5174, + "step": 6225 + }, + { + "epoch": 0.6059367396593675, + "grad_norm": 1.5048847136587602, + "learning_rate": 3.5490479603126498e-06, + "loss": 0.4189, + "step": 6226 + }, + { + "epoch": 0.6060340632603406, + "grad_norm": 1.4332614274433582, + "learning_rate": 3.5475397008258744e-06, + "loss": 0.5102, + "step": 6227 + }, + { + "epoch": 0.6061313868613138, + "grad_norm": 1.5056757833579093, + "learning_rate": 3.546031585670974e-06, + "loss": 0.5698, + "step": 6228 + }, + { + "epoch": 0.6062287104622871, + "grad_norm": 1.3338492507973077, + "learning_rate": 3.5445236149978057e-06, + "loss": 0.2297, + "step": 6229 + }, + { + "epoch": 0.6063260340632604, + "grad_norm": 2.036144053476165, + "learning_rate": 3.5430157889562213e-06, + "loss": 0.4453, + "step": 6230 + }, + { + "epoch": 0.6064233576642336, + "grad_norm": 1.6536445998139675, + "learning_rate": 3.541508107696053e-06, + "loss": 0.5028, + "step": 6231 + }, + { + "epoch": 0.6065206812652069, + "grad_norm": 1.2233850908807473, + "learning_rate": 3.5400005713671215e-06, + "loss": 0.2803, + "step": 6232 + }, + { + "epoch": 0.60661800486618, + "grad_norm": 1.3361003397249438, + "learning_rate": 3.53849318011923e-06, + "loss": 0.3615, + "step": 6233 + }, + { + "epoch": 0.6067153284671533, + "grad_norm": 8.054935662399211, + "learning_rate": 3.536985934102169e-06, + "loss": 0.4064, + "step": 6234 + }, + { + "epoch": 0.6068126520681265, + "grad_norm": 1.484965156891365, + "learning_rate": 3.5354788334657174e-06, + "loss": 0.5168, + "step": 6235 + }, + { + "epoch": 0.6069099756690998, + "grad_norm": 1.2066530295562112, + "learning_rate": 3.533971878359631e-06, + "loss": 0.2411, + "step": 6236 + }, + { + "epoch": 0.607007299270073, + "grad_norm": 1.4672636262199528, + "learning_rate": 3.532465068933661e-06, + "loss": 0.3032, + "step": 6237 + }, + { + "epoch": 0.6071046228710463, + "grad_norm": 1.3348149273491243, + "learning_rate": 3.530958405337539e-06, + "loss": 0.3891, + "step": 6238 + }, + { + "epoch": 0.6072019464720194, + "grad_norm": 1.2659906871567623, + "learning_rate": 3.529451887720986e-06, + "loss": 0.3701, + "step": 6239 + }, + { + "epoch": 0.6072992700729927, + "grad_norm": 1.4119732423116556, + "learning_rate": 3.5279455162337007e-06, + "loss": 0.4224, + "step": 6240 + }, + { + "epoch": 0.6073965936739659, + "grad_norm": 1.4963274342474535, + "learning_rate": 3.526439291025373e-06, + "loss": 0.3888, + "step": 6241 + }, + { + "epoch": 0.6074939172749392, + "grad_norm": 1.2622868820698283, + "learning_rate": 3.5249332122456803e-06, + "loss": 0.3492, + "step": 6242 + }, + { + "epoch": 0.6075912408759124, + "grad_norm": 1.4096067419917093, + "learning_rate": 3.523427280044281e-06, + "loss": 0.3487, + "step": 6243 + }, + { + "epoch": 0.6076885644768857, + "grad_norm": 1.5095705841534075, + "learning_rate": 3.5219214945708193e-06, + "loss": 0.4579, + "step": 6244 + }, + { + "epoch": 0.6077858880778588, + "grad_norm": 1.3840115623377025, + "learning_rate": 3.520415855974928e-06, + "loss": 0.3845, + "step": 6245 + }, + { + "epoch": 0.6078832116788321, + "grad_norm": 1.2929143519641557, + "learning_rate": 3.518910364406223e-06, + "loss": 0.3106, + "step": 6246 + }, + { + "epoch": 0.6079805352798053, + "grad_norm": 1.8484000710884219, + "learning_rate": 3.517405020014304e-06, + "loss": 0.4175, + "step": 6247 + }, + { + "epoch": 0.6080778588807786, + "grad_norm": 1.578142675923983, + "learning_rate": 3.51589982294876e-06, + "loss": 0.5108, + "step": 6248 + }, + { + "epoch": 0.6081751824817518, + "grad_norm": 1.6264769371313765, + "learning_rate": 3.5143947733591633e-06, + "loss": 0.5404, + "step": 6249 + }, + { + "epoch": 0.6082725060827251, + "grad_norm": 1.3617419540170375, + "learning_rate": 3.512889871395072e-06, + "loss": 0.3561, + "step": 6250 + }, + { + "epoch": 0.6083698296836983, + "grad_norm": 1.318331705438804, + "learning_rate": 3.511385117206027e-06, + "loss": 0.3393, + "step": 6251 + }, + { + "epoch": 0.6084671532846715, + "grad_norm": 1.4329370142956308, + "learning_rate": 3.509880510941558e-06, + "loss": 0.3546, + "step": 6252 + }, + { + "epoch": 0.6085644768856447, + "grad_norm": 1.2462490068774155, + "learning_rate": 3.508376052751179e-06, + "loss": 0.3055, + "step": 6253 + }, + { + "epoch": 0.608661800486618, + "grad_norm": 1.6540416173807684, + "learning_rate": 3.5068717427843873e-06, + "loss": 0.446, + "step": 6254 + }, + { + "epoch": 0.6087591240875913, + "grad_norm": 1.4402205389737914, + "learning_rate": 3.5053675811906683e-06, + "loss": 0.3518, + "step": 6255 + }, + { + "epoch": 0.6088564476885645, + "grad_norm": 0.9093126963872747, + "learning_rate": 3.5038635681194922e-06, + "loss": 0.2299, + "step": 6256 + }, + { + "epoch": 0.6089537712895378, + "grad_norm": 1.3714844671350945, + "learning_rate": 3.502359703720313e-06, + "loss": 0.4417, + "step": 6257 + }, + { + "epoch": 0.6090510948905109, + "grad_norm": 1.3195576513256702, + "learning_rate": 3.5008559881425703e-06, + "loss": 0.2737, + "step": 6258 + }, + { + "epoch": 0.6091484184914842, + "grad_norm": 1.5278386633714716, + "learning_rate": 3.499352421535689e-06, + "loss": 0.43, + "step": 6259 + }, + { + "epoch": 0.6092457420924574, + "grad_norm": 1.1993787826470486, + "learning_rate": 3.49784900404908e-06, + "loss": 0.3391, + "step": 6260 + }, + { + "epoch": 0.6093430656934307, + "grad_norm": 1.3114718997284942, + "learning_rate": 3.4963457358321416e-06, + "loss": 0.1708, + "step": 6261 + }, + { + "epoch": 0.6094403892944039, + "grad_norm": 1.4589053621562125, + "learning_rate": 3.494842617034249e-06, + "loss": 0.2967, + "step": 6262 + }, + { + "epoch": 0.6095377128953772, + "grad_norm": 1.7212200102898008, + "learning_rate": 3.4933396478047702e-06, + "loss": 0.3529, + "step": 6263 + }, + { + "epoch": 0.6096350364963503, + "grad_norm": 1.4633618806619277, + "learning_rate": 3.491836828293058e-06, + "loss": 0.4818, + "step": 6264 + }, + { + "epoch": 0.6097323600973236, + "grad_norm": 1.561796106099118, + "learning_rate": 3.4903341586484464e-06, + "loss": 0.3899, + "step": 6265 + }, + { + "epoch": 0.6098296836982968, + "grad_norm": 1.3723783642732574, + "learning_rate": 3.4888316390202577e-06, + "loss": 0.391, + "step": 6266 + }, + { + "epoch": 0.6099270072992701, + "grad_norm": 1.5967327704048786, + "learning_rate": 3.487329269557797e-06, + "loss": 0.5419, + "step": 6267 + }, + { + "epoch": 0.6100243309002433, + "grad_norm": 1.5516854443969215, + "learning_rate": 3.485827050410358e-06, + "loss": 0.3598, + "step": 6268 + }, + { + "epoch": 0.6101216545012166, + "grad_norm": 1.4569098291607396, + "learning_rate": 3.484324981727215e-06, + "loss": 0.4801, + "step": 6269 + }, + { + "epoch": 0.6102189781021898, + "grad_norm": 1.4186526664249823, + "learning_rate": 3.48282306365763e-06, + "loss": 0.4003, + "step": 6270 + }, + { + "epoch": 0.610316301703163, + "grad_norm": 1.1813037234406614, + "learning_rate": 3.4813212963508514e-06, + "loss": 0.3108, + "step": 6271 + }, + { + "epoch": 0.6104136253041362, + "grad_norm": 1.5961683288567687, + "learning_rate": 3.4798196799561067e-06, + "loss": 0.3287, + "step": 6272 + }, + { + "epoch": 0.6105109489051095, + "grad_norm": 1.3929563546391415, + "learning_rate": 3.478318214622616e-06, + "loss": 0.3976, + "step": 6273 + }, + { + "epoch": 0.6106082725060827, + "grad_norm": 1.5244730242140714, + "learning_rate": 3.476816900499578e-06, + "loss": 0.2579, + "step": 6274 + }, + { + "epoch": 0.610705596107056, + "grad_norm": 1.4662098935521044, + "learning_rate": 3.4753157377361837e-06, + "loss": 0.2799, + "step": 6275 + }, + { + "epoch": 0.6108029197080292, + "grad_norm": 1.4801693414773403, + "learning_rate": 3.473814726481599e-06, + "loss": 0.2896, + "step": 6276 + }, + { + "epoch": 0.6109002433090024, + "grad_norm": 1.5415807734438165, + "learning_rate": 3.4723138668849837e-06, + "loss": 0.3406, + "step": 6277 + }, + { + "epoch": 0.6109975669099756, + "grad_norm": 1.413446450265214, + "learning_rate": 3.4708131590954787e-06, + "loss": 0.1766, + "step": 6278 + }, + { + "epoch": 0.6110948905109489, + "grad_norm": 1.6088417428984825, + "learning_rate": 3.4693126032622105e-06, + "loss": 0.5242, + "step": 6279 + }, + { + "epoch": 0.6111922141119221, + "grad_norm": 1.441357619542961, + "learning_rate": 3.467812199534289e-06, + "loss": 0.4387, + "step": 6280 + }, + { + "epoch": 0.6112895377128954, + "grad_norm": 1.4570273227996673, + "learning_rate": 3.466311948060811e-06, + "loss": 0.4021, + "step": 6281 + }, + { + "epoch": 0.6113868613138687, + "grad_norm": 1.4930653810793868, + "learning_rate": 3.4648118489908588e-06, + "loss": 0.4825, + "step": 6282 + }, + { + "epoch": 0.6114841849148418, + "grad_norm": 1.571461726743023, + "learning_rate": 3.4633119024734958e-06, + "loss": 0.5449, + "step": 6283 + }, + { + "epoch": 0.611581508515815, + "grad_norm": 1.2925223879470797, + "learning_rate": 3.4618121086577727e-06, + "loss": 0.3311, + "step": 6284 + }, + { + "epoch": 0.6116788321167883, + "grad_norm": 1.6121883061512903, + "learning_rate": 3.4603124676927257e-06, + "loss": 0.5423, + "step": 6285 + }, + { + "epoch": 0.6117761557177616, + "grad_norm": 1.3835274092947698, + "learning_rate": 3.4588129797273773e-06, + "loss": 0.4545, + "step": 6286 + }, + { + "epoch": 0.6118734793187348, + "grad_norm": 1.6117108624515402, + "learning_rate": 3.4573136449107293e-06, + "loss": 0.2824, + "step": 6287 + }, + { + "epoch": 0.6119708029197081, + "grad_norm": 1.5783400436280206, + "learning_rate": 3.455814463391771e-06, + "loss": 0.5238, + "step": 6288 + }, + { + "epoch": 0.6120681265206812, + "grad_norm": 1.4494124537568134, + "learning_rate": 3.4543154353194812e-06, + "loss": 0.3216, + "step": 6289 + }, + { + "epoch": 0.6121654501216545, + "grad_norm": 1.4527986221487594, + "learning_rate": 3.4528165608428153e-06, + "loss": 0.3118, + "step": 6290 + }, + { + "epoch": 0.6122627737226277, + "grad_norm": 1.505068084976367, + "learning_rate": 3.4513178401107184e-06, + "loss": 0.2119, + "step": 6291 + }, + { + "epoch": 0.612360097323601, + "grad_norm": 1.3567726393948163, + "learning_rate": 3.44981927327212e-06, + "loss": 0.3592, + "step": 6292 + }, + { + "epoch": 0.6124574209245742, + "grad_norm": 1.5213220237915581, + "learning_rate": 3.448320860475934e-06, + "loss": 0.5401, + "step": 6293 + }, + { + "epoch": 0.6125547445255475, + "grad_norm": 1.8487148357397571, + "learning_rate": 3.4468226018710577e-06, + "loss": 0.4019, + "step": 6294 + }, + { + "epoch": 0.6126520681265207, + "grad_norm": 1.2125102996388717, + "learning_rate": 3.445324497606372e-06, + "loss": 0.2799, + "step": 6295 + }, + { + "epoch": 0.6127493917274939, + "grad_norm": 1.3350068358628495, + "learning_rate": 3.4438265478307477e-06, + "loss": 0.3789, + "step": 6296 + }, + { + "epoch": 0.6128467153284671, + "grad_norm": 1.5770562206979775, + "learning_rate": 3.4423287526930383e-06, + "loss": 0.5942, + "step": 6297 + }, + { + "epoch": 0.6129440389294404, + "grad_norm": 1.41334127506388, + "learning_rate": 3.4408311123420755e-06, + "loss": 0.2732, + "step": 6298 + }, + { + "epoch": 0.6130413625304136, + "grad_norm": 1.649309695946923, + "learning_rate": 3.4393336269266837e-06, + "loss": 0.2456, + "step": 6299 + }, + { + "epoch": 0.6131386861313869, + "grad_norm": 1.2926830425549862, + "learning_rate": 3.4378362965956695e-06, + "loss": 0.3859, + "step": 6300 + }, + { + "epoch": 0.6132360097323601, + "grad_norm": 1.499433012933645, + "learning_rate": 3.436339121497822e-06, + "loss": 0.4412, + "step": 6301 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 1.4107012342846055, + "learning_rate": 3.4348421017819167e-06, + "loss": 0.4779, + "step": 6302 + }, + { + "epoch": 0.6134306569343065, + "grad_norm": 1.4620967080711322, + "learning_rate": 3.4333452375967143e-06, + "loss": 0.4425, + "step": 6303 + }, + { + "epoch": 0.6135279805352798, + "grad_norm": 1.4483668041568563, + "learning_rate": 3.4318485290909604e-06, + "loss": 0.2765, + "step": 6304 + }, + { + "epoch": 0.613625304136253, + "grad_norm": 1.4808643788761575, + "learning_rate": 3.4303519764133784e-06, + "loss": 0.3663, + "step": 6305 + }, + { + "epoch": 0.6137226277372263, + "grad_norm": 1.4000004393713126, + "learning_rate": 3.428855579712687e-06, + "loss": 0.3395, + "step": 6306 + }, + { + "epoch": 0.6138199513381996, + "grad_norm": 1.3367216251772667, + "learning_rate": 3.4273593391375844e-06, + "loss": 0.3569, + "step": 6307 + }, + { + "epoch": 0.6139172749391727, + "grad_norm": 1.560089999317103, + "learning_rate": 3.4258632548367487e-06, + "loss": 0.4118, + "step": 6308 + }, + { + "epoch": 0.614014598540146, + "grad_norm": 1.1278244167595692, + "learning_rate": 3.4243673269588485e-06, + "loss": 0.2091, + "step": 6309 + }, + { + "epoch": 0.6141119221411192, + "grad_norm": 1.4973723124992595, + "learning_rate": 3.422871555652536e-06, + "loss": 0.4631, + "step": 6310 + }, + { + "epoch": 0.6142092457420925, + "grad_norm": 1.4001545939921534, + "learning_rate": 3.421375941066447e-06, + "loss": 0.439, + "step": 6311 + }, + { + "epoch": 0.6143065693430657, + "grad_norm": 1.5825897361298165, + "learning_rate": 3.4198804833492004e-06, + "loss": 0.4885, + "step": 6312 + }, + { + "epoch": 0.614403892944039, + "grad_norm": 1.2241923764521647, + "learning_rate": 3.4183851826494015e-06, + "loss": 0.2927, + "step": 6313 + }, + { + "epoch": 0.6145012165450122, + "grad_norm": 1.6212037566214181, + "learning_rate": 3.416890039115639e-06, + "loss": 0.3205, + "step": 6314 + }, + { + "epoch": 0.6145985401459854, + "grad_norm": 2.2490969011082984, + "learning_rate": 3.4153950528964867e-06, + "loss": 0.4081, + "step": 6315 + }, + { + "epoch": 0.6146958637469586, + "grad_norm": 1.259483412248355, + "learning_rate": 3.4139002241405016e-06, + "loss": 0.3144, + "step": 6316 + }, + { + "epoch": 0.6147931873479319, + "grad_norm": 1.3599382746236823, + "learning_rate": 3.4124055529962263e-06, + "loss": 0.4379, + "step": 6317 + }, + { + "epoch": 0.6148905109489051, + "grad_norm": 1.4671575151168392, + "learning_rate": 3.4109110396121886e-06, + "loss": 0.2986, + "step": 6318 + }, + { + "epoch": 0.6149878345498784, + "grad_norm": 1.2928353324351105, + "learning_rate": 3.409416684136896e-06, + "loss": 0.3141, + "step": 6319 + }, + { + "epoch": 0.6150851581508516, + "grad_norm": 1.6815782418016656, + "learning_rate": 3.4079224867188447e-06, + "loss": 0.5077, + "step": 6320 + }, + { + "epoch": 0.6151824817518248, + "grad_norm": 2.0106035715561497, + "learning_rate": 3.4064284475065148e-06, + "loss": 0.6779, + "step": 6321 + }, + { + "epoch": 0.615279805352798, + "grad_norm": 1.5034343821418512, + "learning_rate": 3.4049345666483703e-06, + "loss": 0.3309, + "step": 6322 + }, + { + "epoch": 0.6153771289537713, + "grad_norm": 1.4676413714599856, + "learning_rate": 3.403440844292858e-06, + "loss": 0.4523, + "step": 6323 + }, + { + "epoch": 0.6154744525547445, + "grad_norm": 1.272720041192553, + "learning_rate": 3.401947280588409e-06, + "loss": 0.285, + "step": 6324 + }, + { + "epoch": 0.6155717761557178, + "grad_norm": 2.0024256302147694, + "learning_rate": 3.400453875683442e-06, + "loss": 0.2769, + "step": 6325 + }, + { + "epoch": 0.615669099756691, + "grad_norm": 1.416829810378921, + "learning_rate": 3.3989606297263576e-06, + "loss": 0.2241, + "step": 6326 + }, + { + "epoch": 0.6157664233576642, + "grad_norm": 1.4792782468305348, + "learning_rate": 3.397467542865538e-06, + "loss": 0.4518, + "step": 6327 + }, + { + "epoch": 0.6158637469586374, + "grad_norm": 1.3309962483015716, + "learning_rate": 3.395974615249354e-06, + "loss": 0.4229, + "step": 6328 + }, + { + "epoch": 0.6159610705596107, + "grad_norm": 1.5278524736967327, + "learning_rate": 3.3944818470261604e-06, + "loss": 0.3332, + "step": 6329 + }, + { + "epoch": 0.6160583941605839, + "grad_norm": 1.5674212204454987, + "learning_rate": 3.39298923834429e-06, + "loss": 0.4768, + "step": 6330 + }, + { + "epoch": 0.6161557177615572, + "grad_norm": 1.7051468493407753, + "learning_rate": 3.3914967893520673e-06, + "loss": 0.3586, + "step": 6331 + }, + { + "epoch": 0.6162530413625305, + "grad_norm": 1.6596272422325444, + "learning_rate": 3.390004500197797e-06, + "loss": 0.4529, + "step": 6332 + }, + { + "epoch": 0.6163503649635036, + "grad_norm": 1.4436935017417774, + "learning_rate": 3.38851237102977e-06, + "loss": 0.2779, + "step": 6333 + }, + { + "epoch": 0.6164476885644768, + "grad_norm": 1.7240503599684307, + "learning_rate": 3.3870204019962583e-06, + "loss": 0.4542, + "step": 6334 + }, + { + "epoch": 0.6165450121654501, + "grad_norm": 1.4194391366635841, + "learning_rate": 3.3855285932455204e-06, + "loss": 0.37, + "step": 6335 + }, + { + "epoch": 0.6166423357664234, + "grad_norm": 1.4749399787199533, + "learning_rate": 3.3840369449258005e-06, + "loss": 0.478, + "step": 6336 + }, + { + "epoch": 0.6167396593673966, + "grad_norm": 1.0682405339452568, + "learning_rate": 3.3825454571853213e-06, + "loss": 0.2321, + "step": 6337 + }, + { + "epoch": 0.6168369829683699, + "grad_norm": 1.122839123956751, + "learning_rate": 3.3810541301722932e-06, + "loss": 0.2402, + "step": 6338 + }, + { + "epoch": 0.6169343065693431, + "grad_norm": 1.485590707963305, + "learning_rate": 3.3795629640349127e-06, + "loss": 0.2774, + "step": 6339 + }, + { + "epoch": 0.6170316301703163, + "grad_norm": 1.5247485657081794, + "learning_rate": 3.378071958921357e-06, + "loss": 0.4048, + "step": 6340 + }, + { + "epoch": 0.6171289537712895, + "grad_norm": 2.2602022716848693, + "learning_rate": 3.3765811149797866e-06, + "loss": 0.1987, + "step": 6341 + }, + { + "epoch": 0.6172262773722628, + "grad_norm": 1.862554444406962, + "learning_rate": 3.375090432358349e-06, + "loss": 0.374, + "step": 6342 + }, + { + "epoch": 0.617323600973236, + "grad_norm": 1.2615712082519541, + "learning_rate": 3.373599911205173e-06, + "loss": 0.3411, + "step": 6343 + }, + { + "epoch": 0.6174209245742093, + "grad_norm": 1.5993289911376303, + "learning_rate": 3.3721095516683745e-06, + "loss": 0.3269, + "step": 6344 + }, + { + "epoch": 0.6175182481751825, + "grad_norm": 1.3306415725614866, + "learning_rate": 3.3706193538960497e-06, + "loss": 0.1994, + "step": 6345 + }, + { + "epoch": 0.6176155717761557, + "grad_norm": 2.021754680750299, + "learning_rate": 3.3691293180362817e-06, + "loss": 0.4255, + "step": 6346 + }, + { + "epoch": 0.6177128953771289, + "grad_norm": 1.6757927740807863, + "learning_rate": 3.3676394442371363e-06, + "loss": 0.3938, + "step": 6347 + }, + { + "epoch": 0.6178102189781022, + "grad_norm": 1.5856184615021809, + "learning_rate": 3.366149732646661e-06, + "loss": 0.336, + "step": 6348 + }, + { + "epoch": 0.6179075425790754, + "grad_norm": 1.2525495278013308, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.2742, + "step": 6349 + }, + { + "epoch": 0.6180048661800487, + "grad_norm": 2.6474231138943267, + "learning_rate": 3.3631707966838455e-06, + "loss": 0.2969, + "step": 6350 + }, + { + "epoch": 0.6181021897810219, + "grad_norm": 1.4764349342423693, + "learning_rate": 3.3616815726075246e-06, + "loss": 0.4025, + "step": 6351 + }, + { + "epoch": 0.6181995133819951, + "grad_norm": 1.516216909703192, + "learning_rate": 3.360192511331911e-06, + "loss": 0.4357, + "step": 6352 + }, + { + "epoch": 0.6182968369829683, + "grad_norm": 1.5645375508781854, + "learning_rate": 3.3587036130049755e-06, + "loss": 0.4842, + "step": 6353 + }, + { + "epoch": 0.6183941605839416, + "grad_norm": 1.6055309719827424, + "learning_rate": 3.3572148777746725e-06, + "loss": 0.3949, + "step": 6354 + }, + { + "epoch": 0.6184914841849148, + "grad_norm": 1.4187572798353119, + "learning_rate": 3.355726305788935e-06, + "loss": 0.347, + "step": 6355 + }, + { + "epoch": 0.6185888077858881, + "grad_norm": 1.4852764404080314, + "learning_rate": 3.3542378971956855e-06, + "loss": 0.4636, + "step": 6356 + }, + { + "epoch": 0.6186861313868613, + "grad_norm": 1.4680545865026327, + "learning_rate": 3.352749652142827e-06, + "loss": 0.3263, + "step": 6357 + }, + { + "epoch": 0.6187834549878346, + "grad_norm": 1.2909199596978085, + "learning_rate": 3.35126157077825e-06, + "loss": 0.221, + "step": 6358 + }, + { + "epoch": 0.6188807785888077, + "grad_norm": 1.2858512877329278, + "learning_rate": 3.3497736532498228e-06, + "loss": 0.4425, + "step": 6359 + }, + { + "epoch": 0.618978102189781, + "grad_norm": 1.4679780920197352, + "learning_rate": 3.348285899705402e-06, + "loss": 0.3828, + "step": 6360 + }, + { + "epoch": 0.6190754257907543, + "grad_norm": 1.4904109905171448, + "learning_rate": 3.3467983102928264e-06, + "loss": 0.5438, + "step": 6361 + }, + { + "epoch": 0.6191727493917275, + "grad_norm": 1.5466551961692698, + "learning_rate": 3.345310885159921e-06, + "loss": 0.4711, + "step": 6362 + }, + { + "epoch": 0.6192700729927008, + "grad_norm": 1.6000010251995618, + "learning_rate": 3.3438236244544876e-06, + "loss": 0.4009, + "step": 6363 + }, + { + "epoch": 0.619367396593674, + "grad_norm": 1.380310294745672, + "learning_rate": 3.342336528324318e-06, + "loss": 0.3238, + "step": 6364 + }, + { + "epoch": 0.6194647201946472, + "grad_norm": 1.4194772646038305, + "learning_rate": 3.3408495969171895e-06, + "loss": 0.277, + "step": 6365 + }, + { + "epoch": 0.6195620437956204, + "grad_norm": 1.3779067180187112, + "learning_rate": 3.3393628303808546e-06, + "loss": 0.417, + "step": 6366 + }, + { + "epoch": 0.6196593673965937, + "grad_norm": 1.5475185096139097, + "learning_rate": 3.337876228863055e-06, + "loss": 0.549, + "step": 6367 + }, + { + "epoch": 0.6197566909975669, + "grad_norm": 1.601875683936525, + "learning_rate": 3.3363897925115174e-06, + "loss": 0.578, + "step": 6368 + }, + { + "epoch": 0.6198540145985402, + "grad_norm": 1.485607719529514, + "learning_rate": 3.334903521473949e-06, + "loss": 0.254, + "step": 6369 + }, + { + "epoch": 0.6199513381995134, + "grad_norm": 1.6975560965840877, + "learning_rate": 3.33341741589804e-06, + "loss": 0.6447, + "step": 6370 + }, + { + "epoch": 0.6200486618004866, + "grad_norm": 1.5440973331279908, + "learning_rate": 3.331931475931467e-06, + "loss": 0.533, + "step": 6371 + }, + { + "epoch": 0.6201459854014598, + "grad_norm": 1.489907965153857, + "learning_rate": 3.33044570172189e-06, + "loss": 0.4217, + "step": 6372 + }, + { + "epoch": 0.6202433090024331, + "grad_norm": 1.3357390317671818, + "learning_rate": 3.3289600934169488e-06, + "loss": 0.3263, + "step": 6373 + }, + { + "epoch": 0.6203406326034063, + "grad_norm": 1.405734158318971, + "learning_rate": 3.327474651164268e-06, + "loss": 0.4839, + "step": 6374 + }, + { + "epoch": 0.6204379562043796, + "grad_norm": 1.72369789238416, + "learning_rate": 3.3259893751114607e-06, + "loss": 0.3278, + "step": 6375 + }, + { + "epoch": 0.6205352798053528, + "grad_norm": 1.6631384777969438, + "learning_rate": 3.32450426540612e-06, + "loss": 0.4216, + "step": 6376 + }, + { + "epoch": 0.6206326034063261, + "grad_norm": 1.306876624538264, + "learning_rate": 3.3230193221958185e-06, + "loss": 0.3602, + "step": 6377 + }, + { + "epoch": 0.6207299270072992, + "grad_norm": 1.8326922365035092, + "learning_rate": 3.321534545628118e-06, + "loss": 0.3393, + "step": 6378 + }, + { + "epoch": 0.6208272506082725, + "grad_norm": 1.3059504443839534, + "learning_rate": 3.3200499358505612e-06, + "loss": 0.3376, + "step": 6379 + }, + { + "epoch": 0.6209245742092457, + "grad_norm": 1.2028159202790685, + "learning_rate": 3.318565493010676e-06, + "loss": 0.3857, + "step": 6380 + }, + { + "epoch": 0.621021897810219, + "grad_norm": 1.5256994425713408, + "learning_rate": 3.3170812172559695e-06, + "loss": 0.5269, + "step": 6381 + }, + { + "epoch": 0.6211192214111922, + "grad_norm": 1.5735523716078301, + "learning_rate": 3.3155971087339373e-06, + "loss": 0.5115, + "step": 6382 + }, + { + "epoch": 0.6212165450121655, + "grad_norm": 1.480129233208057, + "learning_rate": 3.314113167592058e-06, + "loss": 0.2543, + "step": 6383 + }, + { + "epoch": 0.6213138686131386, + "grad_norm": 1.4854674485324186, + "learning_rate": 3.3126293939777865e-06, + "loss": 0.4638, + "step": 6384 + }, + { + "epoch": 0.6214111922141119, + "grad_norm": 1.6127885338551649, + "learning_rate": 3.311145788038569e-06, + "loss": 0.1873, + "step": 6385 + }, + { + "epoch": 0.6215085158150851, + "grad_norm": 1.408793543876312, + "learning_rate": 3.309662349921834e-06, + "loss": 0.4188, + "step": 6386 + }, + { + "epoch": 0.6216058394160584, + "grad_norm": 1.312927812153489, + "learning_rate": 3.3081790797749915e-06, + "loss": 0.3685, + "step": 6387 + }, + { + "epoch": 0.6217031630170317, + "grad_norm": 1.286380080753042, + "learning_rate": 3.3066959777454324e-06, + "loss": 0.3, + "step": 6388 + }, + { + "epoch": 0.6218004866180049, + "grad_norm": 1.4640736093985107, + "learning_rate": 3.305213043980534e-06, + "loss": 0.2843, + "step": 6389 + }, + { + "epoch": 0.621897810218978, + "grad_norm": 1.601117476995483, + "learning_rate": 3.3037302786276584e-06, + "loss": 0.3321, + "step": 6390 + }, + { + "epoch": 0.6219951338199513, + "grad_norm": 1.5629534254206616, + "learning_rate": 3.3022476818341466e-06, + "loss": 0.4059, + "step": 6391 + }, + { + "epoch": 0.6220924574209246, + "grad_norm": 1.375427613090092, + "learning_rate": 3.300765253747327e-06, + "loss": 0.376, + "step": 6392 + }, + { + "epoch": 0.6221897810218978, + "grad_norm": 1.520247028505382, + "learning_rate": 3.2992829945145076e-06, + "loss": 0.4351, + "step": 6393 + }, + { + "epoch": 0.6222871046228711, + "grad_norm": 1.6734811073697942, + "learning_rate": 3.2978009042829843e-06, + "loss": 0.3744, + "step": 6394 + }, + { + "epoch": 0.6223844282238443, + "grad_norm": 1.4250547465465695, + "learning_rate": 3.2963189832000286e-06, + "loss": 0.3529, + "step": 6395 + }, + { + "epoch": 0.6224817518248175, + "grad_norm": 1.5534281014660964, + "learning_rate": 3.294837231412904e-06, + "loss": 0.4346, + "step": 6396 + }, + { + "epoch": 0.6225790754257907, + "grad_norm": 1.5199040051313941, + "learning_rate": 3.2933556490688515e-06, + "loss": 0.552, + "step": 6397 + }, + { + "epoch": 0.622676399026764, + "grad_norm": 1.4811874698365435, + "learning_rate": 3.2918742363150996e-06, + "loss": 0.4781, + "step": 6398 + }, + { + "epoch": 0.6227737226277372, + "grad_norm": 1.5829534409520465, + "learning_rate": 3.290392993298852e-06, + "loss": 0.4083, + "step": 6399 + }, + { + "epoch": 0.6228710462287105, + "grad_norm": 1.4110587235903005, + "learning_rate": 3.2889119201673043e-06, + "loss": 0.447, + "step": 6400 + }, + { + "epoch": 0.6229683698296837, + "grad_norm": 1.4361938679402597, + "learning_rate": 3.2874310170676316e-06, + "loss": 0.4102, + "step": 6401 + }, + { + "epoch": 0.623065693430657, + "grad_norm": 1.6223143612737465, + "learning_rate": 3.28595028414699e-06, + "loss": 0.4837, + "step": 6402 + }, + { + "epoch": 0.6231630170316301, + "grad_norm": 1.4197191935920161, + "learning_rate": 3.2844697215525224e-06, + "loss": 0.237, + "step": 6403 + }, + { + "epoch": 0.6232603406326034, + "grad_norm": 1.5890665990125814, + "learning_rate": 3.282989329431353e-06, + "loss": 0.5402, + "step": 6404 + }, + { + "epoch": 0.6233576642335766, + "grad_norm": 1.4318873256650708, + "learning_rate": 3.28150910793059e-06, + "loss": 0.4189, + "step": 6405 + }, + { + "epoch": 0.6234549878345499, + "grad_norm": 1.2893337542343504, + "learning_rate": 3.2800290571973225e-06, + "loss": 0.2076, + "step": 6406 + }, + { + "epoch": 0.6235523114355231, + "grad_norm": 1.7394992632928765, + "learning_rate": 3.278549177378625e-06, + "loss": 0.4858, + "step": 6407 + }, + { + "epoch": 0.6236496350364964, + "grad_norm": 1.7877530299008864, + "learning_rate": 3.2770694686215555e-06, + "loss": 0.3758, + "step": 6408 + }, + { + "epoch": 0.6237469586374695, + "grad_norm": 1.3889300700546277, + "learning_rate": 3.27558993107315e-06, + "loss": 0.3385, + "step": 6409 + }, + { + "epoch": 0.6238442822384428, + "grad_norm": 1.5311081489721348, + "learning_rate": 3.2741105648804326e-06, + "loss": 0.5021, + "step": 6410 + }, + { + "epoch": 0.623941605839416, + "grad_norm": 1.6357991769456734, + "learning_rate": 3.27263137019041e-06, + "loss": 0.3163, + "step": 6411 + }, + { + "epoch": 0.6240389294403893, + "grad_norm": 1.5531807503777904, + "learning_rate": 3.27115234715007e-06, + "loss": 0.4715, + "step": 6412 + }, + { + "epoch": 0.6241362530413626, + "grad_norm": 1.595618256234525, + "learning_rate": 3.2696734959063836e-06, + "loss": 0.2676, + "step": 6413 + }, + { + "epoch": 0.6242335766423358, + "grad_norm": 1.2586532057790611, + "learning_rate": 3.268194816606305e-06, + "loss": 0.3499, + "step": 6414 + }, + { + "epoch": 0.624330900243309, + "grad_norm": 1.3478748089021555, + "learning_rate": 3.266716309396772e-06, + "loss": 0.3094, + "step": 6415 + }, + { + "epoch": 0.6244282238442822, + "grad_norm": 1.6488441580377198, + "learning_rate": 3.2652379744247053e-06, + "loss": 0.2998, + "step": 6416 + }, + { + "epoch": 0.6245255474452555, + "grad_norm": 1.6076493872823139, + "learning_rate": 3.2637598118370075e-06, + "loss": 0.3228, + "step": 6417 + }, + { + "epoch": 0.6246228710462287, + "grad_norm": 1.5158291708686529, + "learning_rate": 3.2622818217805634e-06, + "loss": 0.4085, + "step": 6418 + }, + { + "epoch": 0.624720194647202, + "grad_norm": 1.4341934155650422, + "learning_rate": 3.260804004402245e-06, + "loss": 0.2319, + "step": 6419 + }, + { + "epoch": 0.6248175182481752, + "grad_norm": 1.663752280677247, + "learning_rate": 3.2593263598489e-06, + "loss": 0.4452, + "step": 6420 + }, + { + "epoch": 0.6249148418491485, + "grad_norm": 1.2769632035094298, + "learning_rate": 3.257848888267364e-06, + "loss": 0.3209, + "step": 6421 + }, + { + "epoch": 0.6250121654501216, + "grad_norm": 1.4916826762329467, + "learning_rate": 3.256371589804455e-06, + "loss": 0.4286, + "step": 6422 + }, + { + "epoch": 0.6251094890510949, + "grad_norm": 1.36549468826328, + "learning_rate": 3.2548944646069743e-06, + "loss": 0.4467, + "step": 6423 + }, + { + "epoch": 0.6252068126520681, + "grad_norm": 1.3602976682219388, + "learning_rate": 3.2534175128217016e-06, + "loss": 0.4956, + "step": 6424 + }, + { + "epoch": 0.6253041362530414, + "grad_norm": 1.4692369830606595, + "learning_rate": 3.2519407345954048e-06, + "loss": 0.4828, + "step": 6425 + }, + { + "epoch": 0.6254014598540146, + "grad_norm": 1.0896644581534791, + "learning_rate": 3.2504641300748317e-06, + "loss": 0.2557, + "step": 6426 + }, + { + "epoch": 0.6254987834549879, + "grad_norm": 1.3661247264391625, + "learning_rate": 3.2489876994067127e-06, + "loss": 0.3301, + "step": 6427 + }, + { + "epoch": 0.625596107055961, + "grad_norm": 1.6193965254088782, + "learning_rate": 3.2475114427377628e-06, + "loss": 0.5323, + "step": 6428 + }, + { + "epoch": 0.6256934306569343, + "grad_norm": 1.58042754846969, + "learning_rate": 3.246035360214678e-06, + "loss": 0.3914, + "step": 6429 + }, + { + "epoch": 0.6257907542579075, + "grad_norm": 1.5398264117065557, + "learning_rate": 3.244559451984139e-06, + "loss": 0.3135, + "step": 6430 + }, + { + "epoch": 0.6258880778588808, + "grad_norm": 1.393062042004862, + "learning_rate": 3.243083718192804e-06, + "loss": 0.4127, + "step": 6431 + }, + { + "epoch": 0.625985401459854, + "grad_norm": 1.374180419456703, + "learning_rate": 3.2416081589873204e-06, + "loss": 0.4203, + "step": 6432 + }, + { + "epoch": 0.6260827250608273, + "grad_norm": 1.5221690056463797, + "learning_rate": 3.240132774514314e-06, + "loss": 0.3567, + "step": 6433 + }, + { + "epoch": 0.6261800486618004, + "grad_norm": 1.7565693167917438, + "learning_rate": 3.238657564920398e-06, + "loss": 0.4339, + "step": 6434 + }, + { + "epoch": 0.6262773722627737, + "grad_norm": 1.4458477781748666, + "learning_rate": 3.2371825303521608e-06, + "loss": 0.3381, + "step": 6435 + }, + { + "epoch": 0.6263746958637469, + "grad_norm": 1.0579358974139734, + "learning_rate": 3.2357076709561785e-06, + "loss": 0.2131, + "step": 6436 + }, + { + "epoch": 0.6264720194647202, + "grad_norm": 1.411446443439624, + "learning_rate": 3.2342329868790113e-06, + "loss": 0.3913, + "step": 6437 + }, + { + "epoch": 0.6265693430656935, + "grad_norm": 1.6804640992905848, + "learning_rate": 3.2327584782671954e-06, + "loss": 0.4483, + "step": 6438 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 1.3697314327713126, + "learning_rate": 3.2312841452672565e-06, + "loss": 0.4149, + "step": 6439 + }, + { + "epoch": 0.6267639902676398, + "grad_norm": 1.4377594589269822, + "learning_rate": 3.2298099880256996e-06, + "loss": 0.2782, + "step": 6440 + }, + { + "epoch": 0.6268613138686131, + "grad_norm": 1.2400485642982848, + "learning_rate": 3.2283360066890135e-06, + "loss": 0.3476, + "step": 6441 + }, + { + "epoch": 0.6269586374695864, + "grad_norm": 1.4012086147216676, + "learning_rate": 3.2268622014036654e-06, + "loss": 0.3381, + "step": 6442 + }, + { + "epoch": 0.6270559610705596, + "grad_norm": 1.9163727811432625, + "learning_rate": 3.22538857231611e-06, + "loss": 0.2783, + "step": 6443 + }, + { + "epoch": 0.6271532846715329, + "grad_norm": 1.5427758977580395, + "learning_rate": 3.2239151195727824e-06, + "loss": 0.445, + "step": 6444 + }, + { + "epoch": 0.6272506082725061, + "grad_norm": 1.3443867261615499, + "learning_rate": 3.2224418433201036e-06, + "loss": 0.3202, + "step": 6445 + }, + { + "epoch": 0.6273479318734794, + "grad_norm": 1.4345988977751094, + "learning_rate": 3.22096874370447e-06, + "loss": 0.4615, + "step": 6446 + }, + { + "epoch": 0.6274452554744525, + "grad_norm": 1.5837685923700648, + "learning_rate": 3.2194958208722656e-06, + "loss": 0.4776, + "step": 6447 + }, + { + "epoch": 0.6275425790754258, + "grad_norm": 1.3815783149316243, + "learning_rate": 3.2180230749698565e-06, + "loss": 0.41, + "step": 6448 + }, + { + "epoch": 0.627639902676399, + "grad_norm": 1.237087793767247, + "learning_rate": 3.2165505061435887e-06, + "loss": 0.2476, + "step": 6449 + }, + { + "epoch": 0.6277372262773723, + "grad_norm": 1.2584796818308166, + "learning_rate": 3.2150781145397937e-06, + "loss": 0.3838, + "step": 6450 + }, + { + "epoch": 0.6278345498783455, + "grad_norm": 1.5613599051036826, + "learning_rate": 3.213605900304784e-06, + "loss": 0.3109, + "step": 6451 + }, + { + "epoch": 0.6279318734793188, + "grad_norm": 1.2250805302833447, + "learning_rate": 3.2121338635848553e-06, + "loss": 0.2798, + "step": 6452 + }, + { + "epoch": 0.6280291970802919, + "grad_norm": 1.6756000554221138, + "learning_rate": 3.2106620045262813e-06, + "loss": 0.3721, + "step": 6453 + }, + { + "epoch": 0.6281265206812652, + "grad_norm": 1.3234677506089279, + "learning_rate": 3.209190323275323e-06, + "loss": 0.2681, + "step": 6454 + }, + { + "epoch": 0.6282238442822384, + "grad_norm": 1.5906825746559898, + "learning_rate": 3.207718819978226e-06, + "loss": 0.4974, + "step": 6455 + }, + { + "epoch": 0.6283211678832117, + "grad_norm": 1.5094469925986025, + "learning_rate": 3.2062474947812096e-06, + "loss": 0.3998, + "step": 6456 + }, + { + "epoch": 0.6284184914841849, + "grad_norm": 1.5482315126630701, + "learning_rate": 3.204776347830482e-06, + "loss": 0.5504, + "step": 6457 + }, + { + "epoch": 0.6285158150851582, + "grad_norm": 1.3480322271903677, + "learning_rate": 3.2033053792722326e-06, + "loss": 0.2613, + "step": 6458 + }, + { + "epoch": 0.6286131386861313, + "grad_norm": 1.5305125737986958, + "learning_rate": 3.2018345892526327e-06, + "loss": 0.3946, + "step": 6459 + }, + { + "epoch": 0.6287104622871046, + "grad_norm": 1.3806290279276137, + "learning_rate": 3.2003639779178334e-06, + "loss": 0.3694, + "step": 6460 + }, + { + "epoch": 0.6288077858880778, + "grad_norm": 1.6224067977444971, + "learning_rate": 3.1988935454139723e-06, + "loss": 0.5805, + "step": 6461 + }, + { + "epoch": 0.6289051094890511, + "grad_norm": 1.5657592350411667, + "learning_rate": 3.1974232918871666e-06, + "loss": 0.317, + "step": 6462 + }, + { + "epoch": 0.6290024330900243, + "grad_norm": 3.5127424163247816, + "learning_rate": 3.1959532174835186e-06, + "loss": 0.5635, + "step": 6463 + }, + { + "epoch": 0.6290997566909976, + "grad_norm": 1.2858706636545765, + "learning_rate": 3.1944833223491046e-06, + "loss": 0.2784, + "step": 6464 + }, + { + "epoch": 0.6291970802919709, + "grad_norm": 1.3540634213727698, + "learning_rate": 3.1930136066299945e-06, + "loss": 0.3003, + "step": 6465 + }, + { + "epoch": 0.629294403892944, + "grad_norm": 1.170333153169365, + "learning_rate": 3.1915440704722343e-06, + "loss": 0.323, + "step": 6466 + }, + { + "epoch": 0.6293917274939173, + "grad_norm": 1.3124886466852246, + "learning_rate": 3.190074714021849e-06, + "loss": 0.321, + "step": 6467 + }, + { + "epoch": 0.6294890510948905, + "grad_norm": 1.6243528397895342, + "learning_rate": 3.1886055374248526e-06, + "loss": 0.2533, + "step": 6468 + }, + { + "epoch": 0.6295863746958638, + "grad_norm": 1.4883999514363708, + "learning_rate": 3.187136540827237e-06, + "loss": 0.3649, + "step": 6469 + }, + { + "epoch": 0.629683698296837, + "grad_norm": 1.427487040125178, + "learning_rate": 3.185667724374979e-06, + "loss": 0.3898, + "step": 6470 + }, + { + "epoch": 0.6297810218978103, + "grad_norm": 1.115881950643076, + "learning_rate": 3.184199088214033e-06, + "loss": 0.2074, + "step": 6471 + }, + { + "epoch": 0.6298783454987834, + "grad_norm": 1.6919065797054764, + "learning_rate": 3.1827306324903395e-06, + "loss": 0.2688, + "step": 6472 + }, + { + "epoch": 0.6299756690997567, + "grad_norm": 1.3599466897529593, + "learning_rate": 3.1812623573498226e-06, + "loss": 0.3131, + "step": 6473 + }, + { + "epoch": 0.6300729927007299, + "grad_norm": 1.3631234063966051, + "learning_rate": 3.1797942629383793e-06, + "loss": 0.2478, + "step": 6474 + }, + { + "epoch": 0.6301703163017032, + "grad_norm": 1.7283042994052793, + "learning_rate": 3.1783263494019005e-06, + "loss": 0.3768, + "step": 6475 + }, + { + "epoch": 0.6302676399026764, + "grad_norm": 1.4281723248342992, + "learning_rate": 3.1768586168862525e-06, + "loss": 0.3077, + "step": 6476 + }, + { + "epoch": 0.6303649635036497, + "grad_norm": 2.900169985820706, + "learning_rate": 3.1753910655372855e-06, + "loss": 0.3586, + "step": 6477 + }, + { + "epoch": 0.6304622871046228, + "grad_norm": 1.672337334424629, + "learning_rate": 3.173923695500828e-06, + "loss": 0.435, + "step": 6478 + }, + { + "epoch": 0.6305596107055961, + "grad_norm": 1.5938101364458683, + "learning_rate": 3.1724565069226955e-06, + "loss": 0.5215, + "step": 6479 + }, + { + "epoch": 0.6306569343065693, + "grad_norm": 1.4436062200383195, + "learning_rate": 3.1709894999486828e-06, + "loss": 0.4094, + "step": 6480 + }, + { + "epoch": 0.6307542579075426, + "grad_norm": 1.4529861876112795, + "learning_rate": 3.1695226747245687e-06, + "loss": 0.2817, + "step": 6481 + }, + { + "epoch": 0.6308515815085158, + "grad_norm": 1.5483022713531014, + "learning_rate": 3.168056031396111e-06, + "loss": 0.3348, + "step": 6482 + }, + { + "epoch": 0.6309489051094891, + "grad_norm": 1.292264164376706, + "learning_rate": 3.1665895701090516e-06, + "loss": 0.4504, + "step": 6483 + }, + { + "epoch": 0.6310462287104623, + "grad_norm": 1.2803892640445205, + "learning_rate": 3.165123291009114e-06, + "loss": 0.1939, + "step": 6484 + }, + { + "epoch": 0.6311435523114355, + "grad_norm": 1.6429724475131509, + "learning_rate": 3.163657194242002e-06, + "loss": 0.3441, + "step": 6485 + }, + { + "epoch": 0.6312408759124087, + "grad_norm": 1.3659913852581975, + "learning_rate": 3.162191279953403e-06, + "loss": 0.3171, + "step": 6486 + }, + { + "epoch": 0.631338199513382, + "grad_norm": 1.2837470772774027, + "learning_rate": 3.1607255482889865e-06, + "loss": 0.1758, + "step": 6487 + }, + { + "epoch": 0.6314355231143552, + "grad_norm": 1.6139208100897777, + "learning_rate": 3.159259999394405e-06, + "loss": 0.4519, + "step": 6488 + }, + { + "epoch": 0.6315328467153285, + "grad_norm": 1.7475421539155827, + "learning_rate": 3.1577946334152867e-06, + "loss": 0.2733, + "step": 6489 + }, + { + "epoch": 0.6316301703163018, + "grad_norm": 1.4912354792665754, + "learning_rate": 3.1563294504972474e-06, + "loss": 0.4201, + "step": 6490 + }, + { + "epoch": 0.6317274939172749, + "grad_norm": 1.6382570162150927, + "learning_rate": 3.154864450785885e-06, + "loss": 0.4395, + "step": 6491 + }, + { + "epoch": 0.6318248175182481, + "grad_norm": 1.5663588366859882, + "learning_rate": 3.1533996344267753e-06, + "loss": 0.5681, + "step": 6492 + }, + { + "epoch": 0.6319221411192214, + "grad_norm": 1.4907756737266007, + "learning_rate": 3.1519350015654793e-06, + "loss": 0.4187, + "step": 6493 + }, + { + "epoch": 0.6320194647201947, + "grad_norm": 1.4455849880382687, + "learning_rate": 3.1504705523475377e-06, + "loss": 0.3222, + "step": 6494 + }, + { + "epoch": 0.6321167883211679, + "grad_norm": 1.4603444900127154, + "learning_rate": 3.1490062869184747e-06, + "loss": 0.3556, + "step": 6495 + }, + { + "epoch": 0.6322141119221412, + "grad_norm": 1.5259510139212225, + "learning_rate": 3.1475422054237948e-06, + "loss": 0.3879, + "step": 6496 + }, + { + "epoch": 0.6323114355231143, + "grad_norm": 1.4348064563236933, + "learning_rate": 3.1460783080089835e-06, + "loss": 0.322, + "step": 6497 + }, + { + "epoch": 0.6324087591240876, + "grad_norm": 1.464928220220762, + "learning_rate": 3.1446145948195104e-06, + "loss": 0.2428, + "step": 6498 + }, + { + "epoch": 0.6325060827250608, + "grad_norm": 2.010405176423069, + "learning_rate": 3.143151066000828e-06, + "loss": 0.4493, + "step": 6499 + }, + { + "epoch": 0.6326034063260341, + "grad_norm": 1.6763377299382285, + "learning_rate": 3.141687721698363e-06, + "loss": 0.3089, + "step": 6500 + }, + { + "epoch": 0.6327007299270073, + "grad_norm": 1.2933149997814406, + "learning_rate": 3.140224562057532e-06, + "loss": 0.2106, + "step": 6501 + }, + { + "epoch": 0.6327980535279806, + "grad_norm": 1.3798692490652842, + "learning_rate": 3.13876158722373e-06, + "loss": 0.2742, + "step": 6502 + }, + { + "epoch": 0.6328953771289537, + "grad_norm": 1.6893586413473158, + "learning_rate": 3.137298797342332e-06, + "loss": 0.4408, + "step": 6503 + }, + { + "epoch": 0.632992700729927, + "grad_norm": 1.471219847182982, + "learning_rate": 3.135836192558697e-06, + "loss": 0.3318, + "step": 6504 + }, + { + "epoch": 0.6330900243309002, + "grad_norm": 1.2796389057029516, + "learning_rate": 3.1343737730181655e-06, + "loss": 0.4114, + "step": 6505 + }, + { + "epoch": 0.6331873479318735, + "grad_norm": 1.8869923142527827, + "learning_rate": 3.13291153886606e-06, + "loss": 0.3791, + "step": 6506 + }, + { + "epoch": 0.6332846715328467, + "grad_norm": 1.4539433297784599, + "learning_rate": 3.131449490247682e-06, + "loss": 0.3876, + "step": 6507 + }, + { + "epoch": 0.63338199513382, + "grad_norm": 1.3341134094117948, + "learning_rate": 3.1299876273083164e-06, + "loss": 0.3685, + "step": 6508 + }, + { + "epoch": 0.6334793187347932, + "grad_norm": 1.4170620964782272, + "learning_rate": 3.128525950193232e-06, + "loss": 0.4013, + "step": 6509 + }, + { + "epoch": 0.6335766423357664, + "grad_norm": 1.5530589406262345, + "learning_rate": 3.127064459047671e-06, + "loss": 0.368, + "step": 6510 + }, + { + "epoch": 0.6336739659367396, + "grad_norm": 1.5483044271710225, + "learning_rate": 3.125603154016867e-06, + "loss": 0.4434, + "step": 6511 + }, + { + "epoch": 0.6337712895377129, + "grad_norm": 1.3264657819916343, + "learning_rate": 3.1241420352460296e-06, + "loss": 0.3797, + "step": 6512 + }, + { + "epoch": 0.6338686131386861, + "grad_norm": 1.4633775214009548, + "learning_rate": 3.1226811028803514e-06, + "loss": 0.3799, + "step": 6513 + }, + { + "epoch": 0.6339659367396594, + "grad_norm": 1.4275809021587014, + "learning_rate": 3.121220357065006e-06, + "loss": 0.3724, + "step": 6514 + }, + { + "epoch": 0.6340632603406327, + "grad_norm": 1.2579159901379062, + "learning_rate": 3.1197597979451477e-06, + "loss": 0.34, + "step": 6515 + }, + { + "epoch": 0.6341605839416058, + "grad_norm": 1.767126339664398, + "learning_rate": 3.118299425665914e-06, + "loss": 0.5021, + "step": 6516 + }, + { + "epoch": 0.634257907542579, + "grad_norm": 1.6146261230049028, + "learning_rate": 3.116839240372424e-06, + "loss": 0.6639, + "step": 6517 + }, + { + "epoch": 0.6343552311435523, + "grad_norm": 1.5481263310931461, + "learning_rate": 3.115379242209775e-06, + "loss": 0.4425, + "step": 6518 + }, + { + "epoch": 0.6344525547445256, + "grad_norm": 1.6350276832686754, + "learning_rate": 3.1139194313230497e-06, + "loss": 0.4323, + "step": 6519 + }, + { + "epoch": 0.6345498783454988, + "grad_norm": 1.681221053430889, + "learning_rate": 3.1124598078573115e-06, + "loss": 0.2437, + "step": 6520 + }, + { + "epoch": 0.6346472019464721, + "grad_norm": 1.4748162332920707, + "learning_rate": 3.1110003719576005e-06, + "loss": 0.37, + "step": 6521 + }, + { + "epoch": 0.6347445255474452, + "grad_norm": 1.3748894993856542, + "learning_rate": 3.109541123768943e-06, + "loss": 0.2493, + "step": 6522 + }, + { + "epoch": 0.6348418491484185, + "grad_norm": 1.4376733094936145, + "learning_rate": 3.108082063436346e-06, + "loss": 0.2908, + "step": 6523 + }, + { + "epoch": 0.6349391727493917, + "grad_norm": 1.6379363781656653, + "learning_rate": 3.1066231911047996e-06, + "loss": 0.4498, + "step": 6524 + }, + { + "epoch": 0.635036496350365, + "grad_norm": 1.683708381733199, + "learning_rate": 3.105164506919268e-06, + "loss": 0.3291, + "step": 6525 + }, + { + "epoch": 0.6351338199513382, + "grad_norm": 1.4308202873054392, + "learning_rate": 3.1037060110247053e-06, + "loss": 0.418, + "step": 6526 + }, + { + "epoch": 0.6352311435523115, + "grad_norm": 1.263260032838939, + "learning_rate": 3.1022477035660413e-06, + "loss": 0.2807, + "step": 6527 + }, + { + "epoch": 0.6353284671532847, + "grad_norm": 1.183934128166101, + "learning_rate": 3.1007895846881896e-06, + "loss": 0.3428, + "step": 6528 + }, + { + "epoch": 0.6354257907542579, + "grad_norm": 1.1839227502404823, + "learning_rate": 3.099331654536044e-06, + "loss": 0.3267, + "step": 6529 + }, + { + "epoch": 0.6355231143552311, + "grad_norm": 1.462328895927904, + "learning_rate": 3.0978739132544798e-06, + "loss": 0.4651, + "step": 6530 + }, + { + "epoch": 0.6356204379562044, + "grad_norm": 1.5613248602189018, + "learning_rate": 3.0964163609883563e-06, + "loss": 0.5058, + "step": 6531 + }, + { + "epoch": 0.6357177615571776, + "grad_norm": 1.5514022390955555, + "learning_rate": 3.094958997882507e-06, + "loss": 0.4418, + "step": 6532 + }, + { + "epoch": 0.6358150851581509, + "grad_norm": 1.50778032179577, + "learning_rate": 3.0935018240817518e-06, + "loss": 0.512, + "step": 6533 + }, + { + "epoch": 0.6359124087591241, + "grad_norm": 1.3028402442750457, + "learning_rate": 3.0920448397308932e-06, + "loss": 0.2806, + "step": 6534 + }, + { + "epoch": 0.6360097323600973, + "grad_norm": 1.4253549434913757, + "learning_rate": 3.0905880449747138e-06, + "loss": 0.3683, + "step": 6535 + }, + { + "epoch": 0.6361070559610705, + "grad_norm": 1.6257959030642224, + "learning_rate": 3.089131439957972e-06, + "loss": 0.4056, + "step": 6536 + }, + { + "epoch": 0.6362043795620438, + "grad_norm": 2.038223737180886, + "learning_rate": 3.087675024825413e-06, + "loss": 0.1905, + "step": 6537 + }, + { + "epoch": 0.636301703163017, + "grad_norm": 1.4294957100055656, + "learning_rate": 3.0862187997217643e-06, + "loss": 0.4646, + "step": 6538 + }, + { + "epoch": 0.6363990267639903, + "grad_norm": 1.5204127091474489, + "learning_rate": 3.0847627647917277e-06, + "loss": 0.3884, + "step": 6539 + }, + { + "epoch": 0.6364963503649635, + "grad_norm": 1.470787420807994, + "learning_rate": 3.0833069201799927e-06, + "loss": 0.2999, + "step": 6540 + }, + { + "epoch": 0.6365936739659367, + "grad_norm": 1.3680617670648643, + "learning_rate": 3.0818512660312273e-06, + "loss": 0.1966, + "step": 6541 + }, + { + "epoch": 0.6366909975669099, + "grad_norm": 1.4095686007744734, + "learning_rate": 3.0803958024900822e-06, + "loss": 0.4465, + "step": 6542 + }, + { + "epoch": 0.6367883211678832, + "grad_norm": 1.7020279036810935, + "learning_rate": 3.078940529701183e-06, + "loss": 0.3963, + "step": 6543 + }, + { + "epoch": 0.6368856447688565, + "grad_norm": 1.6759286384412257, + "learning_rate": 3.077485447809145e-06, + "loss": 0.4328, + "step": 6544 + }, + { + "epoch": 0.6369829683698297, + "grad_norm": 1.3607338000680083, + "learning_rate": 3.076030556958563e-06, + "loss": 0.3031, + "step": 6545 + }, + { + "epoch": 0.637080291970803, + "grad_norm": 1.3654839938275702, + "learning_rate": 3.0745758572940044e-06, + "loss": 0.2692, + "step": 6546 + }, + { + "epoch": 0.6371776155717761, + "grad_norm": 1.3285997327669636, + "learning_rate": 3.073121348960026e-06, + "loss": 0.4202, + "step": 6547 + }, + { + "epoch": 0.6372749391727494, + "grad_norm": 1.491142266371717, + "learning_rate": 3.0716670321011637e-06, + "loss": 0.5475, + "step": 6548 + }, + { + "epoch": 0.6373722627737226, + "grad_norm": 1.3959105266273333, + "learning_rate": 3.0702129068619347e-06, + "loss": 0.1799, + "step": 6549 + }, + { + "epoch": 0.6374695863746959, + "grad_norm": 1.4953678451707477, + "learning_rate": 3.068758973386834e-06, + "loss": 0.2944, + "step": 6550 + }, + { + "epoch": 0.6375669099756691, + "grad_norm": 1.3774626526382292, + "learning_rate": 3.0673052318203415e-06, + "loss": 0.4229, + "step": 6551 + }, + { + "epoch": 0.6376642335766424, + "grad_norm": 3.0575530911418967, + "learning_rate": 3.065851682306916e-06, + "loss": 0.3802, + "step": 6552 + }, + { + "epoch": 0.6377615571776156, + "grad_norm": 1.5266257547257827, + "learning_rate": 3.0643983249910003e-06, + "loss": 0.431, + "step": 6553 + }, + { + "epoch": 0.6378588807785888, + "grad_norm": 1.4318729227784441, + "learning_rate": 3.062945160017009e-06, + "loss": 0.3329, + "step": 6554 + }, + { + "epoch": 0.637956204379562, + "grad_norm": 1.477288849615939, + "learning_rate": 3.0614921875293485e-06, + "loss": 0.3838, + "step": 6555 + }, + { + "epoch": 0.6380535279805353, + "grad_norm": 1.4366676574550201, + "learning_rate": 3.0600394076724034e-06, + "loss": 0.3418, + "step": 6556 + }, + { + "epoch": 0.6381508515815085, + "grad_norm": 1.840787349982565, + "learning_rate": 3.058586820590532e-06, + "loss": 0.3175, + "step": 6557 + }, + { + "epoch": 0.6382481751824818, + "grad_norm": 1.6374162084675348, + "learning_rate": 3.057134426428082e-06, + "loss": 0.5415, + "step": 6558 + }, + { + "epoch": 0.638345498783455, + "grad_norm": 1.7509514402086164, + "learning_rate": 3.055682225329378e-06, + "loss": 0.3998, + "step": 6559 + }, + { + "epoch": 0.6384428223844282, + "grad_norm": 1.3419721064191474, + "learning_rate": 3.0542302174387285e-06, + "loss": 0.3397, + "step": 6560 + }, + { + "epoch": 0.6385401459854014, + "grad_norm": 1.4188823856648498, + "learning_rate": 3.052778402900416e-06, + "loss": 0.1553, + "step": 6561 + }, + { + "epoch": 0.6386374695863747, + "grad_norm": 1.4080661490138804, + "learning_rate": 3.051326781858711e-06, + "loss": 0.2777, + "step": 6562 + }, + { + "epoch": 0.6387347931873479, + "grad_norm": 1.5492062197364629, + "learning_rate": 3.0498753544578636e-06, + "loss": 0.4784, + "step": 6563 + }, + { + "epoch": 0.6388321167883212, + "grad_norm": 1.8418367448295987, + "learning_rate": 3.0484241208420974e-06, + "loss": 0.4083, + "step": 6564 + }, + { + "epoch": 0.6389294403892944, + "grad_norm": 1.6392677142297458, + "learning_rate": 3.046973081155627e-06, + "loss": 0.483, + "step": 6565 + }, + { + "epoch": 0.6390267639902676, + "grad_norm": 1.325005041868586, + "learning_rate": 3.0455222355426417e-06, + "loss": 0.3442, + "step": 6566 + }, + { + "epoch": 0.6391240875912408, + "grad_norm": 1.5095000631832123, + "learning_rate": 3.0440715841473156e-06, + "loss": 0.3789, + "step": 6567 + }, + { + "epoch": 0.6392214111922141, + "grad_norm": 1.3463836410906744, + "learning_rate": 3.0426211271137963e-06, + "loss": 0.2513, + "step": 6568 + }, + { + "epoch": 0.6393187347931873, + "grad_norm": 1.4654952978073685, + "learning_rate": 3.041170864586218e-06, + "loss": 0.2815, + "step": 6569 + }, + { + "epoch": 0.6394160583941606, + "grad_norm": 1.2085160194064402, + "learning_rate": 3.0397207967086963e-06, + "loss": 0.2626, + "step": 6570 + }, + { + "epoch": 0.6395133819951339, + "grad_norm": 1.2129154909198758, + "learning_rate": 3.0382709236253236e-06, + "loss": 0.3191, + "step": 6571 + }, + { + "epoch": 0.6396107055961071, + "grad_norm": 1.2824407022461222, + "learning_rate": 3.0368212454801747e-06, + "loss": 0.2703, + "step": 6572 + }, + { + "epoch": 0.6397080291970803, + "grad_norm": 2.2236201604685077, + "learning_rate": 3.0353717624173052e-06, + "loss": 0.2915, + "step": 6573 + }, + { + "epoch": 0.6398053527980535, + "grad_norm": 1.6125491053474876, + "learning_rate": 3.0339224745807523e-06, + "loss": 0.409, + "step": 6574 + }, + { + "epoch": 0.6399026763990268, + "grad_norm": 1.742793572213448, + "learning_rate": 3.0324733821145303e-06, + "loss": 0.4993, + "step": 6575 + }, + { + "epoch": 0.64, + "grad_norm": 1.3630390115977793, + "learning_rate": 3.0310244851626376e-06, + "loss": 0.3173, + "step": 6576 + }, + { + "epoch": 0.6400973236009733, + "grad_norm": 1.500298947108402, + "learning_rate": 3.029575783869052e-06, + "loss": 0.4725, + "step": 6577 + }, + { + "epoch": 0.6401946472019465, + "grad_norm": 1.705190193889179, + "learning_rate": 3.0281272783777343e-06, + "loss": 0.3429, + "step": 6578 + }, + { + "epoch": 0.6402919708029197, + "grad_norm": 1.448392648012749, + "learning_rate": 3.0266789688326187e-06, + "loss": 0.336, + "step": 6579 + }, + { + "epoch": 0.6403892944038929, + "grad_norm": 1.3178412429464779, + "learning_rate": 3.0252308553776264e-06, + "loss": 0.2979, + "step": 6580 + }, + { + "epoch": 0.6404866180048662, + "grad_norm": 1.3242172196190918, + "learning_rate": 3.0237829381566586e-06, + "loss": 0.2406, + "step": 6581 + }, + { + "epoch": 0.6405839416058394, + "grad_norm": 1.2949004250615508, + "learning_rate": 3.0223352173135957e-06, + "loss": 0.2948, + "step": 6582 + }, + { + "epoch": 0.6406812652068127, + "grad_norm": 1.391976289322491, + "learning_rate": 3.020887692992297e-06, + "loss": 0.2993, + "step": 6583 + }, + { + "epoch": 0.6407785888077859, + "grad_norm": 1.4327058043112526, + "learning_rate": 3.0194403653366046e-06, + "loss": 0.4787, + "step": 6584 + }, + { + "epoch": 0.6408759124087591, + "grad_norm": 1.352759211832976, + "learning_rate": 3.0179932344903406e-06, + "loss": 0.3548, + "step": 6585 + }, + { + "epoch": 0.6409732360097323, + "grad_norm": 1.4721962288695103, + "learning_rate": 3.0165463005973074e-06, + "loss": 0.3982, + "step": 6586 + }, + { + "epoch": 0.6410705596107056, + "grad_norm": 1.5071786131133138, + "learning_rate": 3.0150995638012863e-06, + "loss": 0.2905, + "step": 6587 + }, + { + "epoch": 0.6411678832116788, + "grad_norm": 1.7834773275882259, + "learning_rate": 3.0136530242460422e-06, + "loss": 0.5121, + "step": 6588 + }, + { + "epoch": 0.6412652068126521, + "grad_norm": 1.3996326305129958, + "learning_rate": 3.01220668207532e-06, + "loss": 0.3202, + "step": 6589 + }, + { + "epoch": 0.6413625304136253, + "grad_norm": 1.3538756228954372, + "learning_rate": 3.0107605374328393e-06, + "loss": 0.3381, + "step": 6590 + }, + { + "epoch": 0.6414598540145985, + "grad_norm": 1.7326099934416226, + "learning_rate": 3.0093145904623067e-06, + "loss": 0.4123, + "step": 6591 + }, + { + "epoch": 0.6415571776155717, + "grad_norm": 1.5533762247005998, + "learning_rate": 3.007868841307408e-06, + "loss": 0.2857, + "step": 6592 + }, + { + "epoch": 0.641654501216545, + "grad_norm": 1.377517562865021, + "learning_rate": 3.0064232901118064e-06, + "loss": 0.2983, + "step": 6593 + }, + { + "epoch": 0.6417518248175182, + "grad_norm": 1.3605307565764377, + "learning_rate": 3.0049779370191467e-06, + "loss": 0.3055, + "step": 6594 + }, + { + "epoch": 0.6418491484184915, + "grad_norm": 1.4541476353198792, + "learning_rate": 3.0035327821730563e-06, + "loss": 0.3364, + "step": 6595 + }, + { + "epoch": 0.6419464720194648, + "grad_norm": 1.6858716101582292, + "learning_rate": 3.0020878257171415e-06, + "loss": 0.3286, + "step": 6596 + }, + { + "epoch": 0.642043795620438, + "grad_norm": 1.2361037305309155, + "learning_rate": 3.0006430677949868e-06, + "loss": 0.2427, + "step": 6597 + }, + { + "epoch": 0.6421411192214111, + "grad_norm": 1.5622620973672636, + "learning_rate": 2.999198508550159e-06, + "loss": 0.4313, + "step": 6598 + }, + { + "epoch": 0.6422384428223844, + "grad_norm": 1.6444258486208945, + "learning_rate": 2.997754148126205e-06, + "loss": 0.472, + "step": 6599 + }, + { + "epoch": 0.6423357664233577, + "grad_norm": 1.220898031314922, + "learning_rate": 2.9963099866666543e-06, + "loss": 0.2771, + "step": 6600 + }, + { + "epoch": 0.6424330900243309, + "grad_norm": 1.3733291010343895, + "learning_rate": 2.9948660243150098e-06, + "loss": 0.3123, + "step": 6601 + }, + { + "epoch": 0.6425304136253042, + "grad_norm": 1.3566622358073268, + "learning_rate": 2.9934222612147595e-06, + "loss": 0.3548, + "step": 6602 + }, + { + "epoch": 0.6426277372262774, + "grad_norm": 1.4428606836553397, + "learning_rate": 2.9919786975093756e-06, + "loss": 0.4486, + "step": 6603 + }, + { + "epoch": 0.6427250608272506, + "grad_norm": 1.331557651126881, + "learning_rate": 2.9905353333423014e-06, + "loss": 0.3721, + "step": 6604 + }, + { + "epoch": 0.6428223844282238, + "grad_norm": 1.593785827832394, + "learning_rate": 2.989092168856965e-06, + "loss": 0.4927, + "step": 6605 + }, + { + "epoch": 0.6429197080291971, + "grad_norm": 1.536653598942224, + "learning_rate": 2.987649204196777e-06, + "loss": 0.3153, + "step": 6606 + }, + { + "epoch": 0.6430170316301703, + "grad_norm": 1.3054514036276021, + "learning_rate": 2.9862064395051248e-06, + "loss": 0.3167, + "step": 6607 + }, + { + "epoch": 0.6431143552311436, + "grad_norm": 1.757420067956881, + "learning_rate": 2.984763874925376e-06, + "loss": 0.4285, + "step": 6608 + }, + { + "epoch": 0.6432116788321168, + "grad_norm": 1.2420031332753536, + "learning_rate": 2.9833215106008794e-06, + "loss": 0.173, + "step": 6609 + }, + { + "epoch": 0.64330900243309, + "grad_norm": 1.4602255488933027, + "learning_rate": 2.981879346674965e-06, + "loss": 0.3907, + "step": 6610 + }, + { + "epoch": 0.6434063260340632, + "grad_norm": 6.231283621281225, + "learning_rate": 2.9804373832909394e-06, + "loss": 0.3185, + "step": 6611 + }, + { + "epoch": 0.6435036496350365, + "grad_norm": 1.478715809985492, + "learning_rate": 2.978995620592092e-06, + "loss": 0.4662, + "step": 6612 + }, + { + "epoch": 0.6436009732360097, + "grad_norm": 1.4846465816009955, + "learning_rate": 2.9775540587216912e-06, + "loss": 0.3263, + "step": 6613 + }, + { + "epoch": 0.643698296836983, + "grad_norm": 1.2943716131485596, + "learning_rate": 2.9761126978229895e-06, + "loss": 0.3322, + "step": 6614 + }, + { + "epoch": 0.6437956204379562, + "grad_norm": 1.3583973844127837, + "learning_rate": 2.9746715380392112e-06, + "loss": 0.4017, + "step": 6615 + }, + { + "epoch": 0.6438929440389295, + "grad_norm": 1.1103818381014245, + "learning_rate": 2.9732305795135665e-06, + "loss": 0.3148, + "step": 6616 + }, + { + "epoch": 0.6439902676399026, + "grad_norm": 1.7583755608621012, + "learning_rate": 2.971789822389245e-06, + "loss": 0.5661, + "step": 6617 + }, + { + "epoch": 0.6440875912408759, + "grad_norm": 1.4829147384015064, + "learning_rate": 2.970349266809417e-06, + "loss": 0.4069, + "step": 6618 + }, + { + "epoch": 0.6441849148418491, + "grad_norm": 1.4588836267455016, + "learning_rate": 2.9689089129172285e-06, + "loss": 0.3918, + "step": 6619 + }, + { + "epoch": 0.6442822384428224, + "grad_norm": 1.1937496964219592, + "learning_rate": 2.9674687608558096e-06, + "loss": 0.2991, + "step": 6620 + }, + { + "epoch": 0.6443795620437957, + "grad_norm": 1.8283816426425703, + "learning_rate": 2.966028810768271e-06, + "loss": 0.3497, + "step": 6621 + }, + { + "epoch": 0.6444768856447689, + "grad_norm": 1.4663545269143798, + "learning_rate": 2.9645890627976987e-06, + "loss": 0.5835, + "step": 6622 + }, + { + "epoch": 0.644574209245742, + "grad_norm": 1.4473218816822218, + "learning_rate": 2.9631495170871605e-06, + "loss": 0.4357, + "step": 6623 + }, + { + "epoch": 0.6446715328467153, + "grad_norm": 1.4348975964470179, + "learning_rate": 2.961710173779708e-06, + "loss": 0.4294, + "step": 6624 + }, + { + "epoch": 0.6447688564476886, + "grad_norm": 1.3680725284863948, + "learning_rate": 2.9602710330183706e-06, + "loss": 0.3398, + "step": 6625 + }, + { + "epoch": 0.6448661800486618, + "grad_norm": 1.5161776763951267, + "learning_rate": 2.958832094946151e-06, + "loss": 0.3968, + "step": 6626 + }, + { + "epoch": 0.6449635036496351, + "grad_norm": 1.5178720393371483, + "learning_rate": 2.957393359706042e-06, + "loss": 0.3841, + "step": 6627 + }, + { + "epoch": 0.6450608272506083, + "grad_norm": 1.4053519030224193, + "learning_rate": 2.955954827441011e-06, + "loss": 0.4372, + "step": 6628 + }, + { + "epoch": 0.6451581508515815, + "grad_norm": 1.4955618687276493, + "learning_rate": 2.9545164982940045e-06, + "loss": 0.3427, + "step": 6629 + }, + { + "epoch": 0.6452554744525547, + "grad_norm": 1.1191184156520917, + "learning_rate": 2.95307837240795e-06, + "loss": 0.2363, + "step": 6630 + }, + { + "epoch": 0.645352798053528, + "grad_norm": 1.4559198389990236, + "learning_rate": 2.9516404499257565e-06, + "loss": 0.4113, + "step": 6631 + }, + { + "epoch": 0.6454501216545012, + "grad_norm": 1.4156938279284876, + "learning_rate": 2.9502027309903125e-06, + "loss": 0.2687, + "step": 6632 + }, + { + "epoch": 0.6455474452554745, + "grad_norm": 1.546264065406372, + "learning_rate": 2.9487652157444803e-06, + "loss": 0.5267, + "step": 6633 + }, + { + "epoch": 0.6456447688564477, + "grad_norm": 1.258388837428096, + "learning_rate": 2.94732790433111e-06, + "loss": 0.3008, + "step": 6634 + }, + { + "epoch": 0.645742092457421, + "grad_norm": 1.3070462770390554, + "learning_rate": 2.9458907968930274e-06, + "loss": 0.314, + "step": 6635 + }, + { + "epoch": 0.6458394160583941, + "grad_norm": 1.342732865154092, + "learning_rate": 2.944453893573041e-06, + "loss": 0.2853, + "step": 6636 + }, + { + "epoch": 0.6459367396593674, + "grad_norm": 1.2365501650372648, + "learning_rate": 2.9430171945139325e-06, + "loss": 0.3006, + "step": 6637 + }, + { + "epoch": 0.6460340632603406, + "grad_norm": 1.4756838490224202, + "learning_rate": 2.9415806998584695e-06, + "loss": 0.2777, + "step": 6638 + }, + { + "epoch": 0.6461313868613139, + "grad_norm": 1.8617608286928278, + "learning_rate": 2.9401444097493993e-06, + "loss": 0.3576, + "step": 6639 + }, + { + "epoch": 0.6462287104622871, + "grad_norm": 1.7158037075222772, + "learning_rate": 2.9387083243294433e-06, + "loss": 0.497, + "step": 6640 + }, + { + "epoch": 0.6463260340632604, + "grad_norm": 1.3471686324159067, + "learning_rate": 2.937272443741309e-06, + "loss": 0.273, + "step": 6641 + }, + { + "epoch": 0.6464233576642335, + "grad_norm": 1.1938097614288674, + "learning_rate": 2.935836768127679e-06, + "loss": 0.2135, + "step": 6642 + }, + { + "epoch": 0.6465206812652068, + "grad_norm": 1.2363318854541447, + "learning_rate": 2.9344012976312197e-06, + "loss": 0.2831, + "step": 6643 + }, + { + "epoch": 0.64661800486618, + "grad_norm": 1.3109023952216936, + "learning_rate": 2.932966032394572e-06, + "loss": 0.333, + "step": 6644 + }, + { + "epoch": 0.6467153284671533, + "grad_norm": 1.530272814294704, + "learning_rate": 2.9315309725603596e-06, + "loss": 0.483, + "step": 6645 + }, + { + "epoch": 0.6468126520681265, + "grad_norm": 1.5315420689647377, + "learning_rate": 2.9300961182711884e-06, + "loss": 0.4346, + "step": 6646 + }, + { + "epoch": 0.6469099756690998, + "grad_norm": 1.3411164758919616, + "learning_rate": 2.9286614696696358e-06, + "loss": 0.3016, + "step": 6647 + }, + { + "epoch": 0.6470072992700729, + "grad_norm": 1.7287882961574295, + "learning_rate": 2.9272270268982663e-06, + "loss": 0.448, + "step": 6648 + }, + { + "epoch": 0.6471046228710462, + "grad_norm": 1.5086159418529348, + "learning_rate": 2.9257927900996216e-06, + "loss": 0.4965, + "step": 6649 + }, + { + "epoch": 0.6472019464720195, + "grad_norm": 1.121913171825412, + "learning_rate": 2.9243587594162226e-06, + "loss": 0.2167, + "step": 6650 + }, + { + "epoch": 0.6472992700729927, + "grad_norm": 1.8580765237396737, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.3295, + "step": 6651 + }, + { + "epoch": 0.647396593673966, + "grad_norm": 1.5977490684949451, + "learning_rate": 2.9214913169651404e-06, + "loss": 0.5074, + "step": 6652 + }, + { + "epoch": 0.6474939172749392, + "grad_norm": 1.3471648716381126, + "learning_rate": 2.920057905482398e-06, + "loss": 0.3431, + "step": 6653 + }, + { + "epoch": 0.6475912408759124, + "grad_norm": 1.6787335903021845, + "learning_rate": 2.9186247006847805e-06, + "loss": 0.336, + "step": 6654 + }, + { + "epoch": 0.6476885644768856, + "grad_norm": 1.5099697197158022, + "learning_rate": 2.917191702714705e-06, + "loss": 0.3717, + "step": 6655 + }, + { + "epoch": 0.6477858880778589, + "grad_norm": 1.493061548659645, + "learning_rate": 2.9157589117145704e-06, + "loss": 0.4942, + "step": 6656 + }, + { + "epoch": 0.6478832116788321, + "grad_norm": 1.414229999548491, + "learning_rate": 2.9143263278267555e-06, + "loss": 0.3514, + "step": 6657 + }, + { + "epoch": 0.6479805352798054, + "grad_norm": 1.6119514508142312, + "learning_rate": 2.912893951193614e-06, + "loss": 0.3723, + "step": 6658 + }, + { + "epoch": 0.6480778588807786, + "grad_norm": 1.71435984950754, + "learning_rate": 2.9114617819574824e-06, + "loss": 0.5102, + "step": 6659 + }, + { + "epoch": 0.6481751824817519, + "grad_norm": 1.5558302623525726, + "learning_rate": 2.910029820260678e-06, + "loss": 0.2798, + "step": 6660 + }, + { + "epoch": 0.648272506082725, + "grad_norm": 1.3469746068247404, + "learning_rate": 2.9085980662454964e-06, + "loss": 0.2322, + "step": 6661 + }, + { + "epoch": 0.6483698296836983, + "grad_norm": 1.401344389514034, + "learning_rate": 2.907166520054207e-06, + "loss": 0.3496, + "step": 6662 + }, + { + "epoch": 0.6484671532846715, + "grad_norm": 1.4870078579015846, + "learning_rate": 2.9057351818290687e-06, + "loss": 0.4631, + "step": 6663 + }, + { + "epoch": 0.6485644768856448, + "grad_norm": 1.957932437958498, + "learning_rate": 2.9043040517123143e-06, + "loss": 0.5221, + "step": 6664 + }, + { + "epoch": 0.648661800486618, + "grad_norm": 1.6613311559459771, + "learning_rate": 2.9028731298461533e-06, + "loss": 0.3169, + "step": 6665 + }, + { + "epoch": 0.6487591240875913, + "grad_norm": 1.5105275431309184, + "learning_rate": 2.901442416372777e-06, + "loss": 0.4091, + "step": 6666 + }, + { + "epoch": 0.6488564476885644, + "grad_norm": 1.634055070945907, + "learning_rate": 2.9000119114343584e-06, + "loss": 0.4286, + "step": 6667 + }, + { + "epoch": 0.6489537712895377, + "grad_norm": 1.3345448334510328, + "learning_rate": 2.8985816151730497e-06, + "loss": 0.4469, + "step": 6668 + }, + { + "epoch": 0.6490510948905109, + "grad_norm": 2.727388185889262, + "learning_rate": 2.897151527730974e-06, + "loss": 0.4915, + "step": 6669 + }, + { + "epoch": 0.6491484184914842, + "grad_norm": 1.3341487933996297, + "learning_rate": 2.895721649250244e-06, + "loss": 0.21, + "step": 6670 + }, + { + "epoch": 0.6492457420924574, + "grad_norm": 1.500453403612506, + "learning_rate": 2.8942919798729473e-06, + "loss": 0.3799, + "step": 6671 + }, + { + "epoch": 0.6493430656934307, + "grad_norm": 1.4647330687352165, + "learning_rate": 2.892862519741153e-06, + "loss": 0.4607, + "step": 6672 + }, + { + "epoch": 0.6494403892944038, + "grad_norm": 1.4687518667655608, + "learning_rate": 2.8914332689969014e-06, + "loss": 0.3455, + "step": 6673 + }, + { + "epoch": 0.6495377128953771, + "grad_norm": 1.3535602776894515, + "learning_rate": 2.890004227782224e-06, + "loss": 0.3697, + "step": 6674 + }, + { + "epoch": 0.6496350364963503, + "grad_norm": 1.2346898896530456, + "learning_rate": 2.888575396239125e-06, + "loss": 0.291, + "step": 6675 + }, + { + "epoch": 0.6497323600973236, + "grad_norm": 1.2776327477031653, + "learning_rate": 2.8871467745095842e-06, + "loss": 0.3244, + "step": 6676 + }, + { + "epoch": 0.6498296836982969, + "grad_norm": 1.402523358672398, + "learning_rate": 2.8857183627355677e-06, + "loss": 0.358, + "step": 6677 + }, + { + "epoch": 0.6499270072992701, + "grad_norm": 1.2448787685336862, + "learning_rate": 2.884290161059017e-06, + "loss": 0.224, + "step": 6678 + }, + { + "epoch": 0.6500243309002434, + "grad_norm": 1.4739988161959612, + "learning_rate": 2.882862169621855e-06, + "loss": 0.41, + "step": 6679 + }, + { + "epoch": 0.6501216545012165, + "grad_norm": 1.2966540964264213, + "learning_rate": 2.881434388565979e-06, + "loss": 0.3083, + "step": 6680 + }, + { + "epoch": 0.6502189781021898, + "grad_norm": 1.460840721800784, + "learning_rate": 2.8800068180332697e-06, + "loss": 0.4741, + "step": 6681 + }, + { + "epoch": 0.650316301703163, + "grad_norm": 2.158897874954919, + "learning_rate": 2.878579458165588e-06, + "loss": 0.4813, + "step": 6682 + }, + { + "epoch": 0.6504136253041363, + "grad_norm": 1.3984750071349343, + "learning_rate": 2.877152309104766e-06, + "loss": 0.3782, + "step": 6683 + }, + { + "epoch": 0.6505109489051095, + "grad_norm": 1.326237762567292, + "learning_rate": 2.8757253709926245e-06, + "loss": 0.391, + "step": 6684 + }, + { + "epoch": 0.6506082725060828, + "grad_norm": 1.5557193877972066, + "learning_rate": 2.87429864397096e-06, + "loss": 0.5827, + "step": 6685 + }, + { + "epoch": 0.6507055961070559, + "grad_norm": 1.6241143087080552, + "learning_rate": 2.8728721281815473e-06, + "loss": 0.6017, + "step": 6686 + }, + { + "epoch": 0.6508029197080292, + "grad_norm": 1.4980491031396401, + "learning_rate": 2.8714458237661363e-06, + "loss": 0.3771, + "step": 6687 + }, + { + "epoch": 0.6509002433090024, + "grad_norm": 2.3092643164099758, + "learning_rate": 2.8700197308664624e-06, + "loss": 0.5363, + "step": 6688 + }, + { + "epoch": 0.6509975669099757, + "grad_norm": 1.4739719655335488, + "learning_rate": 2.868593849624237e-06, + "loss": 0.4634, + "step": 6689 + }, + { + "epoch": 0.6510948905109489, + "grad_norm": 1.5066114000013453, + "learning_rate": 2.867168180181153e-06, + "loss": 0.3942, + "step": 6690 + }, + { + "epoch": 0.6511922141119222, + "grad_norm": 1.6017351010985514, + "learning_rate": 2.865742722678876e-06, + "loss": 0.388, + "step": 6691 + }, + { + "epoch": 0.6512895377128953, + "grad_norm": 1.400788786613074, + "learning_rate": 2.864317477259056e-06, + "loss": 0.4073, + "step": 6692 + }, + { + "epoch": 0.6513868613138686, + "grad_norm": 1.7401054497258985, + "learning_rate": 2.862892444063321e-06, + "loss": 0.2496, + "step": 6693 + }, + { + "epoch": 0.6514841849148418, + "grad_norm": 1.5923507382135946, + "learning_rate": 2.8614676232332776e-06, + "loss": 0.3246, + "step": 6694 + }, + { + "epoch": 0.6515815085158151, + "grad_norm": 1.4027517251259218, + "learning_rate": 2.8600430149105106e-06, + "loss": 0.382, + "step": 6695 + }, + { + "epoch": 0.6516788321167883, + "grad_norm": 1.6163192113468026, + "learning_rate": 2.858618619236585e-06, + "loss": 0.3401, + "step": 6696 + }, + { + "epoch": 0.6517761557177616, + "grad_norm": 1.4795033067342065, + "learning_rate": 2.8571944363530455e-06, + "loss": 0.5037, + "step": 6697 + }, + { + "epoch": 0.6518734793187347, + "grad_norm": 2.0629604865916074, + "learning_rate": 2.85577046640141e-06, + "loss": 0.371, + "step": 6698 + }, + { + "epoch": 0.651970802919708, + "grad_norm": 1.3478044069092732, + "learning_rate": 2.8543467095231803e-06, + "loss": 0.3012, + "step": 6699 + }, + { + "epoch": 0.6520681265206812, + "grad_norm": 1.4802320789398475, + "learning_rate": 2.852923165859838e-06, + "loss": 0.3371, + "step": 6700 + }, + { + "epoch": 0.6521654501216545, + "grad_norm": 1.6363690350905862, + "learning_rate": 2.8514998355528415e-06, + "loss": 0.3674, + "step": 6701 + }, + { + "epoch": 0.6522627737226278, + "grad_norm": 1.260026771624302, + "learning_rate": 2.850076718743625e-06, + "loss": 0.3232, + "step": 6702 + }, + { + "epoch": 0.652360097323601, + "grad_norm": 1.412248322337149, + "learning_rate": 2.848653815573607e-06, + "loss": 0.4153, + "step": 6703 + }, + { + "epoch": 0.6524574209245743, + "grad_norm": 1.5690732402544092, + "learning_rate": 2.847231126184181e-06, + "loss": 0.5527, + "step": 6704 + }, + { + "epoch": 0.6525547445255474, + "grad_norm": 1.527139167059845, + "learning_rate": 2.845808650716722e-06, + "loss": 0.3157, + "step": 6705 + }, + { + "epoch": 0.6526520681265207, + "grad_norm": 1.6747933701558217, + "learning_rate": 2.8443863893125813e-06, + "loss": 0.2914, + "step": 6706 + }, + { + "epoch": 0.6527493917274939, + "grad_norm": 1.5567914825053548, + "learning_rate": 2.8429643421130892e-06, + "loss": 0.3197, + "step": 6707 + }, + { + "epoch": 0.6528467153284672, + "grad_norm": 1.5472847752203414, + "learning_rate": 2.8415425092595594e-06, + "loss": 0.4646, + "step": 6708 + }, + { + "epoch": 0.6529440389294404, + "grad_norm": 1.6913478920540168, + "learning_rate": 2.840120890893274e-06, + "loss": 0.3787, + "step": 6709 + }, + { + "epoch": 0.6530413625304137, + "grad_norm": 1.5849915487984703, + "learning_rate": 2.838699487155504e-06, + "loss": 0.4798, + "step": 6710 + }, + { + "epoch": 0.6531386861313868, + "grad_norm": 1.7023533464137646, + "learning_rate": 2.8372782981874964e-06, + "loss": 0.3343, + "step": 6711 + }, + { + "epoch": 0.6532360097323601, + "grad_norm": 1.541253945375762, + "learning_rate": 2.835857324130471e-06, + "loss": 0.4786, + "step": 6712 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 1.2110189102863618, + "learning_rate": 2.8344365651256344e-06, + "loss": 0.2647, + "step": 6713 + }, + { + "epoch": 0.6534306569343066, + "grad_norm": 1.4043068918605717, + "learning_rate": 2.8330160213141664e-06, + "loss": 0.3855, + "step": 6714 + }, + { + "epoch": 0.6535279805352798, + "grad_norm": 1.4107831158669495, + "learning_rate": 2.831595692837229e-06, + "loss": 0.4442, + "step": 6715 + }, + { + "epoch": 0.6536253041362531, + "grad_norm": 1.4723933868153984, + "learning_rate": 2.83017557983596e-06, + "loss": 0.3573, + "step": 6716 + }, + { + "epoch": 0.6537226277372262, + "grad_norm": 1.7510733037840591, + "learning_rate": 2.8287556824514778e-06, + "loss": 0.3143, + "step": 6717 + }, + { + "epoch": 0.6538199513381995, + "grad_norm": 1.2229236152013945, + "learning_rate": 2.8273360008248773e-06, + "loss": 0.3149, + "step": 6718 + }, + { + "epoch": 0.6539172749391727, + "grad_norm": 1.213354737561109, + "learning_rate": 2.8259165350972367e-06, + "loss": 0.2917, + "step": 6719 + }, + { + "epoch": 0.654014598540146, + "grad_norm": 1.5777200613673565, + "learning_rate": 2.8244972854096036e-06, + "loss": 0.5767, + "step": 6720 + }, + { + "epoch": 0.6541119221411192, + "grad_norm": 1.286511480595245, + "learning_rate": 2.823078251903013e-06, + "loss": 0.2742, + "step": 6721 + }, + { + "epoch": 0.6542092457420925, + "grad_norm": 1.598419827748563, + "learning_rate": 2.8216594347184754e-06, + "loss": 0.446, + "step": 6722 + }, + { + "epoch": 0.6543065693430657, + "grad_norm": 1.1662896886943277, + "learning_rate": 2.8202408339969776e-06, + "loss": 0.3565, + "step": 6723 + }, + { + "epoch": 0.6544038929440389, + "grad_norm": 3.085003288922651, + "learning_rate": 2.818822449879488e-06, + "loss": 0.5741, + "step": 6724 + }, + { + "epoch": 0.6545012165450121, + "grad_norm": 1.2821354016498248, + "learning_rate": 2.8174042825069526e-06, + "loss": 0.3194, + "step": 6725 + }, + { + "epoch": 0.6545985401459854, + "grad_norm": 1.4086569638066282, + "learning_rate": 2.815986332020294e-06, + "loss": 0.3585, + "step": 6726 + }, + { + "epoch": 0.6546958637469587, + "grad_norm": 1.2516048619486468, + "learning_rate": 2.8145685985604164e-06, + "loss": 0.3207, + "step": 6727 + }, + { + "epoch": 0.6547931873479319, + "grad_norm": 1.6096589661966523, + "learning_rate": 2.8131510822682005e-06, + "loss": 0.3966, + "step": 6728 + }, + { + "epoch": 0.6548905109489052, + "grad_norm": 1.4450744432855225, + "learning_rate": 2.811733783284508e-06, + "loss": 0.3273, + "step": 6729 + }, + { + "epoch": 0.6549878345498783, + "grad_norm": 1.0838015290432028, + "learning_rate": 2.8103167017501725e-06, + "loss": 0.2482, + "step": 6730 + }, + { + "epoch": 0.6550851581508516, + "grad_norm": 1.4705790230518594, + "learning_rate": 2.8088998378060116e-06, + "loss": 0.3232, + "step": 6731 + }, + { + "epoch": 0.6551824817518248, + "grad_norm": 1.8547998125992238, + "learning_rate": 2.8074831915928213e-06, + "loss": 0.4581, + "step": 6732 + }, + { + "epoch": 0.6552798053527981, + "grad_norm": 1.512042028276952, + "learning_rate": 2.806066763251376e-06, + "loss": 0.2257, + "step": 6733 + }, + { + "epoch": 0.6553771289537713, + "grad_norm": 1.5493321025737354, + "learning_rate": 2.804650552922422e-06, + "loss": 0.4022, + "step": 6734 + }, + { + "epoch": 0.6554744525547446, + "grad_norm": 1.3866403906805511, + "learning_rate": 2.8032345607466927e-06, + "loss": 0.3454, + "step": 6735 + }, + { + "epoch": 0.6555717761557177, + "grad_norm": 1.5579323189741427, + "learning_rate": 2.801818786864895e-06, + "loss": 0.3515, + "step": 6736 + }, + { + "epoch": 0.655669099756691, + "grad_norm": 1.497690568748641, + "learning_rate": 2.8004032314177154e-06, + "loss": 0.2639, + "step": 6737 + }, + { + "epoch": 0.6557664233576642, + "grad_norm": 1.5483077378855565, + "learning_rate": 2.7989878945458193e-06, + "loss": 0.6582, + "step": 6738 + }, + { + "epoch": 0.6558637469586375, + "grad_norm": 1.4877237392102052, + "learning_rate": 2.7975727763898486e-06, + "loss": 0.1536, + "step": 6739 + }, + { + "epoch": 0.6559610705596107, + "grad_norm": 1.336065542766245, + "learning_rate": 2.7961578770904263e-06, + "loss": 0.3445, + "step": 6740 + }, + { + "epoch": 0.656058394160584, + "grad_norm": 1.4297878876342072, + "learning_rate": 2.794743196788149e-06, + "loss": 0.377, + "step": 6741 + }, + { + "epoch": 0.6561557177615571, + "grad_norm": 1.7819741099025646, + "learning_rate": 2.7933287356235956e-06, + "loss": 0.5225, + "step": 6742 + }, + { + "epoch": 0.6562530413625304, + "grad_norm": 1.3255852144665683, + "learning_rate": 2.791914493737322e-06, + "loss": 0.3054, + "step": 6743 + }, + { + "epoch": 0.6563503649635036, + "grad_norm": 1.969683108136502, + "learning_rate": 2.7905004712698646e-06, + "loss": 0.3802, + "step": 6744 + }, + { + "epoch": 0.6564476885644769, + "grad_norm": 1.4518428601547329, + "learning_rate": 2.7890866683617314e-06, + "loss": 0.443, + "step": 6745 + }, + { + "epoch": 0.6565450121654501, + "grad_norm": 1.7057052804764412, + "learning_rate": 2.787673085153414e-06, + "loss": 0.5134, + "step": 6746 + }, + { + "epoch": 0.6566423357664234, + "grad_norm": 1.3307615252577463, + "learning_rate": 2.7862597217853827e-06, + "loss": 0.284, + "step": 6747 + }, + { + "epoch": 0.6567396593673966, + "grad_norm": 1.5300810887159755, + "learning_rate": 2.7848465783980837e-06, + "loss": 0.2296, + "step": 6748 + }, + { + "epoch": 0.6568369829683698, + "grad_norm": 1.585924270649872, + "learning_rate": 2.783433655131941e-06, + "loss": 0.4173, + "step": 6749 + }, + { + "epoch": 0.656934306569343, + "grad_norm": 1.6458852091794416, + "learning_rate": 2.782020952127359e-06, + "loss": 0.6476, + "step": 6750 + }, + { + "epoch": 0.6570316301703163, + "grad_norm": 1.2530117944587003, + "learning_rate": 2.78060846952472e-06, + "loss": 0.3907, + "step": 6751 + }, + { + "epoch": 0.6571289537712895, + "grad_norm": 1.451593473861585, + "learning_rate": 2.77919620746438e-06, + "loss": 0.4913, + "step": 6752 + }, + { + "epoch": 0.6572262773722628, + "grad_norm": 1.6784954010346753, + "learning_rate": 2.7777841660866776e-06, + "loss": 0.4332, + "step": 6753 + }, + { + "epoch": 0.657323600973236, + "grad_norm": 1.7320311921859133, + "learning_rate": 2.7763723455319284e-06, + "loss": 0.4713, + "step": 6754 + }, + { + "epoch": 0.6574209245742092, + "grad_norm": 1.784547063277984, + "learning_rate": 2.774960745940428e-06, + "loss": 0.3729, + "step": 6755 + }, + { + "epoch": 0.6575182481751825, + "grad_norm": 1.289910055174179, + "learning_rate": 2.7735493674524437e-06, + "loss": 0.2325, + "step": 6756 + }, + { + "epoch": 0.6576155717761557, + "grad_norm": 1.3923811378634814, + "learning_rate": 2.772138210208228e-06, + "loss": 0.4136, + "step": 6757 + }, + { + "epoch": 0.657712895377129, + "grad_norm": 1.5675438914017519, + "learning_rate": 2.7707272743480073e-06, + "loss": 0.4787, + "step": 6758 + }, + { + "epoch": 0.6578102189781022, + "grad_norm": 2.4454474366224277, + "learning_rate": 2.7693165600119875e-06, + "loss": 0.3791, + "step": 6759 + }, + { + "epoch": 0.6579075425790755, + "grad_norm": 1.4853382398420436, + "learning_rate": 2.7679060673403517e-06, + "loss": 0.4117, + "step": 6760 + }, + { + "epoch": 0.6580048661800486, + "grad_norm": 1.6557659665466375, + "learning_rate": 2.7664957964732624e-06, + "loss": 0.5487, + "step": 6761 + }, + { + "epoch": 0.6581021897810219, + "grad_norm": 1.512419230298557, + "learning_rate": 2.7650857475508608e-06, + "loss": 0.4162, + "step": 6762 + }, + { + "epoch": 0.6581995133819951, + "grad_norm": 1.408478729748277, + "learning_rate": 2.76367592071326e-06, + "loss": 0.2971, + "step": 6763 + }, + { + "epoch": 0.6582968369829684, + "grad_norm": 1.3898916132267092, + "learning_rate": 2.7622663161005576e-06, + "loss": 0.2689, + "step": 6764 + }, + { + "epoch": 0.6583941605839416, + "grad_norm": 1.8935341639633638, + "learning_rate": 2.7608569338528284e-06, + "loss": 0.6053, + "step": 6765 + }, + { + "epoch": 0.6584914841849149, + "grad_norm": 1.4152439303474345, + "learning_rate": 2.75944777411012e-06, + "loss": 0.2361, + "step": 6766 + }, + { + "epoch": 0.6585888077858881, + "grad_norm": 1.3655151602351514, + "learning_rate": 2.7580388370124644e-06, + "loss": 0.3103, + "step": 6767 + }, + { + "epoch": 0.6586861313868613, + "grad_norm": 1.4381378251506454, + "learning_rate": 2.7566301226998667e-06, + "loss": 0.4752, + "step": 6768 + }, + { + "epoch": 0.6587834549878345, + "grad_norm": 1.4347834419039491, + "learning_rate": 2.7552216313123126e-06, + "loss": 0.3787, + "step": 6769 + }, + { + "epoch": 0.6588807785888078, + "grad_norm": 1.485348512745472, + "learning_rate": 2.753813362989765e-06, + "loss": 0.531, + "step": 6770 + }, + { + "epoch": 0.658978102189781, + "grad_norm": 1.5954596381561879, + "learning_rate": 2.7524053178721642e-06, + "loss": 0.3961, + "step": 6771 + }, + { + "epoch": 0.6590754257907543, + "grad_norm": 1.6653211248442563, + "learning_rate": 2.750997496099428e-06, + "loss": 0.4432, + "step": 6772 + }, + { + "epoch": 0.6591727493917275, + "grad_norm": 1.9895189551145296, + "learning_rate": 2.7495898978114554e-06, + "loss": 0.3427, + "step": 6773 + }, + { + "epoch": 0.6592700729927007, + "grad_norm": 1.395584392648721, + "learning_rate": 2.7481825231481156e-06, + "loss": 0.2998, + "step": 6774 + }, + { + "epoch": 0.6593673965936739, + "grad_norm": 1.4257930389334967, + "learning_rate": 2.746775372249263e-06, + "loss": 0.2927, + "step": 6775 + }, + { + "epoch": 0.6594647201946472, + "grad_norm": 1.2339528484610085, + "learning_rate": 2.745368445254728e-06, + "loss": 0.2964, + "step": 6776 + }, + { + "epoch": 0.6595620437956204, + "grad_norm": 1.792398199917605, + "learning_rate": 2.7439617423043146e-06, + "loss": 0.5683, + "step": 6777 + }, + { + "epoch": 0.6596593673965937, + "grad_norm": 1.2875917309424718, + "learning_rate": 2.7425552635378094e-06, + "loss": 0.3235, + "step": 6778 + }, + { + "epoch": 0.659756690997567, + "grad_norm": 1.3979402222711956, + "learning_rate": 2.7411490090949754e-06, + "loss": 0.3291, + "step": 6779 + }, + { + "epoch": 0.6598540145985401, + "grad_norm": 1.203183259420406, + "learning_rate": 2.7397429791155526e-06, + "loss": 0.2845, + "step": 6780 + }, + { + "epoch": 0.6599513381995133, + "grad_norm": 1.535820193773522, + "learning_rate": 2.73833717373926e-06, + "loss": 0.6247, + "step": 6781 + }, + { + "epoch": 0.6600486618004866, + "grad_norm": 1.3880646527538296, + "learning_rate": 2.7369315931057916e-06, + "loss": 0.3147, + "step": 6782 + }, + { + "epoch": 0.6601459854014599, + "grad_norm": 1.317367338707539, + "learning_rate": 2.7355262373548243e-06, + "loss": 0.3695, + "step": 6783 + }, + { + "epoch": 0.6602433090024331, + "grad_norm": 1.8179251934058178, + "learning_rate": 2.7341211066260047e-06, + "loss": 0.3467, + "step": 6784 + }, + { + "epoch": 0.6603406326034064, + "grad_norm": 1.4156028830040523, + "learning_rate": 2.7327162010589636e-06, + "loss": 0.395, + "step": 6785 + }, + { + "epoch": 0.6604379562043796, + "grad_norm": 1.5206793441500446, + "learning_rate": 2.7313115207933068e-06, + "loss": 0.4647, + "step": 6786 + }, + { + "epoch": 0.6605352798053528, + "grad_norm": 1.4097349430422725, + "learning_rate": 2.7299070659686207e-06, + "loss": 0.3793, + "step": 6787 + }, + { + "epoch": 0.660632603406326, + "grad_norm": 1.3418131331745644, + "learning_rate": 2.7285028367244625e-06, + "loss": 0.2305, + "step": 6788 + }, + { + "epoch": 0.6607299270072993, + "grad_norm": 1.3267575573257613, + "learning_rate": 2.727098833200374e-06, + "loss": 0.3616, + "step": 6789 + }, + { + "epoch": 0.6608272506082725, + "grad_norm": 1.1981549343932583, + "learning_rate": 2.725695055535871e-06, + "loss": 0.2421, + "step": 6790 + }, + { + "epoch": 0.6609245742092458, + "grad_norm": 1.4727479617756, + "learning_rate": 2.724291503870449e-06, + "loss": 0.5035, + "step": 6791 + }, + { + "epoch": 0.661021897810219, + "grad_norm": 1.4243252779763125, + "learning_rate": 2.7228881783435785e-06, + "loss": 0.5362, + "step": 6792 + }, + { + "epoch": 0.6611192214111922, + "grad_norm": 1.2399776162926734, + "learning_rate": 2.7214850790947088e-06, + "loss": 0.2938, + "step": 6793 + }, + { + "epoch": 0.6612165450121654, + "grad_norm": 1.6318296549942648, + "learning_rate": 2.72008220626327e-06, + "loss": 0.1549, + "step": 6794 + }, + { + "epoch": 0.6613138686131387, + "grad_norm": 1.3501742956886726, + "learning_rate": 2.718679559988662e-06, + "loss": 0.3351, + "step": 6795 + }, + { + "epoch": 0.6614111922141119, + "grad_norm": 1.330888901571354, + "learning_rate": 2.7172771404102683e-06, + "loss": 0.3373, + "step": 6796 + }, + { + "epoch": 0.6615085158150852, + "grad_norm": 3.9666140357834805, + "learning_rate": 2.715874947667447e-06, + "loss": 0.316, + "step": 6797 + }, + { + "epoch": 0.6616058394160584, + "grad_norm": 1.5280511881158292, + "learning_rate": 2.71447298189954e-06, + "loss": 0.2387, + "step": 6798 + }, + { + "epoch": 0.6617031630170316, + "grad_norm": 1.242125983484383, + "learning_rate": 2.7130712432458537e-06, + "loss": 0.1557, + "step": 6799 + }, + { + "epoch": 0.6618004866180048, + "grad_norm": 1.8552117420688552, + "learning_rate": 2.7116697318456847e-06, + "loss": 0.2278, + "step": 6800 + }, + { + "epoch": 0.6618978102189781, + "grad_norm": 1.4872710236053985, + "learning_rate": 2.7102684478383006e-06, + "loss": 0.373, + "step": 6801 + }, + { + "epoch": 0.6619951338199513, + "grad_norm": 1.6932412428607622, + "learning_rate": 2.708867391362948e-06, + "loss": 0.4548, + "step": 6802 + }, + { + "epoch": 0.6620924574209246, + "grad_norm": 1.4356740051860206, + "learning_rate": 2.7074665625588515e-06, + "loss": 0.2871, + "step": 6803 + }, + { + "epoch": 0.6621897810218978, + "grad_norm": 1.652451531973401, + "learning_rate": 2.706065961565212e-06, + "loss": 0.3971, + "step": 6804 + }, + { + "epoch": 0.662287104622871, + "grad_norm": 1.486040887134083, + "learning_rate": 2.7046655885212093e-06, + "loss": 0.2709, + "step": 6805 + }, + { + "epoch": 0.6623844282238442, + "grad_norm": 1.4917475666009035, + "learning_rate": 2.703265443565996e-06, + "loss": 0.4575, + "step": 6806 + }, + { + "epoch": 0.6624817518248175, + "grad_norm": 1.4235482065006015, + "learning_rate": 2.7018655268387075e-06, + "loss": 0.3679, + "step": 6807 + }, + { + "epoch": 0.6625790754257908, + "grad_norm": 1.2560006591679733, + "learning_rate": 2.700465838478454e-06, + "loss": 0.2515, + "step": 6808 + }, + { + "epoch": 0.662676399026764, + "grad_norm": 1.497765784519021, + "learning_rate": 2.6990663786243255e-06, + "loss": 0.6173, + "step": 6809 + }, + { + "epoch": 0.6627737226277373, + "grad_norm": 1.3066855110703985, + "learning_rate": 2.697667147415383e-06, + "loss": 0.2614, + "step": 6810 + }, + { + "epoch": 0.6628710462287105, + "grad_norm": 1.5807901671285265, + "learning_rate": 2.696268144990669e-06, + "loss": 0.2332, + "step": 6811 + }, + { + "epoch": 0.6629683698296837, + "grad_norm": 1.4737498626838341, + "learning_rate": 2.6948693714892104e-06, + "loss": 0.345, + "step": 6812 + }, + { + "epoch": 0.6630656934306569, + "grad_norm": 1.656285051658688, + "learning_rate": 2.6934708270499964e-06, + "loss": 0.3504, + "step": 6813 + }, + { + "epoch": 0.6631630170316302, + "grad_norm": 1.76532033806788, + "learning_rate": 2.692072511812004e-06, + "loss": 0.5332, + "step": 6814 + }, + { + "epoch": 0.6632603406326034, + "grad_norm": 1.1082327966815568, + "learning_rate": 2.6906744259141847e-06, + "loss": 0.2255, + "step": 6815 + }, + { + "epoch": 0.6633576642335767, + "grad_norm": 1.5337460473840547, + "learning_rate": 2.6892765694954696e-06, + "loss": 0.4916, + "step": 6816 + }, + { + "epoch": 0.6634549878345499, + "grad_norm": 1.352395434964838, + "learning_rate": 2.68787894269476e-06, + "loss": 0.2645, + "step": 6817 + }, + { + "epoch": 0.6635523114355231, + "grad_norm": 1.4003880712254435, + "learning_rate": 2.686481545650941e-06, + "loss": 0.3401, + "step": 6818 + }, + { + "epoch": 0.6636496350364963, + "grad_norm": 1.6332773523403699, + "learning_rate": 2.6850843785028748e-06, + "loss": 0.4248, + "step": 6819 + }, + { + "epoch": 0.6637469586374696, + "grad_norm": 1.6862710787146364, + "learning_rate": 2.6836874413893945e-06, + "loss": 0.5409, + "step": 6820 + }, + { + "epoch": 0.6638442822384428, + "grad_norm": 1.2001202205200827, + "learning_rate": 2.6822907344493143e-06, + "loss": 0.1653, + "step": 6821 + }, + { + "epoch": 0.6639416058394161, + "grad_norm": 1.723918979652751, + "learning_rate": 2.6808942578214312e-06, + "loss": 0.4331, + "step": 6822 + }, + { + "epoch": 0.6640389294403893, + "grad_norm": 1.5146121544000053, + "learning_rate": 2.6794980116445133e-06, + "loss": 0.4546, + "step": 6823 + }, + { + "epoch": 0.6641362530413625, + "grad_norm": 1.302752950784683, + "learning_rate": 2.6781019960573016e-06, + "loss": 0.2941, + "step": 6824 + }, + { + "epoch": 0.6642335766423357, + "grad_norm": 1.724459372679229, + "learning_rate": 2.676706211198522e-06, + "loss": 0.3436, + "step": 6825 + }, + { + "epoch": 0.664330900243309, + "grad_norm": 1.3415579548422654, + "learning_rate": 2.6753106572068743e-06, + "loss": 0.4702, + "step": 6826 + }, + { + "epoch": 0.6644282238442822, + "grad_norm": 1.3351960829701917, + "learning_rate": 2.6739153342210378e-06, + "loss": 0.3489, + "step": 6827 + }, + { + "epoch": 0.6645255474452555, + "grad_norm": 1.3789849191329269, + "learning_rate": 2.6725202423796615e-06, + "loss": 0.3317, + "step": 6828 + }, + { + "epoch": 0.6646228710462287, + "grad_norm": 1.7664426412634113, + "learning_rate": 2.67112538182138e-06, + "loss": 0.543, + "step": 6829 + }, + { + "epoch": 0.664720194647202, + "grad_norm": 1.4864940575751309, + "learning_rate": 2.6697307526848026e-06, + "loss": 0.4393, + "step": 6830 + }, + { + "epoch": 0.6648175182481751, + "grad_norm": 1.5739230879298867, + "learning_rate": 2.6683363551085085e-06, + "loss": 0.3464, + "step": 6831 + }, + { + "epoch": 0.6649148418491484, + "grad_norm": 1.2218897552770067, + "learning_rate": 2.6669421892310654e-06, + "loss": 0.2979, + "step": 6832 + }, + { + "epoch": 0.6650121654501217, + "grad_norm": 1.4774874257748367, + "learning_rate": 2.665548255191012e-06, + "loss": 0.3952, + "step": 6833 + }, + { + "epoch": 0.6651094890510949, + "grad_norm": 1.4018603158386684, + "learning_rate": 2.6641545531268644e-06, + "loss": 0.535, + "step": 6834 + }, + { + "epoch": 0.6652068126520682, + "grad_norm": 1.1243934055622178, + "learning_rate": 2.6627610831771134e-06, + "loss": 0.2501, + "step": 6835 + }, + { + "epoch": 0.6653041362530414, + "grad_norm": 1.512693499963609, + "learning_rate": 2.6613678454802293e-06, + "loss": 0.206, + "step": 6836 + }, + { + "epoch": 0.6654014598540146, + "grad_norm": 1.435277796848393, + "learning_rate": 2.6599748401746605e-06, + "loss": 0.2615, + "step": 6837 + }, + { + "epoch": 0.6654987834549878, + "grad_norm": 1.6561965213994767, + "learning_rate": 2.6585820673988315e-06, + "loss": 0.4856, + "step": 6838 + }, + { + "epoch": 0.6655961070559611, + "grad_norm": 1.4960382117577842, + "learning_rate": 2.65718952729114e-06, + "loss": 0.3522, + "step": 6839 + }, + { + "epoch": 0.6656934306569343, + "grad_norm": 1.4027075335516728, + "learning_rate": 2.655797219989965e-06, + "loss": 0.3226, + "step": 6840 + }, + { + "epoch": 0.6657907542579076, + "grad_norm": 1.2693321189230098, + "learning_rate": 2.654405145633664e-06, + "loss": 0.497, + "step": 6841 + }, + { + "epoch": 0.6658880778588808, + "grad_norm": 1.6336499854414535, + "learning_rate": 2.6530133043605606e-06, + "loss": 0.4969, + "step": 6842 + }, + { + "epoch": 0.665985401459854, + "grad_norm": 1.7307854700549319, + "learning_rate": 2.6516216963089698e-06, + "loss": 0.5778, + "step": 6843 + }, + { + "epoch": 0.6660827250608272, + "grad_norm": 1.2561468622941703, + "learning_rate": 2.6502303216171743e-06, + "loss": 0.3605, + "step": 6844 + }, + { + "epoch": 0.6661800486618005, + "grad_norm": 1.3090419627982983, + "learning_rate": 2.6488391804234383e-06, + "loss": 0.368, + "step": 6845 + }, + { + "epoch": 0.6662773722627737, + "grad_norm": 1.3356484440632348, + "learning_rate": 2.6474482728659955e-06, + "loss": 0.3893, + "step": 6846 + }, + { + "epoch": 0.666374695863747, + "grad_norm": 1.4774243297014897, + "learning_rate": 2.646057599083065e-06, + "loss": 0.491, + "step": 6847 + }, + { + "epoch": 0.6664720194647202, + "grad_norm": 1.4784459610404423, + "learning_rate": 2.6446671592128385e-06, + "loss": 0.2695, + "step": 6848 + }, + { + "epoch": 0.6665693430656934, + "grad_norm": 1.723875760255084, + "learning_rate": 2.643276953393483e-06, + "loss": 0.5801, + "step": 6849 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.4119887346623394, + "learning_rate": 2.6418869817631442e-06, + "loss": 0.2102, + "step": 6850 + }, + { + "epoch": 0.6667639902676399, + "grad_norm": 1.8746959439745785, + "learning_rate": 2.6404972444599462e-06, + "loss": 0.482, + "step": 6851 + }, + { + "epoch": 0.6668613138686131, + "grad_norm": 1.4863633340883518, + "learning_rate": 2.639107741621987e-06, + "loss": 0.2951, + "step": 6852 + }, + { + "epoch": 0.6669586374695864, + "grad_norm": 1.3542596002625418, + "learning_rate": 2.637718473387343e-06, + "loss": 0.3737, + "step": 6853 + }, + { + "epoch": 0.6670559610705596, + "grad_norm": 1.4410889645922729, + "learning_rate": 2.6363294398940664e-06, + "loss": 0.3663, + "step": 6854 + }, + { + "epoch": 0.6671532846715329, + "grad_norm": 1.6213355929590048, + "learning_rate": 2.6349406412801857e-06, + "loss": 0.5067, + "step": 6855 + }, + { + "epoch": 0.667250608272506, + "grad_norm": 1.259669147907595, + "learning_rate": 2.633552077683709e-06, + "loss": 0.2239, + "step": 6856 + }, + { + "epoch": 0.6673479318734793, + "grad_norm": 1.6313138956585955, + "learning_rate": 2.6321637492426157e-06, + "loss": 0.327, + "step": 6857 + }, + { + "epoch": 0.6674452554744525, + "grad_norm": 1.5718047230597259, + "learning_rate": 2.630775656094865e-06, + "loss": 0.4699, + "step": 6858 + }, + { + "epoch": 0.6675425790754258, + "grad_norm": 1.6635215488938018, + "learning_rate": 2.6293877983783965e-06, + "loss": 0.4299, + "step": 6859 + }, + { + "epoch": 0.667639902676399, + "grad_norm": 1.725007751005146, + "learning_rate": 2.628000176231117e-06, + "loss": 0.4024, + "step": 6860 + }, + { + "epoch": 0.6677372262773723, + "grad_norm": 1.5256270104582157, + "learning_rate": 2.6266127897909175e-06, + "loss": 0.2548, + "step": 6861 + }, + { + "epoch": 0.6678345498783455, + "grad_norm": 1.4672932807075405, + "learning_rate": 2.625225639195665e-06, + "loss": 0.2651, + "step": 6862 + }, + { + "epoch": 0.6679318734793187, + "grad_norm": 1.3575922797295445, + "learning_rate": 2.6238387245831996e-06, + "loss": 0.3917, + "step": 6863 + }, + { + "epoch": 0.668029197080292, + "grad_norm": 1.456159315437296, + "learning_rate": 2.6224520460913413e-06, + "loss": 0.3906, + "step": 6864 + }, + { + "epoch": 0.6681265206812652, + "grad_norm": 1.703706554550206, + "learning_rate": 2.621065603857884e-06, + "loss": 0.5739, + "step": 6865 + }, + { + "epoch": 0.6682238442822385, + "grad_norm": 1.3691555136661493, + "learning_rate": 2.619679398020602e-06, + "loss": 0.4599, + "step": 6866 + }, + { + "epoch": 0.6683211678832117, + "grad_norm": 1.17929550630756, + "learning_rate": 2.618293428717239e-06, + "loss": 0.2635, + "step": 6867 + }, + { + "epoch": 0.6684184914841849, + "grad_norm": 1.834881223009904, + "learning_rate": 2.6169076960855222e-06, + "loss": 0.5044, + "step": 6868 + }, + { + "epoch": 0.6685158150851581, + "grad_norm": 1.5147877376405583, + "learning_rate": 2.6155222002631526e-06, + "loss": 0.4497, + "step": 6869 + }, + { + "epoch": 0.6686131386861314, + "grad_norm": 1.6463491219098636, + "learning_rate": 2.614136941387809e-06, + "loss": 0.3537, + "step": 6870 + }, + { + "epoch": 0.6687104622871046, + "grad_norm": 1.4510859467851225, + "learning_rate": 2.612751919597143e-06, + "loss": 0.4066, + "step": 6871 + }, + { + "epoch": 0.6688077858880779, + "grad_norm": 1.6114800029038248, + "learning_rate": 2.611367135028785e-06, + "loss": 0.3816, + "step": 6872 + }, + { + "epoch": 0.6689051094890511, + "grad_norm": 1.6799964798027578, + "learning_rate": 2.6099825878203434e-06, + "loss": 0.4222, + "step": 6873 + }, + { + "epoch": 0.6690024330900244, + "grad_norm": 1.4065643806843282, + "learning_rate": 2.608598278109401e-06, + "loss": 0.3696, + "step": 6874 + }, + { + "epoch": 0.6690997566909975, + "grad_norm": 1.5204570862253417, + "learning_rate": 2.607214206033518e-06, + "loss": 0.4105, + "step": 6875 + }, + { + "epoch": 0.6691970802919708, + "grad_norm": 1.6410568395577223, + "learning_rate": 2.605830371730229e-06, + "loss": 0.3413, + "step": 6876 + }, + { + "epoch": 0.669294403892944, + "grad_norm": 1.6532507316787326, + "learning_rate": 2.6044467753370505e-06, + "loss": 0.2001, + "step": 6877 + }, + { + "epoch": 0.6693917274939173, + "grad_norm": 1.5092478033141117, + "learning_rate": 2.603063416991466e-06, + "loss": 0.2827, + "step": 6878 + }, + { + "epoch": 0.6694890510948905, + "grad_norm": 1.546751306385594, + "learning_rate": 2.601680296830943e-06, + "loss": 0.3002, + "step": 6879 + }, + { + "epoch": 0.6695863746958638, + "grad_norm": 1.6644752241349323, + "learning_rate": 2.6002974149929234e-06, + "loss": 0.5049, + "step": 6880 + }, + { + "epoch": 0.6696836982968369, + "grad_norm": 1.475128123004427, + "learning_rate": 2.5989147716148266e-06, + "loss": 0.3427, + "step": 6881 + }, + { + "epoch": 0.6697810218978102, + "grad_norm": 1.5064062963413896, + "learning_rate": 2.5975323668340424e-06, + "loss": 0.512, + "step": 6882 + }, + { + "epoch": 0.6698783454987834, + "grad_norm": 1.435389918874925, + "learning_rate": 2.5961502007879435e-06, + "loss": 0.2699, + "step": 6883 + }, + { + "epoch": 0.6699756690997567, + "grad_norm": 1.455527995158438, + "learning_rate": 2.5947682736138767e-06, + "loss": 0.5053, + "step": 6884 + }, + { + "epoch": 0.67007299270073, + "grad_norm": 1.3756279811800831, + "learning_rate": 2.593386585449166e-06, + "loss": 0.4458, + "step": 6885 + }, + { + "epoch": 0.6701703163017032, + "grad_norm": 1.4437589801987945, + "learning_rate": 2.5920051364311083e-06, + "loss": 0.2007, + "step": 6886 + }, + { + "epoch": 0.6702676399026763, + "grad_norm": 1.2319789761693942, + "learning_rate": 2.5906239266969806e-06, + "loss": 0.2167, + "step": 6887 + }, + { + "epoch": 0.6703649635036496, + "grad_norm": 1.500079947566321, + "learning_rate": 2.589242956384036e-06, + "loss": 0.4666, + "step": 6888 + }, + { + "epoch": 0.6704622871046229, + "grad_norm": 1.3702725725435745, + "learning_rate": 2.5878622256294995e-06, + "loss": 0.3863, + "step": 6889 + }, + { + "epoch": 0.6705596107055961, + "grad_norm": 1.3110456583465535, + "learning_rate": 2.586481734570575e-06, + "loss": 0.2975, + "step": 6890 + }, + { + "epoch": 0.6706569343065694, + "grad_norm": 1.2784160814190484, + "learning_rate": 2.5851014833444447e-06, + "loss": 0.2023, + "step": 6891 + }, + { + "epoch": 0.6707542579075426, + "grad_norm": 1.2698345115553682, + "learning_rate": 2.5837214720882662e-06, + "loss": 0.2125, + "step": 6892 + }, + { + "epoch": 0.6708515815085158, + "grad_norm": 1.5916352546881036, + "learning_rate": 2.5823417009391684e-06, + "loss": 0.4794, + "step": 6893 + }, + { + "epoch": 0.670948905109489, + "grad_norm": 1.3308019251133874, + "learning_rate": 2.5809621700342614e-06, + "loss": 0.2422, + "step": 6894 + }, + { + "epoch": 0.6710462287104623, + "grad_norm": 1.6001626945271217, + "learning_rate": 2.5795828795106305e-06, + "loss": 0.6117, + "step": 6895 + }, + { + "epoch": 0.6711435523114355, + "grad_norm": 1.5937722896438413, + "learning_rate": 2.578203829505337e-06, + "loss": 0.2808, + "step": 6896 + }, + { + "epoch": 0.6712408759124088, + "grad_norm": 1.6413766501516232, + "learning_rate": 2.5768250201554167e-06, + "loss": 0.5904, + "step": 6897 + }, + { + "epoch": 0.671338199513382, + "grad_norm": 1.286238472667233, + "learning_rate": 2.5754464515978845e-06, + "loss": 0.1833, + "step": 6898 + }, + { + "epoch": 0.6714355231143553, + "grad_norm": 1.7388750412253844, + "learning_rate": 2.57406812396973e-06, + "loss": 0.4257, + "step": 6899 + }, + { + "epoch": 0.6715328467153284, + "grad_norm": 1.4820959162542136, + "learning_rate": 2.5726900374079155e-06, + "loss": 0.2638, + "step": 6900 + }, + { + "epoch": 0.6716301703163017, + "grad_norm": 1.3416956345568392, + "learning_rate": 2.5713121920493833e-06, + "loss": 0.3294, + "step": 6901 + }, + { + "epoch": 0.6717274939172749, + "grad_norm": 1.2547849624423444, + "learning_rate": 2.5699345880310546e-06, + "loss": 0.3069, + "step": 6902 + }, + { + "epoch": 0.6718248175182482, + "grad_norm": 1.4256756469078642, + "learning_rate": 2.5685572254898163e-06, + "loss": 0.4298, + "step": 6903 + }, + { + "epoch": 0.6719221411192214, + "grad_norm": 1.553886168720797, + "learning_rate": 2.5671801045625413e-06, + "loss": 0.1912, + "step": 6904 + }, + { + "epoch": 0.6720194647201947, + "grad_norm": 1.4413122958031002, + "learning_rate": 2.565803225386075e-06, + "loss": 0.3262, + "step": 6905 + }, + { + "epoch": 0.6721167883211678, + "grad_norm": 1.6352013448228178, + "learning_rate": 2.564426588097238e-06, + "loss": 0.3957, + "step": 6906 + }, + { + "epoch": 0.6722141119221411, + "grad_norm": 1.4056108505737754, + "learning_rate": 2.5630501928328276e-06, + "loss": 0.3701, + "step": 6907 + }, + { + "epoch": 0.6723114355231143, + "grad_norm": 1.3570337852230339, + "learning_rate": 2.5616740397296184e-06, + "loss": 0.2851, + "step": 6908 + }, + { + "epoch": 0.6724087591240876, + "grad_norm": 1.7010645170092262, + "learning_rate": 2.560298128924358e-06, + "loss": 0.6163, + "step": 6909 + }, + { + "epoch": 0.6725060827250608, + "grad_norm": 1.697343163864623, + "learning_rate": 2.5589224605537744e-06, + "loss": 0.3209, + "step": 6910 + }, + { + "epoch": 0.6726034063260341, + "grad_norm": 1.2657390590182054, + "learning_rate": 2.557547034754564e-06, + "loss": 0.3305, + "step": 6911 + }, + { + "epoch": 0.6727007299270072, + "grad_norm": 1.7582771195861744, + "learning_rate": 2.5561718516634058e-06, + "loss": 0.4296, + "step": 6912 + }, + { + "epoch": 0.6727980535279805, + "grad_norm": 1.3308413384247961, + "learning_rate": 2.5547969114169554e-06, + "loss": 0.3097, + "step": 6913 + }, + { + "epoch": 0.6728953771289538, + "grad_norm": 1.4967437208740688, + "learning_rate": 2.553422214151836e-06, + "loss": 0.398, + "step": 6914 + }, + { + "epoch": 0.672992700729927, + "grad_norm": 1.7208607284298814, + "learning_rate": 2.5520477600046556e-06, + "loss": 0.224, + "step": 6915 + }, + { + "epoch": 0.6730900243309003, + "grad_norm": 1.6043783601399488, + "learning_rate": 2.550673549111994e-06, + "loss": 0.3508, + "step": 6916 + }, + { + "epoch": 0.6731873479318735, + "grad_norm": 1.5019913806185416, + "learning_rate": 2.549299581610407e-06, + "loss": 0.2414, + "step": 6917 + }, + { + "epoch": 0.6732846715328468, + "grad_norm": 1.3128864082829395, + "learning_rate": 2.5479258576364274e-06, + "loss": 0.4193, + "step": 6918 + }, + { + "epoch": 0.6733819951338199, + "grad_norm": 1.4228482112177416, + "learning_rate": 2.546552377326562e-06, + "loss": 0.3994, + "step": 6919 + }, + { + "epoch": 0.6734793187347932, + "grad_norm": 1.5312201049377667, + "learning_rate": 2.545179140817297e-06, + "loss": 0.5103, + "step": 6920 + }, + { + "epoch": 0.6735766423357664, + "grad_norm": 1.5038578174273096, + "learning_rate": 2.5438061482450877e-06, + "loss": 0.5274, + "step": 6921 + }, + { + "epoch": 0.6736739659367397, + "grad_norm": 1.5488599891800303, + "learning_rate": 2.5424333997463713e-06, + "loss": 0.4173, + "step": 6922 + }, + { + "epoch": 0.6737712895377129, + "grad_norm": 1.4837450799000769, + "learning_rate": 2.5410608954575577e-06, + "loss": 0.4064, + "step": 6923 + }, + { + "epoch": 0.6738686131386862, + "grad_norm": 1.5839213065916038, + "learning_rate": 2.5396886355150375e-06, + "loss": 0.4168, + "step": 6924 + }, + { + "epoch": 0.6739659367396593, + "grad_norm": 1.3837122172337968, + "learning_rate": 2.538316620055167e-06, + "loss": 0.3702, + "step": 6925 + }, + { + "epoch": 0.6740632603406326, + "grad_norm": 1.4007117675977598, + "learning_rate": 2.536944849214287e-06, + "loss": 0.2946, + "step": 6926 + }, + { + "epoch": 0.6741605839416058, + "grad_norm": 1.3089715213081934, + "learning_rate": 2.5355733231287115e-06, + "loss": 0.2765, + "step": 6927 + }, + { + "epoch": 0.6742579075425791, + "grad_norm": 1.5761921442895896, + "learning_rate": 2.5342020419347296e-06, + "loss": 0.3464, + "step": 6928 + }, + { + "epoch": 0.6743552311435523, + "grad_norm": 1.4211112685589127, + "learning_rate": 2.532831005768607e-06, + "loss": 0.4043, + "step": 6929 + }, + { + "epoch": 0.6744525547445256, + "grad_norm": 1.1801705287300368, + "learning_rate": 2.5314602147665823e-06, + "loss": 0.2711, + "step": 6930 + }, + { + "epoch": 0.6745498783454987, + "grad_norm": 2.3498059172330232, + "learning_rate": 2.530089669064877e-06, + "loss": 0.3169, + "step": 6931 + }, + { + "epoch": 0.674647201946472, + "grad_norm": 1.3312598089295549, + "learning_rate": 2.5287193687996757e-06, + "loss": 0.4139, + "step": 6932 + }, + { + "epoch": 0.6747445255474452, + "grad_norm": 1.3254325952245336, + "learning_rate": 2.5273493141071517e-06, + "loss": 0.1808, + "step": 6933 + }, + { + "epoch": 0.6748418491484185, + "grad_norm": 1.7734026800425997, + "learning_rate": 2.525979505123445e-06, + "loss": 0.3839, + "step": 6934 + }, + { + "epoch": 0.6749391727493917, + "grad_norm": 1.4758122066807204, + "learning_rate": 2.524609941984677e-06, + "loss": 0.4302, + "step": 6935 + }, + { + "epoch": 0.675036496350365, + "grad_norm": 1.7813840447567562, + "learning_rate": 2.523240624826939e-06, + "loss": 0.3517, + "step": 6936 + }, + { + "epoch": 0.6751338199513383, + "grad_norm": 1.3637936566407973, + "learning_rate": 2.521871553786303e-06, + "loss": 0.4106, + "step": 6937 + }, + { + "epoch": 0.6752311435523114, + "grad_norm": 1.4165533057839517, + "learning_rate": 2.5205027289988136e-06, + "loss": 0.3022, + "step": 6938 + }, + { + "epoch": 0.6753284671532847, + "grad_norm": 1.6688244520859234, + "learning_rate": 2.519134150600492e-06, + "loss": 0.1797, + "step": 6939 + }, + { + "epoch": 0.6754257907542579, + "grad_norm": 1.8398284587508136, + "learning_rate": 2.5177658187273346e-06, + "loss": 0.4476, + "step": 6940 + }, + { + "epoch": 0.6755231143552312, + "grad_norm": 1.4645344725589344, + "learning_rate": 2.5163977335153136e-06, + "loss": 0.2885, + "step": 6941 + }, + { + "epoch": 0.6756204379562044, + "grad_norm": 1.6161709599002068, + "learning_rate": 2.5150298951003783e-06, + "loss": 0.2876, + "step": 6942 + }, + { + "epoch": 0.6757177615571777, + "grad_norm": 1.4815866079825117, + "learning_rate": 2.5136623036184483e-06, + "loss": 0.4605, + "step": 6943 + }, + { + "epoch": 0.6758150851581508, + "grad_norm": 1.609615196895527, + "learning_rate": 2.5122949592054225e-06, + "loss": 0.4088, + "step": 6944 + }, + { + "epoch": 0.6759124087591241, + "grad_norm": 1.383240541923229, + "learning_rate": 2.510927861997176e-06, + "loss": 0.3329, + "step": 6945 + }, + { + "epoch": 0.6760097323600973, + "grad_norm": 1.3351640749055282, + "learning_rate": 2.50956101212956e-06, + "loss": 0.3231, + "step": 6946 + }, + { + "epoch": 0.6761070559610706, + "grad_norm": 1.4935226458942055, + "learning_rate": 2.508194409738395e-06, + "loss": 0.4479, + "step": 6947 + }, + { + "epoch": 0.6762043795620438, + "grad_norm": 1.9088923791770325, + "learning_rate": 2.5068280549594827e-06, + "loss": 0.2989, + "step": 6948 + }, + { + "epoch": 0.6763017031630171, + "grad_norm": 1.893732276052998, + "learning_rate": 2.505461947928599e-06, + "loss": 0.4451, + "step": 6949 + }, + { + "epoch": 0.6763990267639902, + "grad_norm": 1.5889441642097806, + "learning_rate": 2.5040960887814947e-06, + "loss": 0.4973, + "step": 6950 + }, + { + "epoch": 0.6764963503649635, + "grad_norm": 1.2679244451485847, + "learning_rate": 2.5027304776538964e-06, + "loss": 0.2481, + "step": 6951 + }, + { + "epoch": 0.6765936739659367, + "grad_norm": 1.7820610742945566, + "learning_rate": 2.5013651146815055e-06, + "loss": 0.3566, + "step": 6952 + }, + { + "epoch": 0.67669099756691, + "grad_norm": 1.446927709422681, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2403, + "step": 6953 + }, + { + "epoch": 0.6767883211678832, + "grad_norm": 1.4698336235923033, + "learning_rate": 2.4986351337450315e-06, + "loss": 0.2056, + "step": 6954 + }, + { + "epoch": 0.6768856447688565, + "grad_norm": 1.4352353542997942, + "learning_rate": 2.4972705160522255e-06, + "loss": 0.2377, + "step": 6955 + }, + { + "epoch": 0.6769829683698296, + "grad_norm": 1.5869182692251935, + "learning_rate": 2.495906147057187e-06, + "loss": 0.5277, + "step": 6956 + }, + { + "epoch": 0.6770802919708029, + "grad_norm": 1.6685424818530001, + "learning_rate": 2.4945420268954957e-06, + "loss": 0.3761, + "step": 6957 + }, + { + "epoch": 0.6771776155717761, + "grad_norm": 1.744336705650269, + "learning_rate": 2.4931781557027013e-06, + "loss": 0.4476, + "step": 6958 + }, + { + "epoch": 0.6772749391727494, + "grad_norm": 1.562944806682162, + "learning_rate": 2.491814533614334e-06, + "loss": 0.4595, + "step": 6959 + }, + { + "epoch": 0.6773722627737226, + "grad_norm": 1.3864422589675964, + "learning_rate": 2.4904511607658986e-06, + "loss": 0.3829, + "step": 6960 + }, + { + "epoch": 0.6774695863746959, + "grad_norm": 1.4034962288344124, + "learning_rate": 2.4890880372928736e-06, + "loss": 0.3745, + "step": 6961 + }, + { + "epoch": 0.6775669099756692, + "grad_norm": 1.3183694118541551, + "learning_rate": 2.4877251633307137e-06, + "loss": 0.3473, + "step": 6962 + }, + { + "epoch": 0.6776642335766423, + "grad_norm": 1.094957938061393, + "learning_rate": 2.4863625390148487e-06, + "loss": 0.2575, + "step": 6963 + }, + { + "epoch": 0.6777615571776155, + "grad_norm": 1.534354041552699, + "learning_rate": 2.485000164480685e-06, + "loss": 0.331, + "step": 6964 + }, + { + "epoch": 0.6778588807785888, + "grad_norm": 1.4041273415300692, + "learning_rate": 2.4836380398635982e-06, + "loss": 0.2992, + "step": 6965 + }, + { + "epoch": 0.677956204379562, + "grad_norm": 1.2802825624793999, + "learning_rate": 2.482276165298947e-06, + "loss": 0.3063, + "step": 6966 + }, + { + "epoch": 0.6780535279805353, + "grad_norm": 1.4204287037666237, + "learning_rate": 2.4809145409220623e-06, + "loss": 0.4712, + "step": 6967 + }, + { + "epoch": 0.6781508515815086, + "grad_norm": 1.3437498669291585, + "learning_rate": 2.4795531668682466e-06, + "loss": 0.2643, + "step": 6968 + }, + { + "epoch": 0.6782481751824817, + "grad_norm": 1.6285738891807648, + "learning_rate": 2.4781920432727813e-06, + "loss": 0.5092, + "step": 6969 + }, + { + "epoch": 0.678345498783455, + "grad_norm": 1.6169111964859966, + "learning_rate": 2.476831170270921e-06, + "loss": 0.5218, + "step": 6970 + }, + { + "epoch": 0.6784428223844282, + "grad_norm": 1.2529390116201187, + "learning_rate": 2.475470547997902e-06, + "loss": 0.298, + "step": 6971 + }, + { + "epoch": 0.6785401459854015, + "grad_norm": 1.6428611411762748, + "learning_rate": 2.474110176588924e-06, + "loss": 0.5253, + "step": 6972 + }, + { + "epoch": 0.6786374695863747, + "grad_norm": 1.3779755820587174, + "learning_rate": 2.4727500561791707e-06, + "loss": 0.3648, + "step": 6973 + }, + { + "epoch": 0.678734793187348, + "grad_norm": 1.4918542459847803, + "learning_rate": 2.4713901869037976e-06, + "loss": 0.2401, + "step": 6974 + }, + { + "epoch": 0.6788321167883211, + "grad_norm": 1.5435250740053599, + "learning_rate": 2.470030568897938e-06, + "loss": 0.4601, + "step": 6975 + }, + { + "epoch": 0.6789294403892944, + "grad_norm": 1.5073842443557104, + "learning_rate": 2.4686712022966937e-06, + "loss": 0.2173, + "step": 6976 + }, + { + "epoch": 0.6790267639902676, + "grad_norm": 1.4464958501572325, + "learning_rate": 2.467312087235148e-06, + "loss": 0.2882, + "step": 6977 + }, + { + "epoch": 0.6791240875912409, + "grad_norm": 1.371599500795431, + "learning_rate": 2.4659532238483586e-06, + "loss": 0.3791, + "step": 6978 + }, + { + "epoch": 0.6792214111922141, + "grad_norm": 1.792751456995659, + "learning_rate": 2.4645946122713534e-06, + "loss": 0.3043, + "step": 6979 + }, + { + "epoch": 0.6793187347931874, + "grad_norm": 1.387658716695058, + "learning_rate": 2.4632362526391374e-06, + "loss": 0.3579, + "step": 6980 + }, + { + "epoch": 0.6794160583941606, + "grad_norm": 1.178665779721529, + "learning_rate": 2.4618781450866963e-06, + "loss": 0.2894, + "step": 6981 + }, + { + "epoch": 0.6795133819951338, + "grad_norm": 1.6554744992194588, + "learning_rate": 2.460520289748986e-06, + "loss": 0.476, + "step": 6982 + }, + { + "epoch": 0.679610705596107, + "grad_norm": 1.4270076128322857, + "learning_rate": 2.459162686760934e-06, + "loss": 0.4358, + "step": 6983 + }, + { + "epoch": 0.6797080291970803, + "grad_norm": 1.4423335537670423, + "learning_rate": 2.4578053362574466e-06, + "loss": 0.2762, + "step": 6984 + }, + { + "epoch": 0.6798053527980535, + "grad_norm": 1.4052268226638183, + "learning_rate": 2.4564482383734083e-06, + "loss": 0.3274, + "step": 6985 + }, + { + "epoch": 0.6799026763990268, + "grad_norm": 1.4243597600104019, + "learning_rate": 2.4550913932436694e-06, + "loss": 0.4244, + "step": 6986 + }, + { + "epoch": 0.68, + "grad_norm": 1.617023994178133, + "learning_rate": 2.453734801003063e-06, + "loss": 0.4276, + "step": 6987 + }, + { + "epoch": 0.6800973236009732, + "grad_norm": 1.6336278750386801, + "learning_rate": 2.452378461786395e-06, + "loss": 0.4974, + "step": 6988 + }, + { + "epoch": 0.6801946472019464, + "grad_norm": 1.1033290468998957, + "learning_rate": 2.451022375728447e-06, + "loss": 0.2591, + "step": 6989 + }, + { + "epoch": 0.6802919708029197, + "grad_norm": 1.2920284482404487, + "learning_rate": 2.4496665429639675e-06, + "loss": 0.3202, + "step": 6990 + }, + { + "epoch": 0.680389294403893, + "grad_norm": 1.2120326842505735, + "learning_rate": 2.448310963627694e-06, + "loss": 0.2522, + "step": 6991 + }, + { + "epoch": 0.6804866180048662, + "grad_norm": 1.6963667865487155, + "learning_rate": 2.4469556378543284e-06, + "loss": 0.3964, + "step": 6992 + }, + { + "epoch": 0.6805839416058395, + "grad_norm": 1.5656926915198155, + "learning_rate": 2.4456005657785518e-06, + "loss": 0.4468, + "step": 6993 + }, + { + "epoch": 0.6806812652068126, + "grad_norm": 1.4530998966140611, + "learning_rate": 2.4442457475350155e-06, + "loss": 0.4109, + "step": 6994 + }, + { + "epoch": 0.6807785888077859, + "grad_norm": 1.520292586600981, + "learning_rate": 2.4428911832583504e-06, + "loss": 0.4263, + "step": 6995 + }, + { + "epoch": 0.6808759124087591, + "grad_norm": 1.427880818466438, + "learning_rate": 2.4415368730831613e-06, + "loss": 0.4779, + "step": 6996 + }, + { + "epoch": 0.6809732360097324, + "grad_norm": 1.4461973214014017, + "learning_rate": 2.4401828171440237e-06, + "loss": 0.3459, + "step": 6997 + }, + { + "epoch": 0.6810705596107056, + "grad_norm": 1.8635765497458632, + "learning_rate": 2.4388290155754934e-06, + "loss": 0.4807, + "step": 6998 + }, + { + "epoch": 0.6811678832116789, + "grad_norm": 1.5011692258527074, + "learning_rate": 2.4374754685120982e-06, + "loss": 0.2618, + "step": 6999 + }, + { + "epoch": 0.681265206812652, + "grad_norm": 1.5935035963876705, + "learning_rate": 2.4361221760883407e-06, + "loss": 0.4121, + "step": 7000 + }, + { + "epoch": 0.6813625304136253, + "grad_norm": 1.3657851083856138, + "learning_rate": 2.434769138438698e-06, + "loss": 0.3803, + "step": 7001 + }, + { + "epoch": 0.6814598540145985, + "grad_norm": 1.3890571757194168, + "learning_rate": 2.433416355697623e-06, + "loss": 0.2878, + "step": 7002 + }, + { + "epoch": 0.6815571776155718, + "grad_norm": 1.4534767299451985, + "learning_rate": 2.4320638279995443e-06, + "loss": 0.4363, + "step": 7003 + }, + { + "epoch": 0.681654501216545, + "grad_norm": 1.5975253697441922, + "learning_rate": 2.4307115554788595e-06, + "loss": 0.407, + "step": 7004 + }, + { + "epoch": 0.6817518248175183, + "grad_norm": 1.4143137840276294, + "learning_rate": 2.4293595382699464e-06, + "loss": 0.234, + "step": 7005 + }, + { + "epoch": 0.6818491484184915, + "grad_norm": 1.4106838686942245, + "learning_rate": 2.4280077765071565e-06, + "loss": 0.4045, + "step": 7006 + }, + { + "epoch": 0.6819464720194647, + "grad_norm": 1.725546559530535, + "learning_rate": 2.426656270324816e-06, + "loss": 0.3169, + "step": 7007 + }, + { + "epoch": 0.6820437956204379, + "grad_norm": 1.6297421586113854, + "learning_rate": 2.425305019857222e-06, + "loss": 0.6467, + "step": 7008 + }, + { + "epoch": 0.6821411192214112, + "grad_norm": 1.650304378987235, + "learning_rate": 2.4239540252386507e-06, + "loss": 0.4968, + "step": 7009 + }, + { + "epoch": 0.6822384428223844, + "grad_norm": 1.5135282977520814, + "learning_rate": 2.422603286603351e-06, + "loss": 0.5243, + "step": 7010 + }, + { + "epoch": 0.6823357664233577, + "grad_norm": 1.4455538161613704, + "learning_rate": 2.4212528040855477e-06, + "loss": 0.3628, + "step": 7011 + }, + { + "epoch": 0.682433090024331, + "grad_norm": 1.4059383470846873, + "learning_rate": 2.4199025778194373e-06, + "loss": 0.3781, + "step": 7012 + }, + { + "epoch": 0.6825304136253041, + "grad_norm": 1.8515411890791325, + "learning_rate": 2.418552607939194e-06, + "loss": 0.5707, + "step": 7013 + }, + { + "epoch": 0.6826277372262773, + "grad_norm": 2.0140690675425676, + "learning_rate": 2.4172028945789674e-06, + "loss": 0.4568, + "step": 7014 + }, + { + "epoch": 0.6827250608272506, + "grad_norm": 1.2638049284093802, + "learning_rate": 2.415853437872874e-06, + "loss": 0.333, + "step": 7015 + }, + { + "epoch": 0.6828223844282238, + "grad_norm": 1.2303873671003442, + "learning_rate": 2.4145042379550126e-06, + "loss": 0.249, + "step": 7016 + }, + { + "epoch": 0.6829197080291971, + "grad_norm": 1.4612765811170028, + "learning_rate": 2.4131552949594544e-06, + "loss": 0.2459, + "step": 7017 + }, + { + "epoch": 0.6830170316301704, + "grad_norm": 1.3299067942865632, + "learning_rate": 2.4118066090202467e-06, + "loss": 0.4052, + "step": 7018 + }, + { + "epoch": 0.6831143552311435, + "grad_norm": 1.7221463026041783, + "learning_rate": 2.410458180271405e-06, + "loss": 0.4442, + "step": 7019 + }, + { + "epoch": 0.6832116788321168, + "grad_norm": 1.2746340506968268, + "learning_rate": 2.409110008846926e-06, + "loss": 0.2624, + "step": 7020 + }, + { + "epoch": 0.68330900243309, + "grad_norm": 1.5201577902758732, + "learning_rate": 2.4077620948807775e-06, + "loss": 0.5161, + "step": 7021 + }, + { + "epoch": 0.6834063260340633, + "grad_norm": 1.6101704085540542, + "learning_rate": 2.4064144385069027e-06, + "loss": 0.4633, + "step": 7022 + }, + { + "epoch": 0.6835036496350365, + "grad_norm": 1.811203591625575, + "learning_rate": 2.4050670398592197e-06, + "loss": 0.3546, + "step": 7023 + }, + { + "epoch": 0.6836009732360098, + "grad_norm": 1.483058305310578, + "learning_rate": 2.40371989907162e-06, + "loss": 0.3693, + "step": 7024 + }, + { + "epoch": 0.683698296836983, + "grad_norm": 1.350438527125783, + "learning_rate": 2.4023730162779712e-06, + "loss": 0.2979, + "step": 7025 + }, + { + "epoch": 0.6837956204379562, + "grad_norm": 2.122168393420503, + "learning_rate": 2.4010263916121114e-06, + "loss": 0.4741, + "step": 7026 + }, + { + "epoch": 0.6838929440389294, + "grad_norm": 1.4059707364310394, + "learning_rate": 2.3996800252078555e-06, + "loss": 0.2625, + "step": 7027 + }, + { + "epoch": 0.6839902676399027, + "grad_norm": 1.6011541078488662, + "learning_rate": 2.3983339171989944e-06, + "loss": 0.5337, + "step": 7028 + }, + { + "epoch": 0.6840875912408759, + "grad_norm": 1.6664607715584878, + "learning_rate": 2.3969880677192924e-06, + "loss": 0.5497, + "step": 7029 + }, + { + "epoch": 0.6841849148418492, + "grad_norm": 1.4867998094055201, + "learning_rate": 2.3956424769024843e-06, + "loss": 0.4703, + "step": 7030 + }, + { + "epoch": 0.6842822384428224, + "grad_norm": 1.6682610356191527, + "learning_rate": 2.3942971448822842e-06, + "loss": 0.2932, + "step": 7031 + }, + { + "epoch": 0.6843795620437956, + "grad_norm": 1.5716231460991188, + "learning_rate": 2.3929520717923787e-06, + "loss": 0.3262, + "step": 7032 + }, + { + "epoch": 0.6844768856447688, + "grad_norm": 1.496820099208088, + "learning_rate": 2.391607257766428e-06, + "loss": 0.3892, + "step": 7033 + }, + { + "epoch": 0.6845742092457421, + "grad_norm": 1.7206930101770672, + "learning_rate": 2.390262702938067e-06, + "loss": 0.3531, + "step": 7034 + }, + { + "epoch": 0.6846715328467153, + "grad_norm": 1.5024064946077917, + "learning_rate": 2.388918407440906e-06, + "loss": 0.2519, + "step": 7035 + }, + { + "epoch": 0.6847688564476886, + "grad_norm": 1.7563921127653073, + "learning_rate": 2.387574371408529e-06, + "loss": 0.469, + "step": 7036 + }, + { + "epoch": 0.6848661800486618, + "grad_norm": 1.6396186285166314, + "learning_rate": 2.3862305949744906e-06, + "loss": 0.6244, + "step": 7037 + }, + { + "epoch": 0.684963503649635, + "grad_norm": 1.253278248232481, + "learning_rate": 2.384887078272325e-06, + "loss": 0.3874, + "step": 7038 + }, + { + "epoch": 0.6850608272506082, + "grad_norm": 1.6037098700719943, + "learning_rate": 2.3835438214355394e-06, + "loss": 0.6085, + "step": 7039 + }, + { + "epoch": 0.6851581508515815, + "grad_norm": 1.4916227380404068, + "learning_rate": 2.382200824597611e-06, + "loss": 0.304, + "step": 7040 + }, + { + "epoch": 0.6852554744525547, + "grad_norm": 1.456903692394162, + "learning_rate": 2.3808580878919948e-06, + "loss": 0.4673, + "step": 7041 + }, + { + "epoch": 0.685352798053528, + "grad_norm": 1.4472802865422167, + "learning_rate": 2.3795156114521206e-06, + "loss": 0.4213, + "step": 7042 + }, + { + "epoch": 0.6854501216545013, + "grad_norm": 1.4508606691248402, + "learning_rate": 2.3781733954113913e-06, + "loss": 0.4607, + "step": 7043 + }, + { + "epoch": 0.6855474452554745, + "grad_norm": 1.2246391621302561, + "learning_rate": 2.376831439903183e-06, + "loss": 0.2983, + "step": 7044 + }, + { + "epoch": 0.6856447688564477, + "grad_norm": 1.2407074754334955, + "learning_rate": 2.375489745060846e-06, + "loss": 0.3823, + "step": 7045 + }, + { + "epoch": 0.6857420924574209, + "grad_norm": 1.2471204970141723, + "learning_rate": 2.374148311017707e-06, + "loss": 0.3476, + "step": 7046 + }, + { + "epoch": 0.6858394160583942, + "grad_norm": 1.7914390271495106, + "learning_rate": 2.372807137907066e-06, + "loss": 0.4304, + "step": 7047 + }, + { + "epoch": 0.6859367396593674, + "grad_norm": 1.6806619346024443, + "learning_rate": 2.371466225862193e-06, + "loss": 0.3429, + "step": 7048 + }, + { + "epoch": 0.6860340632603407, + "grad_norm": 1.2367672494426036, + "learning_rate": 2.370125575016336e-06, + "loss": 0.2466, + "step": 7049 + }, + { + "epoch": 0.6861313868613139, + "grad_norm": 1.4040122559466237, + "learning_rate": 2.3687851855027196e-06, + "loss": 0.3746, + "step": 7050 + }, + { + "epoch": 0.6862287104622871, + "grad_norm": 1.74319004926358, + "learning_rate": 2.3674450574545342e-06, + "loss": 0.513, + "step": 7051 + }, + { + "epoch": 0.6863260340632603, + "grad_norm": 1.5251869524818102, + "learning_rate": 2.366105191004952e-06, + "loss": 0.4837, + "step": 7052 + }, + { + "epoch": 0.6864233576642336, + "grad_norm": 1.2669444800107412, + "learning_rate": 2.3647655862871155e-06, + "loss": 0.2992, + "step": 7053 + }, + { + "epoch": 0.6865206812652068, + "grad_norm": 1.7047724324160347, + "learning_rate": 2.3634262434341426e-06, + "loss": 0.2762, + "step": 7054 + }, + { + "epoch": 0.6866180048661801, + "grad_norm": 1.6635440502336785, + "learning_rate": 2.362087162579125e-06, + "loss": 0.4159, + "step": 7055 + }, + { + "epoch": 0.6867153284671533, + "grad_norm": 1.370775147723081, + "learning_rate": 2.3607483438551266e-06, + "loss": 0.2887, + "step": 7056 + }, + { + "epoch": 0.6868126520681265, + "grad_norm": 1.8890066234821863, + "learning_rate": 2.3594097873951894e-06, + "loss": 0.5529, + "step": 7057 + }, + { + "epoch": 0.6869099756690997, + "grad_norm": 1.5276484270920698, + "learning_rate": 2.3580714933323234e-06, + "loss": 0.3333, + "step": 7058 + }, + { + "epoch": 0.687007299270073, + "grad_norm": 1.5421443233610488, + "learning_rate": 2.3567334617995165e-06, + "loss": 0.4768, + "step": 7059 + }, + { + "epoch": 0.6871046228710462, + "grad_norm": 1.5257211637214352, + "learning_rate": 2.35539569292973e-06, + "loss": 0.5934, + "step": 7060 + }, + { + "epoch": 0.6872019464720195, + "grad_norm": 1.9726873716411073, + "learning_rate": 2.3540581868559016e-06, + "loss": 0.31, + "step": 7061 + }, + { + "epoch": 0.6872992700729927, + "grad_norm": 1.6297929943608709, + "learning_rate": 2.352720943710935e-06, + "loss": 0.32, + "step": 7062 + }, + { + "epoch": 0.6873965936739659, + "grad_norm": 1.344617164610974, + "learning_rate": 2.351383963627716e-06, + "loss": 0.2451, + "step": 7063 + }, + { + "epoch": 0.6874939172749391, + "grad_norm": 1.5779151399274474, + "learning_rate": 2.3500472467390994e-06, + "loss": 0.4428, + "step": 7064 + }, + { + "epoch": 0.6875912408759124, + "grad_norm": 1.7882624673919416, + "learning_rate": 2.348710793177918e-06, + "loss": 0.4275, + "step": 7065 + }, + { + "epoch": 0.6876885644768856, + "grad_norm": 1.5227467853998835, + "learning_rate": 2.3473746030769738e-06, + "loss": 0.2117, + "step": 7066 + }, + { + "epoch": 0.6877858880778589, + "grad_norm": 1.6059188026338929, + "learning_rate": 2.346038676569046e-06, + "loss": 0.4718, + "step": 7067 + }, + { + "epoch": 0.6878832116788322, + "grad_norm": 1.4027488357193745, + "learning_rate": 2.344703013786888e-06, + "loss": 0.2858, + "step": 7068 + }, + { + "epoch": 0.6879805352798054, + "grad_norm": 1.3051811700517728, + "learning_rate": 2.3433676148632218e-06, + "loss": 0.252, + "step": 7069 + }, + { + "epoch": 0.6880778588807785, + "grad_norm": 1.6732824942982871, + "learning_rate": 2.3420324799307486e-06, + "loss": 0.4898, + "step": 7070 + }, + { + "epoch": 0.6881751824817518, + "grad_norm": 1.751569725110196, + "learning_rate": 2.3406976091221413e-06, + "loss": 0.659, + "step": 7071 + }, + { + "epoch": 0.688272506082725, + "grad_norm": 1.605504687572991, + "learning_rate": 2.339363002570049e-06, + "loss": 0.3602, + "step": 7072 + }, + { + "epoch": 0.6883698296836983, + "grad_norm": 1.4018499413477323, + "learning_rate": 2.3380286604070888e-06, + "loss": 0.3024, + "step": 7073 + }, + { + "epoch": 0.6884671532846716, + "grad_norm": 1.5688080674325353, + "learning_rate": 2.336694582765857e-06, + "loss": 0.3225, + "step": 7074 + }, + { + "epoch": 0.6885644768856448, + "grad_norm": 1.4753661784155816, + "learning_rate": 2.3353607697789218e-06, + "loss": 0.3374, + "step": 7075 + }, + { + "epoch": 0.688661800486618, + "grad_norm": 1.1487316351783057, + "learning_rate": 2.334027221578824e-06, + "loss": 0.2517, + "step": 7076 + }, + { + "epoch": 0.6887591240875912, + "grad_norm": 1.4174448942476598, + "learning_rate": 2.33269393829808e-06, + "loss": 0.2702, + "step": 7077 + }, + { + "epoch": 0.6888564476885645, + "grad_norm": 1.488949282846272, + "learning_rate": 2.331360920069179e-06, + "loss": 0.4764, + "step": 7078 + }, + { + "epoch": 0.6889537712895377, + "grad_norm": 1.9664262651615338, + "learning_rate": 2.3300281670245855e-06, + "loss": 0.4146, + "step": 7079 + }, + { + "epoch": 0.689051094890511, + "grad_norm": 1.6565485271389164, + "learning_rate": 2.328695679296732e-06, + "loss": 0.4088, + "step": 7080 + }, + { + "epoch": 0.6891484184914842, + "grad_norm": 1.3812187191282164, + "learning_rate": 2.327363457018031e-06, + "loss": 0.4236, + "step": 7081 + }, + { + "epoch": 0.6892457420924574, + "grad_norm": 1.779570607674496, + "learning_rate": 2.326031500320866e-06, + "loss": 0.3915, + "step": 7082 + }, + { + "epoch": 0.6893430656934306, + "grad_norm": 1.5587790625217628, + "learning_rate": 2.324699809337596e-06, + "loss": 0.4618, + "step": 7083 + }, + { + "epoch": 0.6894403892944039, + "grad_norm": 1.564654886139345, + "learning_rate": 2.3233683842005488e-06, + "loss": 0.514, + "step": 7084 + }, + { + "epoch": 0.6895377128953771, + "grad_norm": 1.6886866600418275, + "learning_rate": 2.3220372250420304e-06, + "loss": 0.4877, + "step": 7085 + }, + { + "epoch": 0.6896350364963504, + "grad_norm": 1.1663450865093852, + "learning_rate": 2.320706331994319e-06, + "loss": 0.1601, + "step": 7086 + }, + { + "epoch": 0.6897323600973236, + "grad_norm": 1.6581865181782154, + "learning_rate": 2.319375705189666e-06, + "loss": 0.4446, + "step": 7087 + }, + { + "epoch": 0.6898296836982969, + "grad_norm": 1.451546334463058, + "learning_rate": 2.318045344760297e-06, + "loss": 0.4061, + "step": 7088 + }, + { + "epoch": 0.68992700729927, + "grad_norm": 1.7332038017926754, + "learning_rate": 2.3167152508384104e-06, + "loss": 0.3531, + "step": 7089 + }, + { + "epoch": 0.6900243309002433, + "grad_norm": 1.527844437194958, + "learning_rate": 2.3153854235561805e-06, + "loss": 0.3408, + "step": 7090 + }, + { + "epoch": 0.6901216545012165, + "grad_norm": 1.7059815275929582, + "learning_rate": 2.314055863045749e-06, + "loss": 0.3583, + "step": 7091 + }, + { + "epoch": 0.6902189781021898, + "grad_norm": 1.3737638291972332, + "learning_rate": 2.3127265694392383e-06, + "loss": 0.3684, + "step": 7092 + }, + { + "epoch": 0.690316301703163, + "grad_norm": 1.1311901182058706, + "learning_rate": 2.3113975428687392e-06, + "loss": 0.2227, + "step": 7093 + }, + { + "epoch": 0.6904136253041363, + "grad_norm": 1.6094495144972942, + "learning_rate": 2.3100687834663205e-06, + "loss": 0.4757, + "step": 7094 + }, + { + "epoch": 0.6905109489051094, + "grad_norm": 1.4838140130314696, + "learning_rate": 2.308740291364019e-06, + "loss": 0.4687, + "step": 7095 + }, + { + "epoch": 0.6906082725060827, + "grad_norm": 1.51742116933878, + "learning_rate": 2.307412066693849e-06, + "loss": 0.522, + "step": 7096 + }, + { + "epoch": 0.690705596107056, + "grad_norm": 1.368798924795383, + "learning_rate": 2.306084109587796e-06, + "loss": 0.2961, + "step": 7097 + }, + { + "epoch": 0.6908029197080292, + "grad_norm": 1.100628720850723, + "learning_rate": 2.3047564201778217e-06, + "loss": 0.2135, + "step": 7098 + }, + { + "epoch": 0.6909002433090025, + "grad_norm": 2.5028431937357163, + "learning_rate": 2.303428998595858e-06, + "loss": 0.5036, + "step": 7099 + }, + { + "epoch": 0.6909975669099757, + "grad_norm": 1.588738434492104, + "learning_rate": 2.3021018449738125e-06, + "loss": 0.3995, + "step": 7100 + }, + { + "epoch": 0.6910948905109489, + "grad_norm": 1.3040063416203949, + "learning_rate": 2.300774959443566e-06, + "loss": 0.2657, + "step": 7101 + }, + { + "epoch": 0.6911922141119221, + "grad_norm": 1.2538338993863447, + "learning_rate": 2.29944834213697e-06, + "loss": 0.286, + "step": 7102 + }, + { + "epoch": 0.6912895377128954, + "grad_norm": 1.4370473895504778, + "learning_rate": 2.2981219931858523e-06, + "loss": 0.4474, + "step": 7103 + }, + { + "epoch": 0.6913868613138686, + "grad_norm": 1.495019352637735, + "learning_rate": 2.296795912722014e-06, + "loss": 0.3265, + "step": 7104 + }, + { + "epoch": 0.6914841849148419, + "grad_norm": 1.3513107947436753, + "learning_rate": 2.2954701008772257e-06, + "loss": 0.3779, + "step": 7105 + }, + { + "epoch": 0.6915815085158151, + "grad_norm": 1.3609525800147586, + "learning_rate": 2.294144557783236e-06, + "loss": 0.3096, + "step": 7106 + }, + { + "epoch": 0.6916788321167883, + "grad_norm": 1.5810258849773795, + "learning_rate": 2.2928192835717642e-06, + "loss": 0.3473, + "step": 7107 + }, + { + "epoch": 0.6917761557177615, + "grad_norm": 1.3028614262475204, + "learning_rate": 2.291494278374505e-06, + "loss": 0.3806, + "step": 7108 + }, + { + "epoch": 0.6918734793187348, + "grad_norm": 1.4181698322901153, + "learning_rate": 2.2901695423231235e-06, + "loss": 0.3953, + "step": 7109 + }, + { + "epoch": 0.691970802919708, + "grad_norm": 1.6141866955608357, + "learning_rate": 2.2888450755492604e-06, + "loss": 0.4435, + "step": 7110 + }, + { + "epoch": 0.6920681265206813, + "grad_norm": 1.3275763332022836, + "learning_rate": 2.287520878184528e-06, + "loss": 0.2836, + "step": 7111 + }, + { + "epoch": 0.6921654501216545, + "grad_norm": 1.429484201787624, + "learning_rate": 2.286196950360516e-06, + "loss": 0.3437, + "step": 7112 + }, + { + "epoch": 0.6922627737226278, + "grad_norm": 1.3502231201592745, + "learning_rate": 2.2848732922087784e-06, + "loss": 0.2494, + "step": 7113 + }, + { + "epoch": 0.6923600973236009, + "grad_norm": 1.4890264612822304, + "learning_rate": 2.283549903860851e-06, + "loss": 0.4353, + "step": 7114 + }, + { + "epoch": 0.6924574209245742, + "grad_norm": 1.4526377035185738, + "learning_rate": 2.282226785448242e-06, + "loss": 0.3106, + "step": 7115 + }, + { + "epoch": 0.6925547445255474, + "grad_norm": 1.5784710042113108, + "learning_rate": 2.2809039371024243e-06, + "loss": 0.3609, + "step": 7116 + }, + { + "epoch": 0.6926520681265207, + "grad_norm": 1.3720563243003592, + "learning_rate": 2.2795813589548544e-06, + "loss": 0.4049, + "step": 7117 + }, + { + "epoch": 0.692749391727494, + "grad_norm": 2.34320936961282, + "learning_rate": 2.278259051136955e-06, + "loss": 0.4059, + "step": 7118 + }, + { + "epoch": 0.6928467153284672, + "grad_norm": 1.7331830301897417, + "learning_rate": 2.2769370137801305e-06, + "loss": 0.2635, + "step": 7119 + }, + { + "epoch": 0.6929440389294403, + "grad_norm": 1.7928356378879804, + "learning_rate": 2.2756152470157474e-06, + "loss": 0.4523, + "step": 7120 + }, + { + "epoch": 0.6930413625304136, + "grad_norm": 1.8692269778658488, + "learning_rate": 2.2742937509751505e-06, + "loss": 0.4466, + "step": 7121 + }, + { + "epoch": 0.6931386861313868, + "grad_norm": 1.6143047789787386, + "learning_rate": 2.2729725257896616e-06, + "loss": 0.4384, + "step": 7122 + }, + { + "epoch": 0.6932360097323601, + "grad_norm": 1.4067371796084285, + "learning_rate": 2.271651571590567e-06, + "loss": 0.3634, + "step": 7123 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 1.4539553823066247, + "learning_rate": 2.2703308885091324e-06, + "loss": 0.4489, + "step": 7124 + }, + { + "epoch": 0.6934306569343066, + "grad_norm": 1.6923470292487963, + "learning_rate": 2.2690104766765956e-06, + "loss": 0.3835, + "step": 7125 + }, + { + "epoch": 0.6935279805352798, + "grad_norm": 1.3523041559039422, + "learning_rate": 2.267690336224168e-06, + "loss": 0.3914, + "step": 7126 + }, + { + "epoch": 0.693625304136253, + "grad_norm": 1.6374338267692956, + "learning_rate": 2.266370467283029e-06, + "loss": 0.5484, + "step": 7127 + }, + { + "epoch": 0.6937226277372263, + "grad_norm": 1.48241198389425, + "learning_rate": 2.265050869984337e-06, + "loss": 0.3439, + "step": 7128 + }, + { + "epoch": 0.6938199513381995, + "grad_norm": 1.7498954332991097, + "learning_rate": 2.2637315444592194e-06, + "loss": 0.3914, + "step": 7129 + }, + { + "epoch": 0.6939172749391728, + "grad_norm": 1.4413933482019134, + "learning_rate": 2.262412490838784e-06, + "loss": 0.4159, + "step": 7130 + }, + { + "epoch": 0.694014598540146, + "grad_norm": 1.650525980257544, + "learning_rate": 2.2610937092540995e-06, + "loss": 0.2635, + "step": 7131 + }, + { + "epoch": 0.6941119221411193, + "grad_norm": 1.4291678563030545, + "learning_rate": 2.259775199836217e-06, + "loss": 0.2207, + "step": 7132 + }, + { + "epoch": 0.6942092457420924, + "grad_norm": 1.5482230429622463, + "learning_rate": 2.2584569627161596e-06, + "loss": 0.4786, + "step": 7133 + }, + { + "epoch": 0.6943065693430657, + "grad_norm": 1.3125996324779596, + "learning_rate": 2.2571389980249165e-06, + "loss": 0.3913, + "step": 7134 + }, + { + "epoch": 0.6944038929440389, + "grad_norm": 1.5645298552061642, + "learning_rate": 2.255821305893457e-06, + "loss": 0.4752, + "step": 7135 + }, + { + "epoch": 0.6945012165450122, + "grad_norm": 1.838801580829783, + "learning_rate": 2.254503886452721e-06, + "loss": 0.4698, + "step": 7136 + }, + { + "epoch": 0.6945985401459854, + "grad_norm": 1.3377697966651918, + "learning_rate": 2.2531867398336233e-06, + "loss": 0.3974, + "step": 7137 + }, + { + "epoch": 0.6946958637469587, + "grad_norm": 1.6965012843101581, + "learning_rate": 2.2518698661670456e-06, + "loss": 0.481, + "step": 7138 + }, + { + "epoch": 0.6947931873479318, + "grad_norm": 1.4329440024296556, + "learning_rate": 2.2505532655838466e-06, + "loss": 0.3174, + "step": 7139 + }, + { + "epoch": 0.6948905109489051, + "grad_norm": 1.4171523589263946, + "learning_rate": 2.2492369382148634e-06, + "loss": 0.3386, + "step": 7140 + }, + { + "epoch": 0.6949878345498783, + "grad_norm": 1.3472101772508374, + "learning_rate": 2.2479208841908946e-06, + "loss": 0.4004, + "step": 7141 + }, + { + "epoch": 0.6950851581508516, + "grad_norm": 1.63526156068584, + "learning_rate": 2.246605103642719e-06, + "loss": 0.6597, + "step": 7142 + }, + { + "epoch": 0.6951824817518248, + "grad_norm": 1.6193031809232163, + "learning_rate": 2.245289596701086e-06, + "loss": 0.2818, + "step": 7143 + }, + { + "epoch": 0.6952798053527981, + "grad_norm": 1.523570676019948, + "learning_rate": 2.2439743634967216e-06, + "loss": 0.3612, + "step": 7144 + }, + { + "epoch": 0.6953771289537712, + "grad_norm": 1.2057833347003468, + "learning_rate": 2.2426594041603165e-06, + "loss": 0.2034, + "step": 7145 + }, + { + "epoch": 0.6954744525547445, + "grad_norm": 1.4226443546306904, + "learning_rate": 2.2413447188225417e-06, + "loss": 0.3774, + "step": 7146 + }, + { + "epoch": 0.6955717761557177, + "grad_norm": 1.3792925069432436, + "learning_rate": 2.240030307614037e-06, + "loss": 0.3182, + "step": 7147 + }, + { + "epoch": 0.695669099756691, + "grad_norm": 1.2143486461440078, + "learning_rate": 2.2387161706654196e-06, + "loss": 0.228, + "step": 7148 + }, + { + "epoch": 0.6957664233576643, + "grad_norm": 1.5365330392380654, + "learning_rate": 2.23740230810727e-06, + "loss": 0.4679, + "step": 7149 + }, + { + "epoch": 0.6958637469586375, + "grad_norm": 1.3932926436845328, + "learning_rate": 2.236088720070152e-06, + "loss": 0.381, + "step": 7150 + }, + { + "epoch": 0.6959610705596107, + "grad_norm": 1.5468794697398334, + "learning_rate": 2.234775406684599e-06, + "loss": 0.5677, + "step": 7151 + }, + { + "epoch": 0.6960583941605839, + "grad_norm": 1.1817886133763231, + "learning_rate": 2.233462368081112e-06, + "loss": 0.2683, + "step": 7152 + }, + { + "epoch": 0.6961557177615572, + "grad_norm": 1.5161720153931184, + "learning_rate": 2.2321496043901698e-06, + "loss": 0.4507, + "step": 7153 + }, + { + "epoch": 0.6962530413625304, + "grad_norm": 1.3402527832617142, + "learning_rate": 2.230837115742222e-06, + "loss": 0.3581, + "step": 7154 + }, + { + "epoch": 0.6963503649635037, + "grad_norm": 1.4744590025771356, + "learning_rate": 2.2295249022676945e-06, + "loss": 0.4013, + "step": 7155 + }, + { + "epoch": 0.6964476885644769, + "grad_norm": 1.326370965904232, + "learning_rate": 2.2282129640969786e-06, + "loss": 0.4208, + "step": 7156 + }, + { + "epoch": 0.6965450121654502, + "grad_norm": 1.5182291841152533, + "learning_rate": 2.226901301360444e-06, + "loss": 0.3771, + "step": 7157 + }, + { + "epoch": 0.6966423357664233, + "grad_norm": 1.5146032605790822, + "learning_rate": 2.225589914188433e-06, + "loss": 0.3303, + "step": 7158 + }, + { + "epoch": 0.6967396593673966, + "grad_norm": 1.6804953827957696, + "learning_rate": 2.2242788027112544e-06, + "loss": 0.4242, + "step": 7159 + }, + { + "epoch": 0.6968369829683698, + "grad_norm": 1.9117629581978521, + "learning_rate": 2.222967967059199e-06, + "loss": 0.3716, + "step": 7160 + }, + { + "epoch": 0.6969343065693431, + "grad_norm": 1.229105360582037, + "learning_rate": 2.221657407362523e-06, + "loss": 0.3342, + "step": 7161 + }, + { + "epoch": 0.6970316301703163, + "grad_norm": 1.4308496140072384, + "learning_rate": 2.2203471237514606e-06, + "loss": 0.4376, + "step": 7162 + }, + { + "epoch": 0.6971289537712896, + "grad_norm": 1.4614854080980406, + "learning_rate": 2.2190371163562115e-06, + "loss": 0.4659, + "step": 7163 + }, + { + "epoch": 0.6972262773722627, + "grad_norm": 1.4187512301132987, + "learning_rate": 2.2177273853069525e-06, + "loss": 0.2365, + "step": 7164 + }, + { + "epoch": 0.697323600973236, + "grad_norm": 1.3089222054558105, + "learning_rate": 2.216417930733834e-06, + "loss": 0.3082, + "step": 7165 + }, + { + "epoch": 0.6974209245742092, + "grad_norm": 1.261902408991811, + "learning_rate": 2.2151087527669783e-06, + "loss": 0.282, + "step": 7166 + }, + { + "epoch": 0.6975182481751825, + "grad_norm": 1.7640787291139268, + "learning_rate": 2.2137998515364754e-06, + "loss": 0.5948, + "step": 7167 + }, + { + "epoch": 0.6976155717761557, + "grad_norm": 1.632814836272252, + "learning_rate": 2.2124912271723945e-06, + "loss": 0.5355, + "step": 7168 + }, + { + "epoch": 0.697712895377129, + "grad_norm": 1.4433366621447463, + "learning_rate": 2.2111828798047736e-06, + "loss": 0.4636, + "step": 7169 + }, + { + "epoch": 0.6978102189781021, + "grad_norm": 1.7838258021714664, + "learning_rate": 2.2098748095636236e-06, + "loss": 0.3765, + "step": 7170 + }, + { + "epoch": 0.6979075425790754, + "grad_norm": 1.447955215636187, + "learning_rate": 2.208567016578929e-06, + "loss": 0.3262, + "step": 7171 + }, + { + "epoch": 0.6980048661800486, + "grad_norm": 1.5231870005139048, + "learning_rate": 2.2072595009806457e-06, + "loss": 0.2944, + "step": 7172 + }, + { + "epoch": 0.6981021897810219, + "grad_norm": 1.5448173940946588, + "learning_rate": 2.205952262898704e-06, + "loss": 0.2999, + "step": 7173 + }, + { + "epoch": 0.6981995133819952, + "grad_norm": 1.5943167183757534, + "learning_rate": 2.2046453024630016e-06, + "loss": 0.3786, + "step": 7174 + }, + { + "epoch": 0.6982968369829684, + "grad_norm": 1.4530451762927175, + "learning_rate": 2.203338619803414e-06, + "loss": 0.2946, + "step": 7175 + }, + { + "epoch": 0.6983941605839417, + "grad_norm": 1.3574104583040578, + "learning_rate": 2.202032215049788e-06, + "loss": 0.3585, + "step": 7176 + }, + { + "epoch": 0.6984914841849148, + "grad_norm": 2.1649128589316424, + "learning_rate": 2.2007260883319392e-06, + "loss": 0.3878, + "step": 7177 + }, + { + "epoch": 0.698588807785888, + "grad_norm": 1.5012934988614022, + "learning_rate": 2.1994202397796594e-06, + "loss": 0.3496, + "step": 7178 + }, + { + "epoch": 0.6986861313868613, + "grad_norm": 1.8667082231300942, + "learning_rate": 2.198114669522711e-06, + "loss": 0.3446, + "step": 7179 + }, + { + "epoch": 0.6987834549878346, + "grad_norm": 2.059006246104315, + "learning_rate": 2.196809377690831e-06, + "loss": 0.3747, + "step": 7180 + }, + { + "epoch": 0.6988807785888078, + "grad_norm": 1.3436752121096145, + "learning_rate": 2.1955043644137258e-06, + "loss": 0.2313, + "step": 7181 + }, + { + "epoch": 0.6989781021897811, + "grad_norm": 1.2639389109697716, + "learning_rate": 2.194199629821076e-06, + "loss": 0.3304, + "step": 7182 + }, + { + "epoch": 0.6990754257907542, + "grad_norm": 1.4891998579912018, + "learning_rate": 2.192895174042533e-06, + "loss": 0.4796, + "step": 7183 + }, + { + "epoch": 0.6991727493917275, + "grad_norm": 1.353731917016591, + "learning_rate": 2.1915909972077244e-06, + "loss": 0.2765, + "step": 7184 + }, + { + "epoch": 0.6992700729927007, + "grad_norm": 1.4471954452240534, + "learning_rate": 2.1902870994462423e-06, + "loss": 0.3412, + "step": 7185 + }, + { + "epoch": 0.699367396593674, + "grad_norm": 1.4493037196335952, + "learning_rate": 2.1889834808876583e-06, + "loss": 0.3935, + "step": 7186 + }, + { + "epoch": 0.6994647201946472, + "grad_norm": 1.4823255344291792, + "learning_rate": 2.187680141661515e-06, + "loss": 0.3114, + "step": 7187 + }, + { + "epoch": 0.6995620437956205, + "grad_norm": 1.3925930198953602, + "learning_rate": 2.1863770818973235e-06, + "loss": 0.2601, + "step": 7188 + }, + { + "epoch": 0.6996593673965936, + "grad_norm": 1.4217463843895148, + "learning_rate": 2.18507430172457e-06, + "loss": 0.4411, + "step": 7189 + }, + { + "epoch": 0.6997566909975669, + "grad_norm": 1.4457160180214488, + "learning_rate": 2.183771801272714e-06, + "loss": 0.239, + "step": 7190 + }, + { + "epoch": 0.6998540145985401, + "grad_norm": 1.3826602485591162, + "learning_rate": 2.1824695806711847e-06, + "loss": 0.396, + "step": 7191 + }, + { + "epoch": 0.6999513381995134, + "grad_norm": 1.63230939351161, + "learning_rate": 2.1811676400493853e-06, + "loss": 0.2797, + "step": 7192 + }, + { + "epoch": 0.7000486618004866, + "grad_norm": 1.5764297595819967, + "learning_rate": 2.17986597953669e-06, + "loss": 0.5107, + "step": 7193 + }, + { + "epoch": 0.7001459854014599, + "grad_norm": 1.4846905021661214, + "learning_rate": 2.178564599262447e-06, + "loss": 0.2824, + "step": 7194 + }, + { + "epoch": 0.7002433090024331, + "grad_norm": 1.6519622001479857, + "learning_rate": 2.1772634993559725e-06, + "loss": 0.4299, + "step": 7195 + }, + { + "epoch": 0.7003406326034063, + "grad_norm": 1.6158671863479424, + "learning_rate": 2.1759626799465596e-06, + "loss": 0.3323, + "step": 7196 + }, + { + "epoch": 0.7004379562043795, + "grad_norm": 1.4162209407993407, + "learning_rate": 2.1746621411634705e-06, + "loss": 0.2818, + "step": 7197 + }, + { + "epoch": 0.7005352798053528, + "grad_norm": 1.4777578531003996, + "learning_rate": 2.1733618831359426e-06, + "loss": 0.3929, + "step": 7198 + }, + { + "epoch": 0.700632603406326, + "grad_norm": 1.3071594576708383, + "learning_rate": 2.17206190599318e-06, + "loss": 0.1265, + "step": 7199 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 1.380455508381796, + "learning_rate": 2.1707622098643646e-06, + "loss": 0.35, + "step": 7200 + }, + { + "epoch": 0.7008272506082726, + "grad_norm": 1.9651320380579909, + "learning_rate": 2.169462794878647e-06, + "loss": 0.4279, + "step": 7201 + }, + { + "epoch": 0.7009245742092457, + "grad_norm": 1.6172999098773795, + "learning_rate": 2.168163661165151e-06, + "loss": 0.4402, + "step": 7202 + }, + { + "epoch": 0.701021897810219, + "grad_norm": 1.3452355800352764, + "learning_rate": 2.166864808852973e-06, + "loss": 0.3383, + "step": 7203 + }, + { + "epoch": 0.7011192214111922, + "grad_norm": 1.7373915549266996, + "learning_rate": 2.16556623807118e-06, + "loss": 0.3531, + "step": 7204 + }, + { + "epoch": 0.7012165450121655, + "grad_norm": 1.4004273988372486, + "learning_rate": 2.164267948948814e-06, + "loss": 0.3775, + "step": 7205 + }, + { + "epoch": 0.7013138686131387, + "grad_norm": 1.5907312863556262, + "learning_rate": 2.1629699416148832e-06, + "loss": 0.5438, + "step": 7206 + }, + { + "epoch": 0.701411192214112, + "grad_norm": 1.3487333554433167, + "learning_rate": 2.1616722161983734e-06, + "loss": 0.3756, + "step": 7207 + }, + { + "epoch": 0.7015085158150851, + "grad_norm": 1.6982980623535286, + "learning_rate": 2.1603747728282395e-06, + "loss": 0.7085, + "step": 7208 + }, + { + "epoch": 0.7016058394160584, + "grad_norm": 1.5047427538652773, + "learning_rate": 2.1590776116334117e-06, + "loss": 0.3471, + "step": 7209 + }, + { + "epoch": 0.7017031630170316, + "grad_norm": 1.385831092095565, + "learning_rate": 2.157780732742786e-06, + "loss": 0.4245, + "step": 7210 + }, + { + "epoch": 0.7018004866180049, + "grad_norm": 1.4313131793207814, + "learning_rate": 2.1564841362852363e-06, + "loss": 0.3844, + "step": 7211 + }, + { + "epoch": 0.7018978102189781, + "grad_norm": 1.3541322654853878, + "learning_rate": 2.155187822389605e-06, + "loss": 0.3525, + "step": 7212 + }, + { + "epoch": 0.7019951338199514, + "grad_norm": 1.4374640916402917, + "learning_rate": 2.1538917911847085e-06, + "loss": 0.507, + "step": 7213 + }, + { + "epoch": 0.7020924574209245, + "grad_norm": 1.3297508610154067, + "learning_rate": 2.1525960427993346e-06, + "loss": 0.3304, + "step": 7214 + }, + { + "epoch": 0.7021897810218978, + "grad_norm": 1.2265902959504746, + "learning_rate": 2.151300577362242e-06, + "loss": 0.2403, + "step": 7215 + }, + { + "epoch": 0.702287104622871, + "grad_norm": 1.670150866061289, + "learning_rate": 2.1500053950021633e-06, + "loss": 0.4734, + "step": 7216 + }, + { + "epoch": 0.7023844282238443, + "grad_norm": 1.3763959907342531, + "learning_rate": 2.148710495847799e-06, + "loss": 0.3496, + "step": 7217 + }, + { + "epoch": 0.7024817518248175, + "grad_norm": 1.4436342960910218, + "learning_rate": 2.1474158800278246e-06, + "loss": 0.2752, + "step": 7218 + }, + { + "epoch": 0.7025790754257908, + "grad_norm": 1.7331310313106434, + "learning_rate": 2.146121547670888e-06, + "loss": 0.3792, + "step": 7219 + }, + { + "epoch": 0.702676399026764, + "grad_norm": 1.5581286508430956, + "learning_rate": 2.144827498905609e-06, + "loss": 0.4911, + "step": 7220 + }, + { + "epoch": 0.7027737226277372, + "grad_norm": 1.421881623305101, + "learning_rate": 2.1435337338605742e-06, + "loss": 0.3418, + "step": 7221 + }, + { + "epoch": 0.7028710462287104, + "grad_norm": 1.6512467731268148, + "learning_rate": 2.1422402526643486e-06, + "loss": 0.3426, + "step": 7222 + }, + { + "epoch": 0.7029683698296837, + "grad_norm": 1.3544651240565264, + "learning_rate": 2.1409470554454655e-06, + "loss": 0.2857, + "step": 7223 + }, + { + "epoch": 0.703065693430657, + "grad_norm": 1.558915260444135, + "learning_rate": 2.1396541423324307e-06, + "loss": 0.4269, + "step": 7224 + }, + { + "epoch": 0.7031630170316302, + "grad_norm": 1.4604893777389836, + "learning_rate": 2.1383615134537223e-06, + "loss": 0.2933, + "step": 7225 + }, + { + "epoch": 0.7032603406326035, + "grad_norm": 1.7031215002741422, + "learning_rate": 2.1370691689377885e-06, + "loss": 0.3728, + "step": 7226 + }, + { + "epoch": 0.7033576642335766, + "grad_norm": 1.572370556475196, + "learning_rate": 2.135777108913053e-06, + "loss": 0.2713, + "step": 7227 + }, + { + "epoch": 0.7034549878345498, + "grad_norm": 1.7156749613840372, + "learning_rate": 2.134485333507905e-06, + "loss": 0.2822, + "step": 7228 + }, + { + "epoch": 0.7035523114355231, + "grad_norm": 1.4675217220381886, + "learning_rate": 2.133193842850711e-06, + "loss": 0.3578, + "step": 7229 + }, + { + "epoch": 0.7036496350364964, + "grad_norm": 1.4900235768187389, + "learning_rate": 2.1319026370698065e-06, + "loss": 0.4143, + "step": 7230 + }, + { + "epoch": 0.7037469586374696, + "grad_norm": 1.4269592435399123, + "learning_rate": 2.130611716293502e-06, + "loss": 0.4578, + "step": 7231 + }, + { + "epoch": 0.7038442822384429, + "grad_norm": 1.5084790747749248, + "learning_rate": 2.129321080650073e-06, + "loss": 0.4599, + "step": 7232 + }, + { + "epoch": 0.703941605839416, + "grad_norm": 1.6912310291475308, + "learning_rate": 2.1280307302677726e-06, + "loss": 0.3304, + "step": 7233 + }, + { + "epoch": 0.7040389294403893, + "grad_norm": 1.5163234399179417, + "learning_rate": 2.1267406652748236e-06, + "loss": 0.3233, + "step": 7234 + }, + { + "epoch": 0.7041362530413625, + "grad_norm": 1.6329811410650077, + "learning_rate": 2.125450885799421e-06, + "loss": 0.4944, + "step": 7235 + }, + { + "epoch": 0.7042335766423358, + "grad_norm": 1.5393893286982532, + "learning_rate": 2.124161391969731e-06, + "loss": 0.4636, + "step": 7236 + }, + { + "epoch": 0.704330900243309, + "grad_norm": 2.0255758997196085, + "learning_rate": 2.1228721839138906e-06, + "loss": 0.5764, + "step": 7237 + }, + { + "epoch": 0.7044282238442823, + "grad_norm": 1.6161919076957478, + "learning_rate": 2.121583261760011e-06, + "loss": 0.345, + "step": 7238 + }, + { + "epoch": 0.7045255474452555, + "grad_norm": 1.4336322498608862, + "learning_rate": 2.120294625636171e-06, + "loss": 0.4276, + "step": 7239 + }, + { + "epoch": 0.7046228710462287, + "grad_norm": 1.696929160811595, + "learning_rate": 2.119006275670424e-06, + "loss": 0.5299, + "step": 7240 + }, + { + "epoch": 0.7047201946472019, + "grad_norm": 1.5082479055811553, + "learning_rate": 2.1177182119907958e-06, + "loss": 0.3232, + "step": 7241 + }, + { + "epoch": 0.7048175182481752, + "grad_norm": 1.1739434869476788, + "learning_rate": 2.116430434725279e-06, + "loss": 0.2029, + "step": 7242 + }, + { + "epoch": 0.7049148418491484, + "grad_norm": 1.5109430733137095, + "learning_rate": 2.1151429440018417e-06, + "loss": 0.3887, + "step": 7243 + }, + { + "epoch": 0.7050121654501217, + "grad_norm": 1.935841989200906, + "learning_rate": 2.1138557399484235e-06, + "loss": 0.3736, + "step": 7244 + }, + { + "epoch": 0.7051094890510949, + "grad_norm": 1.5928740992917356, + "learning_rate": 2.112568822692934e-06, + "loss": 0.4092, + "step": 7245 + }, + { + "epoch": 0.7052068126520681, + "grad_norm": 1.6046198250262993, + "learning_rate": 2.111282192363256e-06, + "loss": 0.434, + "step": 7246 + }, + { + "epoch": 0.7053041362530413, + "grad_norm": 1.9174718271346842, + "learning_rate": 2.109995849087242e-06, + "loss": 0.3959, + "step": 7247 + }, + { + "epoch": 0.7054014598540146, + "grad_norm": 1.7142164608045882, + "learning_rate": 2.1087097929927164e-06, + "loss": 0.3613, + "step": 7248 + }, + { + "epoch": 0.7054987834549878, + "grad_norm": 1.5553534533516506, + "learning_rate": 2.107424024207478e-06, + "loss": 0.3619, + "step": 7249 + }, + { + "epoch": 0.7055961070559611, + "grad_norm": 1.2917160004249324, + "learning_rate": 2.1061385428592902e-06, + "loss": 0.2497, + "step": 7250 + }, + { + "epoch": 0.7056934306569344, + "grad_norm": 1.5544453341251494, + "learning_rate": 2.104853349075894e-06, + "loss": 0.4396, + "step": 7251 + }, + { + "epoch": 0.7057907542579075, + "grad_norm": 1.4486418447203926, + "learning_rate": 2.1035684429850025e-06, + "loss": 0.3191, + "step": 7252 + }, + { + "epoch": 0.7058880778588807, + "grad_norm": 1.334915618015605, + "learning_rate": 2.1022838247142922e-06, + "loss": 0.2687, + "step": 7253 + }, + { + "epoch": 0.705985401459854, + "grad_norm": 1.421274414658012, + "learning_rate": 2.1009994943914202e-06, + "loss": 0.3803, + "step": 7254 + }, + { + "epoch": 0.7060827250608273, + "grad_norm": 1.9452654361776698, + "learning_rate": 2.09971545214401e-06, + "loss": 0.2487, + "step": 7255 + }, + { + "epoch": 0.7061800486618005, + "grad_norm": 1.727643464310145, + "learning_rate": 2.0984316980996578e-06, + "loss": 0.4087, + "step": 7256 + }, + { + "epoch": 0.7062773722627738, + "grad_norm": 1.4088391522390744, + "learning_rate": 2.0971482323859318e-06, + "loss": 0.28, + "step": 7257 + }, + { + "epoch": 0.7063746958637469, + "grad_norm": 1.3973055046006424, + "learning_rate": 2.0958650551303695e-06, + "loss": 0.372, + "step": 7258 + }, + { + "epoch": 0.7064720194647202, + "grad_norm": 1.549127499292334, + "learning_rate": 2.0945821664604837e-06, + "loss": 0.3205, + "step": 7259 + }, + { + "epoch": 0.7065693430656934, + "grad_norm": 1.0827124479234114, + "learning_rate": 2.093299566503752e-06, + "loss": 0.2787, + "step": 7260 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 1.6304222023520052, + "learning_rate": 2.092017255387629e-06, + "loss": 0.2756, + "step": 7261 + }, + { + "epoch": 0.7067639902676399, + "grad_norm": 1.4801226289309897, + "learning_rate": 2.0907352332395388e-06, + "loss": 0.405, + "step": 7262 + }, + { + "epoch": 0.7068613138686132, + "grad_norm": 1.3963649031745562, + "learning_rate": 2.0894535001868783e-06, + "loss": 0.3958, + "step": 7263 + }, + { + "epoch": 0.7069586374695864, + "grad_norm": 1.58219204309223, + "learning_rate": 2.088172056357011e-06, + "loss": 0.5997, + "step": 7264 + }, + { + "epoch": 0.7070559610705596, + "grad_norm": 1.4672308024718428, + "learning_rate": 2.086890901877276e-06, + "loss": 0.2781, + "step": 7265 + }, + { + "epoch": 0.7071532846715328, + "grad_norm": 1.400396200724931, + "learning_rate": 2.0856100368749825e-06, + "loss": 0.2796, + "step": 7266 + }, + { + "epoch": 0.7072506082725061, + "grad_norm": 1.5108770348522642, + "learning_rate": 2.084329461477411e-06, + "loss": 0.4326, + "step": 7267 + }, + { + "epoch": 0.7073479318734793, + "grad_norm": 1.5513218626825842, + "learning_rate": 2.0830491758118133e-06, + "loss": 0.4045, + "step": 7268 + }, + { + "epoch": 0.7074452554744526, + "grad_norm": 1.2850462335959933, + "learning_rate": 2.081769180005412e-06, + "loss": 0.2287, + "step": 7269 + }, + { + "epoch": 0.7075425790754258, + "grad_norm": 1.3841791673188049, + "learning_rate": 2.080489474185402e-06, + "loss": 0.3396, + "step": 7270 + }, + { + "epoch": 0.707639902676399, + "grad_norm": 1.6295855452775228, + "learning_rate": 2.079210058478946e-06, + "loss": 0.4433, + "step": 7271 + }, + { + "epoch": 0.7077372262773722, + "grad_norm": 1.4812262617695882, + "learning_rate": 2.077930933013182e-06, + "loss": 0.4124, + "step": 7272 + }, + { + "epoch": 0.7078345498783455, + "grad_norm": 1.3318644220584606, + "learning_rate": 2.076652097915217e-06, + "loss": 0.3889, + "step": 7273 + }, + { + "epoch": 0.7079318734793187, + "grad_norm": 1.5745332843827564, + "learning_rate": 2.075373553312132e-06, + "loss": 0.4756, + "step": 7274 + }, + { + "epoch": 0.708029197080292, + "grad_norm": 1.2240783416135876, + "learning_rate": 2.074095299330973e-06, + "loss": 0.2625, + "step": 7275 + }, + { + "epoch": 0.7081265206812652, + "grad_norm": 1.8230937617427208, + "learning_rate": 2.072817336098761e-06, + "loss": 0.2453, + "step": 7276 + }, + { + "epoch": 0.7082238442822384, + "grad_norm": 1.8841110120492097, + "learning_rate": 2.0715396637424913e-06, + "loss": 0.3926, + "step": 7277 + }, + { + "epoch": 0.7083211678832116, + "grad_norm": 1.6873300961333146, + "learning_rate": 2.0702622823891245e-06, + "loss": 0.4677, + "step": 7278 + }, + { + "epoch": 0.7084184914841849, + "grad_norm": 1.6791598178670215, + "learning_rate": 2.068985192165595e-06, + "loss": 0.5022, + "step": 7279 + }, + { + "epoch": 0.7085158150851582, + "grad_norm": 1.6623388685571951, + "learning_rate": 2.067708393198809e-06, + "loss": 0.4684, + "step": 7280 + }, + { + "epoch": 0.7086131386861314, + "grad_norm": 1.5630837685120604, + "learning_rate": 2.066431885615644e-06, + "loss": 0.347, + "step": 7281 + }, + { + "epoch": 0.7087104622871047, + "grad_norm": 1.5796915720237221, + "learning_rate": 2.065155669542944e-06, + "loss": 0.3138, + "step": 7282 + }, + { + "epoch": 0.7088077858880779, + "grad_norm": 1.5385357105275, + "learning_rate": 2.0638797451075287e-06, + "loss": 0.4928, + "step": 7283 + }, + { + "epoch": 0.708905109489051, + "grad_norm": 1.520937937792685, + "learning_rate": 2.0626041124361886e-06, + "loss": 0.2855, + "step": 7284 + }, + { + "epoch": 0.7090024330900243, + "grad_norm": 1.5796249362166424, + "learning_rate": 2.061328771655684e-06, + "loss": 0.4179, + "step": 7285 + }, + { + "epoch": 0.7090997566909976, + "grad_norm": 1.5614924424788936, + "learning_rate": 2.0600537228927444e-06, + "loss": 0.3953, + "step": 7286 + }, + { + "epoch": 0.7091970802919708, + "grad_norm": 1.3774488490139227, + "learning_rate": 2.0587789662740713e-06, + "loss": 0.3307, + "step": 7287 + }, + { + "epoch": 0.7092944038929441, + "grad_norm": 1.3458357304098003, + "learning_rate": 2.057504501926344e-06, + "loss": 0.3622, + "step": 7288 + }, + { + "epoch": 0.7093917274939173, + "grad_norm": 1.2658219360849698, + "learning_rate": 2.056230329976201e-06, + "loss": 0.1807, + "step": 7289 + }, + { + "epoch": 0.7094890510948905, + "grad_norm": 1.2012401530391155, + "learning_rate": 2.05495645055026e-06, + "loss": 0.2049, + "step": 7290 + }, + { + "epoch": 0.7095863746958637, + "grad_norm": 2.3124331902827993, + "learning_rate": 2.053682863775106e-06, + "loss": 0.409, + "step": 7291 + }, + { + "epoch": 0.709683698296837, + "grad_norm": 1.4892275547643432, + "learning_rate": 2.0524095697772988e-06, + "loss": 0.3969, + "step": 7292 + }, + { + "epoch": 0.7097810218978102, + "grad_norm": 1.4712830472879161, + "learning_rate": 2.0511365686833627e-06, + "loss": 0.3875, + "step": 7293 + }, + { + "epoch": 0.7098783454987835, + "grad_norm": 1.6260555213795773, + "learning_rate": 2.0498638606197984e-06, + "loss": 0.2639, + "step": 7294 + }, + { + "epoch": 0.7099756690997567, + "grad_norm": 1.5898185559275262, + "learning_rate": 2.048591445713077e-06, + "loss": 0.2962, + "step": 7295 + }, + { + "epoch": 0.7100729927007299, + "grad_norm": 1.5035126406332018, + "learning_rate": 2.047319324089636e-06, + "loss": 0.2615, + "step": 7296 + }, + { + "epoch": 0.7101703163017031, + "grad_norm": 1.4568415868618678, + "learning_rate": 2.0460474958758892e-06, + "loss": 0.1855, + "step": 7297 + }, + { + "epoch": 0.7102676399026764, + "grad_norm": 1.644865097265397, + "learning_rate": 2.044775961198216e-06, + "loss": 0.5524, + "step": 7298 + }, + { + "epoch": 0.7103649635036496, + "grad_norm": 1.4530566619883096, + "learning_rate": 2.0435047201829754e-06, + "loss": 0.3994, + "step": 7299 + }, + { + "epoch": 0.7104622871046229, + "grad_norm": 1.4925035072170436, + "learning_rate": 2.0422337729564868e-06, + "loss": 0.4369, + "step": 7300 + }, + { + "epoch": 0.7105596107055961, + "grad_norm": 1.7043862528581615, + "learning_rate": 2.040963119645046e-06, + "loss": 0.4079, + "step": 7301 + }, + { + "epoch": 0.7106569343065693, + "grad_norm": 2.246805678612703, + "learning_rate": 2.0396927603749177e-06, + "loss": 0.3244, + "step": 7302 + }, + { + "epoch": 0.7107542579075425, + "grad_norm": 1.4155326493141775, + "learning_rate": 2.0384226952723424e-06, + "loss": 0.3502, + "step": 7303 + }, + { + "epoch": 0.7108515815085158, + "grad_norm": 1.3551689937823195, + "learning_rate": 2.037152924463522e-06, + "loss": 0.3597, + "step": 7304 + }, + { + "epoch": 0.710948905109489, + "grad_norm": 1.550087800154122, + "learning_rate": 2.0358834480746363e-06, + "loss": 0.273, + "step": 7305 + }, + { + "epoch": 0.7110462287104623, + "grad_norm": 1.7563838324004037, + "learning_rate": 2.034614266231837e-06, + "loss": 0.243, + "step": 7306 + }, + { + "epoch": 0.7111435523114356, + "grad_norm": 1.6961123078471805, + "learning_rate": 2.033345379061238e-06, + "loss": 0.4612, + "step": 7307 + }, + { + "epoch": 0.7112408759124088, + "grad_norm": 1.385500864706661, + "learning_rate": 2.0320767866889307e-06, + "loss": 0.2183, + "step": 7308 + }, + { + "epoch": 0.711338199513382, + "grad_norm": 1.5569116985784421, + "learning_rate": 2.030808489240979e-06, + "loss": 0.3159, + "step": 7309 + }, + { + "epoch": 0.7114355231143552, + "grad_norm": 1.3231430861183433, + "learning_rate": 2.0295404868434148e-06, + "loss": 0.4136, + "step": 7310 + }, + { + "epoch": 0.7115328467153285, + "grad_norm": 1.4485263044176464, + "learning_rate": 2.028272779622236e-06, + "loss": 0.2944, + "step": 7311 + }, + { + "epoch": 0.7116301703163017, + "grad_norm": 1.2476502267289642, + "learning_rate": 2.027005367703418e-06, + "loss": 0.2561, + "step": 7312 + }, + { + "epoch": 0.711727493917275, + "grad_norm": 1.6629904575681997, + "learning_rate": 2.025738251212906e-06, + "loss": 0.3561, + "step": 7313 + }, + { + "epoch": 0.7118248175182482, + "grad_norm": 1.6041120379505454, + "learning_rate": 2.0244714302766094e-06, + "loss": 0.3547, + "step": 7314 + }, + { + "epoch": 0.7119221411192214, + "grad_norm": 1.6350136116464489, + "learning_rate": 2.0232049050204167e-06, + "loss": 0.336, + "step": 7315 + }, + { + "epoch": 0.7120194647201946, + "grad_norm": 1.9283862340402942, + "learning_rate": 2.0219386755701814e-06, + "loss": 0.3013, + "step": 7316 + }, + { + "epoch": 0.7121167883211679, + "grad_norm": 1.374982747056398, + "learning_rate": 2.020672742051733e-06, + "loss": 0.3457, + "step": 7317 + }, + { + "epoch": 0.7122141119221411, + "grad_norm": 1.850911472645716, + "learning_rate": 2.0194071045908614e-06, + "loss": 0.4455, + "step": 7318 + }, + { + "epoch": 0.7123114355231144, + "grad_norm": 1.7314643837297474, + "learning_rate": 2.0181417633133393e-06, + "loss": 0.4427, + "step": 7319 + }, + { + "epoch": 0.7124087591240876, + "grad_norm": 1.707210854306524, + "learning_rate": 2.0168767183449035e-06, + "loss": 0.4578, + "step": 7320 + }, + { + "epoch": 0.7125060827250608, + "grad_norm": 1.4372386694829955, + "learning_rate": 2.0156119698112635e-06, + "loss": 0.1912, + "step": 7321 + }, + { + "epoch": 0.712603406326034, + "grad_norm": 1.6077116731664434, + "learning_rate": 2.0143475178380944e-06, + "loss": 0.6271, + "step": 7322 + }, + { + "epoch": 0.7127007299270073, + "grad_norm": 2.739238054880841, + "learning_rate": 2.0130833625510477e-06, + "loss": 0.4412, + "step": 7323 + }, + { + "epoch": 0.7127980535279805, + "grad_norm": 1.5348438983782409, + "learning_rate": 2.011819504075745e-06, + "loss": 0.3312, + "step": 7324 + }, + { + "epoch": 0.7128953771289538, + "grad_norm": 1.3127844366040244, + "learning_rate": 2.010555942537773e-06, + "loss": 0.1759, + "step": 7325 + }, + { + "epoch": 0.712992700729927, + "grad_norm": 1.5127959442651164, + "learning_rate": 2.0092926780626947e-06, + "loss": 0.5347, + "step": 7326 + }, + { + "epoch": 0.7130900243309003, + "grad_norm": 1.2770987926167694, + "learning_rate": 2.008029710776041e-06, + "loss": 0.3241, + "step": 7327 + }, + { + "epoch": 0.7131873479318734, + "grad_norm": 1.2580494627848673, + "learning_rate": 2.006767040803314e-06, + "loss": 0.4097, + "step": 7328 + }, + { + "epoch": 0.7132846715328467, + "grad_norm": 1.6406502857984475, + "learning_rate": 2.005504668269986e-06, + "loss": 0.4601, + "step": 7329 + }, + { + "epoch": 0.71338199513382, + "grad_norm": 1.3653272779996029, + "learning_rate": 2.0042425933014993e-06, + "loss": 0.2141, + "step": 7330 + }, + { + "epoch": 0.7134793187347932, + "grad_norm": 1.5720487646445356, + "learning_rate": 2.0029808160232693e-06, + "loss": 0.4073, + "step": 7331 + }, + { + "epoch": 0.7135766423357665, + "grad_norm": 1.3082083063259358, + "learning_rate": 2.001719336560676e-06, + "loss": 0.2681, + "step": 7332 + }, + { + "epoch": 0.7136739659367397, + "grad_norm": 1.4572110717097326, + "learning_rate": 2.0004581550390754e-06, + "loss": 0.344, + "step": 7333 + }, + { + "epoch": 0.7137712895377128, + "grad_norm": 1.6816033371392265, + "learning_rate": 1.9991972715837923e-06, + "loss": 0.2645, + "step": 7334 + }, + { + "epoch": 0.7138686131386861, + "grad_norm": 1.6690230242659283, + "learning_rate": 1.997936686320121e-06, + "loss": 0.475, + "step": 7335 + }, + { + "epoch": 0.7139659367396594, + "grad_norm": 1.7644331293533329, + "learning_rate": 1.996676399373326e-06, + "loss": 0.2565, + "step": 7336 + }, + { + "epoch": 0.7140632603406326, + "grad_norm": 1.612776929155974, + "learning_rate": 1.995416410868642e-06, + "loss": 0.3706, + "step": 7337 + }, + { + "epoch": 0.7141605839416059, + "grad_norm": 1.5376075800033044, + "learning_rate": 1.9941567209312768e-06, + "loss": 0.3616, + "step": 7338 + }, + { + "epoch": 0.7142579075425791, + "grad_norm": 1.4816955475769065, + "learning_rate": 1.992897329686405e-06, + "loss": 0.3392, + "step": 7339 + }, + { + "epoch": 0.7143552311435523, + "grad_norm": 1.7337601405193037, + "learning_rate": 1.9916382372591745e-06, + "loss": 0.443, + "step": 7340 + }, + { + "epoch": 0.7144525547445255, + "grad_norm": 1.7118218631582327, + "learning_rate": 1.9903794437747015e-06, + "loss": 0.2962, + "step": 7341 + }, + { + "epoch": 0.7145498783454988, + "grad_norm": 1.5734844909136385, + "learning_rate": 1.989120949358075e-06, + "loss": 0.4402, + "step": 7342 + }, + { + "epoch": 0.714647201946472, + "grad_norm": 1.5819534847785284, + "learning_rate": 1.987862754134348e-06, + "loss": 0.5898, + "step": 7343 + }, + { + "epoch": 0.7147445255474453, + "grad_norm": 1.4472239458721015, + "learning_rate": 1.986604858228551e-06, + "loss": 0.3767, + "step": 7344 + }, + { + "epoch": 0.7148418491484185, + "grad_norm": 1.489008528060522, + "learning_rate": 1.985347261765681e-06, + "loss": 0.3202, + "step": 7345 + }, + { + "epoch": 0.7149391727493918, + "grad_norm": 1.8538140236620748, + "learning_rate": 1.9840899648707086e-06, + "loss": 0.6271, + "step": 7346 + }, + { + "epoch": 0.7150364963503649, + "grad_norm": 1.4904130701041753, + "learning_rate": 1.9828329676685683e-06, + "loss": 0.2491, + "step": 7347 + }, + { + "epoch": 0.7151338199513382, + "grad_norm": 1.4421192256351953, + "learning_rate": 1.98157627028417e-06, + "loss": 0.4727, + "step": 7348 + }, + { + "epoch": 0.7152311435523114, + "grad_norm": 1.6648140226634214, + "learning_rate": 1.9803198728423937e-06, + "loss": 0.5102, + "step": 7349 + }, + { + "epoch": 0.7153284671532847, + "grad_norm": 1.411774190585682, + "learning_rate": 1.9790637754680876e-06, + "loss": 0.4225, + "step": 7350 + }, + { + "epoch": 0.7154257907542579, + "grad_norm": 1.3372439415652364, + "learning_rate": 1.9778079782860704e-06, + "loss": 0.3501, + "step": 7351 + }, + { + "epoch": 0.7155231143552312, + "grad_norm": 1.7637877230864818, + "learning_rate": 1.9765524814211322e-06, + "loss": 0.5255, + "step": 7352 + }, + { + "epoch": 0.7156204379562043, + "grad_norm": 1.4869244014811966, + "learning_rate": 1.9752972849980346e-06, + "loss": 0.3127, + "step": 7353 + }, + { + "epoch": 0.7157177615571776, + "grad_norm": 1.3292692752722683, + "learning_rate": 1.9740423891415028e-06, + "loss": 0.2651, + "step": 7354 + }, + { + "epoch": 0.7158150851581508, + "grad_norm": 1.6975975282767923, + "learning_rate": 1.9727877939762386e-06, + "loss": 0.2625, + "step": 7355 + }, + { + "epoch": 0.7159124087591241, + "grad_norm": 1.5581024848911809, + "learning_rate": 1.971533499626912e-06, + "loss": 0.427, + "step": 7356 + }, + { + "epoch": 0.7160097323600974, + "grad_norm": 1.4809229815725913, + "learning_rate": 1.9702795062181643e-06, + "loss": 0.2638, + "step": 7357 + }, + { + "epoch": 0.7161070559610706, + "grad_norm": 1.595423847911922, + "learning_rate": 1.969025813874604e-06, + "loss": 0.4008, + "step": 7358 + }, + { + "epoch": 0.7162043795620437, + "grad_norm": 1.4893621889652744, + "learning_rate": 1.96777242272081e-06, + "loss": 0.379, + "step": 7359 + }, + { + "epoch": 0.716301703163017, + "grad_norm": 1.3296368689036449, + "learning_rate": 1.9665193328813348e-06, + "loss": 0.4045, + "step": 7360 + }, + { + "epoch": 0.7163990267639903, + "grad_norm": 1.3756879905965664, + "learning_rate": 1.965266544480698e-06, + "loss": 0.3101, + "step": 7361 + }, + { + "epoch": 0.7164963503649635, + "grad_norm": 1.6694089581439444, + "learning_rate": 1.9640140576433895e-06, + "loss": 0.3509, + "step": 7362 + }, + { + "epoch": 0.7165936739659368, + "grad_norm": 1.6063601207464078, + "learning_rate": 1.962761872493871e-06, + "loss": 0.3587, + "step": 7363 + }, + { + "epoch": 0.71669099756691, + "grad_norm": 1.696073018724421, + "learning_rate": 1.961509989156573e-06, + "loss": 0.4831, + "step": 7364 + }, + { + "epoch": 0.7167883211678832, + "grad_norm": 1.6148366749852572, + "learning_rate": 1.960258407755894e-06, + "loss": 0.4433, + "step": 7365 + }, + { + "epoch": 0.7168856447688564, + "grad_norm": 1.4925875302307703, + "learning_rate": 1.9590071284162047e-06, + "loss": 0.5568, + "step": 7366 + }, + { + "epoch": 0.7169829683698297, + "grad_norm": 1.7774234292750941, + "learning_rate": 1.9577561512618467e-06, + "loss": 0.4624, + "step": 7367 + }, + { + "epoch": 0.7170802919708029, + "grad_norm": 1.500476761391062, + "learning_rate": 1.956505476417131e-06, + "loss": 0.3658, + "step": 7368 + }, + { + "epoch": 0.7171776155717762, + "grad_norm": 1.711026960150487, + "learning_rate": 1.9552551040063357e-06, + "loss": 0.4338, + "step": 7369 + }, + { + "epoch": 0.7172749391727494, + "grad_norm": 1.6584324402962876, + "learning_rate": 1.9540050341537116e-06, + "loss": 0.3729, + "step": 7370 + }, + { + "epoch": 0.7173722627737227, + "grad_norm": 1.4849547207797662, + "learning_rate": 1.9527552669834797e-06, + "loss": 0.3447, + "step": 7371 + }, + { + "epoch": 0.7174695863746958, + "grad_norm": 1.7231015691592764, + "learning_rate": 1.9515058026198303e-06, + "loss": 0.4579, + "step": 7372 + }, + { + "epoch": 0.7175669099756691, + "grad_norm": 1.8032406378148262, + "learning_rate": 1.9502566411869223e-06, + "loss": 0.4854, + "step": 7373 + }, + { + "epoch": 0.7176642335766423, + "grad_norm": 1.6502932548014797, + "learning_rate": 1.949007782808887e-06, + "loss": 0.5288, + "step": 7374 + }, + { + "epoch": 0.7177615571776156, + "grad_norm": 1.0490991679065296, + "learning_rate": 1.947759227609825e-06, + "loss": 0.1858, + "step": 7375 + }, + { + "epoch": 0.7178588807785888, + "grad_norm": 1.3485009241493462, + "learning_rate": 1.9465109757138036e-06, + "loss": 0.3298, + "step": 7376 + }, + { + "epoch": 0.7179562043795621, + "grad_norm": 1.5255441038944912, + "learning_rate": 1.9452630272448625e-06, + "loss": 0.3813, + "step": 7377 + }, + { + "epoch": 0.7180535279805352, + "grad_norm": 1.4921210309148643, + "learning_rate": 1.9440153823270136e-06, + "loss": 0.4981, + "step": 7378 + }, + { + "epoch": 0.7181508515815085, + "grad_norm": 2.2890956212274225, + "learning_rate": 1.9427680410842335e-06, + "loss": 0.4214, + "step": 7379 + }, + { + "epoch": 0.7182481751824817, + "grad_norm": 1.6488403985010205, + "learning_rate": 1.9415210036404717e-06, + "loss": 0.5842, + "step": 7380 + }, + { + "epoch": 0.718345498783455, + "grad_norm": 1.5151237366759736, + "learning_rate": 1.940274270119648e-06, + "loss": 0.1621, + "step": 7381 + }, + { + "epoch": 0.7184428223844282, + "grad_norm": 1.4246189344089062, + "learning_rate": 1.939027840645651e-06, + "loss": 0.5241, + "step": 7382 + }, + { + "epoch": 0.7185401459854015, + "grad_norm": 1.5475923050327032, + "learning_rate": 1.9377817153423383e-06, + "loss": 0.3981, + "step": 7383 + }, + { + "epoch": 0.7186374695863746, + "grad_norm": 1.4850545833330557, + "learning_rate": 1.9365358943335387e-06, + "loss": 0.3012, + "step": 7384 + }, + { + "epoch": 0.7187347931873479, + "grad_norm": 1.5840995925410577, + "learning_rate": 1.935290377743051e-06, + "loss": 0.4683, + "step": 7385 + }, + { + "epoch": 0.7188321167883212, + "grad_norm": 1.4184088746397274, + "learning_rate": 1.934045165694643e-06, + "loss": 0.3329, + "step": 7386 + }, + { + "epoch": 0.7189294403892944, + "grad_norm": 1.4811417552171877, + "learning_rate": 1.93280025831205e-06, + "loss": 0.338, + "step": 7387 + }, + { + "epoch": 0.7190267639902677, + "grad_norm": 1.5303607616327723, + "learning_rate": 1.9315556557189807e-06, + "loss": 0.3391, + "step": 7388 + }, + { + "epoch": 0.7191240875912409, + "grad_norm": 1.4338894157173863, + "learning_rate": 1.9303113580391137e-06, + "loss": 0.3391, + "step": 7389 + }, + { + "epoch": 0.7192214111922142, + "grad_norm": 1.4985191665049908, + "learning_rate": 1.9290673653960925e-06, + "loss": 0.3342, + "step": 7390 + }, + { + "epoch": 0.7193187347931873, + "grad_norm": 1.1863657905495277, + "learning_rate": 1.927823677913534e-06, + "loss": 0.2971, + "step": 7391 + }, + { + "epoch": 0.7194160583941606, + "grad_norm": 1.8946232409609065, + "learning_rate": 1.9265802957150247e-06, + "loss": 0.4294, + "step": 7392 + }, + { + "epoch": 0.7195133819951338, + "grad_norm": 1.3626001618692472, + "learning_rate": 1.9253372189241215e-06, + "loss": 0.2384, + "step": 7393 + }, + { + "epoch": 0.7196107055961071, + "grad_norm": 1.520588641753117, + "learning_rate": 1.924094447664348e-06, + "loss": 0.4144, + "step": 7394 + }, + { + "epoch": 0.7197080291970803, + "grad_norm": 1.739902246294359, + "learning_rate": 1.9228519820592e-06, + "loss": 0.3029, + "step": 7395 + }, + { + "epoch": 0.7198053527980536, + "grad_norm": 1.6409706841453218, + "learning_rate": 1.921609822232143e-06, + "loss": 0.304, + "step": 7396 + }, + { + "epoch": 0.7199026763990267, + "grad_norm": 1.685991672780389, + "learning_rate": 1.9203679683066095e-06, + "loss": 0.2969, + "step": 7397 + }, + { + "epoch": 0.72, + "grad_norm": 1.2915387961969664, + "learning_rate": 1.9191264204060033e-06, + "loss": 0.3805, + "step": 7398 + }, + { + "epoch": 0.7200973236009732, + "grad_norm": 1.8069326463738193, + "learning_rate": 1.9178851786536982e-06, + "loss": 0.4532, + "step": 7399 + }, + { + "epoch": 0.7201946472019465, + "grad_norm": 1.765537901223862, + "learning_rate": 1.9166442431730396e-06, + "loss": 0.3499, + "step": 7400 + }, + { + "epoch": 0.7202919708029197, + "grad_norm": 1.6090757962209266, + "learning_rate": 1.9154036140873355e-06, + "loss": 0.2848, + "step": 7401 + }, + { + "epoch": 0.720389294403893, + "grad_norm": 1.7125111489037463, + "learning_rate": 1.9141632915198705e-06, + "loss": 0.3644, + "step": 7402 + }, + { + "epoch": 0.7204866180048661, + "grad_norm": 1.808305820425723, + "learning_rate": 1.912923275593896e-06, + "loss": 0.4067, + "step": 7403 + }, + { + "epoch": 0.7205839416058394, + "grad_norm": 1.5243495460753071, + "learning_rate": 1.911683566432633e-06, + "loss": 0.4829, + "step": 7404 + }, + { + "epoch": 0.7206812652068126, + "grad_norm": 1.5215120325280256, + "learning_rate": 1.910444164159272e-06, + "loss": 0.2943, + "step": 7405 + }, + { + "epoch": 0.7207785888077859, + "grad_norm": 1.4130748018090948, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.3666, + "step": 7406 + }, + { + "epoch": 0.7208759124087591, + "grad_norm": 1.761320420123769, + "learning_rate": 1.9079662807688686e-06, + "loss": 0.5856, + "step": 7407 + }, + { + "epoch": 0.7209732360097324, + "grad_norm": 1.6747714470325592, + "learning_rate": 1.9067277998980538e-06, + "loss": 0.368, + "step": 7408 + }, + { + "epoch": 0.7210705596107055, + "grad_norm": 1.8571053123870995, + "learning_rate": 1.9054896264075985e-06, + "loss": 0.4352, + "step": 7409 + }, + { + "epoch": 0.7211678832116788, + "grad_norm": 1.6048620706633583, + "learning_rate": 1.9042517604205407e-06, + "loss": 0.3397, + "step": 7410 + }, + { + "epoch": 0.721265206812652, + "grad_norm": 1.8961324630450611, + "learning_rate": 1.9030142020598902e-06, + "loss": 0.4438, + "step": 7411 + }, + { + "epoch": 0.7213625304136253, + "grad_norm": 1.5572943373777353, + "learning_rate": 1.9017769514486207e-06, + "loss": 0.25, + "step": 7412 + }, + { + "epoch": 0.7214598540145986, + "grad_norm": 1.4502591000281142, + "learning_rate": 1.9005400087096793e-06, + "loss": 0.4706, + "step": 7413 + }, + { + "epoch": 0.7215571776155718, + "grad_norm": 1.1964043511092517, + "learning_rate": 1.8993033739659822e-06, + "loss": 0.3361, + "step": 7414 + }, + { + "epoch": 0.7216545012165451, + "grad_norm": 1.4076378649445616, + "learning_rate": 1.898067047340415e-06, + "loss": 0.4033, + "step": 7415 + }, + { + "epoch": 0.7217518248175182, + "grad_norm": 1.5151984804272711, + "learning_rate": 1.8968310289558323e-06, + "loss": 0.4472, + "step": 7416 + }, + { + "epoch": 0.7218491484184915, + "grad_norm": 1.4547577575697022, + "learning_rate": 1.8955953189350567e-06, + "loss": 0.3607, + "step": 7417 + }, + { + "epoch": 0.7219464720194647, + "grad_norm": 1.204193532483432, + "learning_rate": 1.8943599174008848e-06, + "loss": 0.2702, + "step": 7418 + }, + { + "epoch": 0.722043795620438, + "grad_norm": 1.3742336391663825, + "learning_rate": 1.8931248244760746e-06, + "loss": 0.3289, + "step": 7419 + }, + { + "epoch": 0.7221411192214112, + "grad_norm": 1.3547879578896822, + "learning_rate": 1.8918900402833606e-06, + "loss": 0.2889, + "step": 7420 + }, + { + "epoch": 0.7222384428223845, + "grad_norm": 1.6485599996600815, + "learning_rate": 1.8906555649454433e-06, + "loss": 0.6008, + "step": 7421 + }, + { + "epoch": 0.7223357664233576, + "grad_norm": 1.3200782703544742, + "learning_rate": 1.8894213985849957e-06, + "loss": 0.2882, + "step": 7422 + }, + { + "epoch": 0.7224330900243309, + "grad_norm": 1.4505044421417583, + "learning_rate": 1.8881875413246542e-06, + "loss": 0.3701, + "step": 7423 + }, + { + "epoch": 0.7225304136253041, + "grad_norm": 1.5524112773120986, + "learning_rate": 1.886953993287029e-06, + "loss": 0.4277, + "step": 7424 + }, + { + "epoch": 0.7226277372262774, + "grad_norm": 1.5982083809904117, + "learning_rate": 1.8857207545946988e-06, + "loss": 0.567, + "step": 7425 + }, + { + "epoch": 0.7227250608272506, + "grad_norm": 1.7199654876218387, + "learning_rate": 1.8844878253702115e-06, + "loss": 0.4061, + "step": 7426 + }, + { + "epoch": 0.7228223844282239, + "grad_norm": 1.3491542727863441, + "learning_rate": 1.8832552057360842e-06, + "loss": 0.4373, + "step": 7427 + }, + { + "epoch": 0.722919708029197, + "grad_norm": 1.6094547733424174, + "learning_rate": 1.882022895814803e-06, + "loss": 0.3191, + "step": 7428 + }, + { + "epoch": 0.7230170316301703, + "grad_norm": 1.397098987551183, + "learning_rate": 1.880790895728824e-06, + "loss": 0.4295, + "step": 7429 + }, + { + "epoch": 0.7231143552311435, + "grad_norm": 1.575388130925685, + "learning_rate": 1.8795592056005696e-06, + "loss": 0.3673, + "step": 7430 + }, + { + "epoch": 0.7232116788321168, + "grad_norm": 1.441498544110725, + "learning_rate": 1.878327825552435e-06, + "loss": 0.2703, + "step": 7431 + }, + { + "epoch": 0.72330900243309, + "grad_norm": 2.3838963779420825, + "learning_rate": 1.8770967557067848e-06, + "loss": 0.4077, + "step": 7432 + }, + { + "epoch": 0.7234063260340633, + "grad_norm": 1.5302649463060127, + "learning_rate": 1.8758659961859477e-06, + "loss": 0.416, + "step": 7433 + }, + { + "epoch": 0.7235036496350365, + "grad_norm": 1.5090417940382483, + "learning_rate": 1.8746355471122269e-06, + "loss": 0.4091, + "step": 7434 + }, + { + "epoch": 0.7236009732360097, + "grad_norm": 2.0436656922849465, + "learning_rate": 1.873405408607893e-06, + "loss": 0.3444, + "step": 7435 + }, + { + "epoch": 0.723698296836983, + "grad_norm": 1.9735598468080733, + "learning_rate": 1.8721755807951848e-06, + "loss": 0.3876, + "step": 7436 + }, + { + "epoch": 0.7237956204379562, + "grad_norm": 1.612500091671017, + "learning_rate": 1.8709460637963123e-06, + "loss": 0.2525, + "step": 7437 + }, + { + "epoch": 0.7238929440389295, + "grad_norm": 1.8219139029535372, + "learning_rate": 1.8697168577334523e-06, + "loss": 0.3253, + "step": 7438 + }, + { + "epoch": 0.7239902676399027, + "grad_norm": 1.6323361226616875, + "learning_rate": 1.8684879627287521e-06, + "loss": 0.2406, + "step": 7439 + }, + { + "epoch": 0.724087591240876, + "grad_norm": 1.4304783202538187, + "learning_rate": 1.8672593789043298e-06, + "loss": 0.3799, + "step": 7440 + }, + { + "epoch": 0.7241849148418491, + "grad_norm": 1.5073004925550073, + "learning_rate": 1.8660311063822668e-06, + "loss": 0.2575, + "step": 7441 + }, + { + "epoch": 0.7242822384428224, + "grad_norm": 1.4290346416441566, + "learning_rate": 1.8648031452846187e-06, + "loss": 0.3965, + "step": 7442 + }, + { + "epoch": 0.7243795620437956, + "grad_norm": 1.3600399101235618, + "learning_rate": 1.8635754957334113e-06, + "loss": 0.3688, + "step": 7443 + }, + { + "epoch": 0.7244768856447689, + "grad_norm": 1.7050943450331033, + "learning_rate": 1.8623481578506324e-06, + "loss": 0.5168, + "step": 7444 + }, + { + "epoch": 0.7245742092457421, + "grad_norm": 1.6526543796467772, + "learning_rate": 1.861121131758246e-06, + "loss": 0.3138, + "step": 7445 + }, + { + "epoch": 0.7246715328467154, + "grad_norm": 1.506459948838207, + "learning_rate": 1.8598944175781808e-06, + "loss": 0.3065, + "step": 7446 + }, + { + "epoch": 0.7247688564476885, + "grad_norm": 1.52085845961347, + "learning_rate": 1.85866801543234e-06, + "loss": 0.4524, + "step": 7447 + }, + { + "epoch": 0.7248661800486618, + "grad_norm": 1.65210485873284, + "learning_rate": 1.857441925442588e-06, + "loss": 0.4542, + "step": 7448 + }, + { + "epoch": 0.724963503649635, + "grad_norm": 1.4593649167482394, + "learning_rate": 1.8562161477307632e-06, + "loss": 0.4833, + "step": 7449 + }, + { + "epoch": 0.7250608272506083, + "grad_norm": 1.4846675383764094, + "learning_rate": 1.854990682418674e-06, + "loss": 0.2248, + "step": 7450 + }, + { + "epoch": 0.7251581508515815, + "grad_norm": 1.5011169725315106, + "learning_rate": 1.8537655296280927e-06, + "loss": 0.3209, + "step": 7451 + }, + { + "epoch": 0.7252554744525548, + "grad_norm": 1.5448068221470526, + "learning_rate": 1.8525406894807636e-06, + "loss": 0.3072, + "step": 7452 + }, + { + "epoch": 0.7253527980535279, + "grad_norm": 1.5166637361239672, + "learning_rate": 1.851316162098401e-06, + "loss": 0.312, + "step": 7453 + }, + { + "epoch": 0.7254501216545012, + "grad_norm": 2.4469856168996915, + "learning_rate": 1.8500919476026885e-06, + "loss": 0.1907, + "step": 7454 + }, + { + "epoch": 0.7255474452554744, + "grad_norm": 1.2528443400332674, + "learning_rate": 1.8488680461152737e-06, + "loss": 0.2696, + "step": 7455 + }, + { + "epoch": 0.7256447688564477, + "grad_norm": 1.5811504408060548, + "learning_rate": 1.8476444577577763e-06, + "loss": 0.2853, + "step": 7456 + }, + { + "epoch": 0.7257420924574209, + "grad_norm": 1.6305699622741514, + "learning_rate": 1.846421182651788e-06, + "loss": 0.378, + "step": 7457 + }, + { + "epoch": 0.7258394160583942, + "grad_norm": 1.498698941873111, + "learning_rate": 1.8451982209188673e-06, + "loss": 0.2001, + "step": 7458 + }, + { + "epoch": 0.7259367396593674, + "grad_norm": 1.629436159731002, + "learning_rate": 1.8439755726805365e-06, + "loss": 0.4068, + "step": 7459 + }, + { + "epoch": 0.7260340632603406, + "grad_norm": 1.5834307473315177, + "learning_rate": 1.842753238058293e-06, + "loss": 0.4065, + "step": 7460 + }, + { + "epoch": 0.7261313868613138, + "grad_norm": 1.4983854346433507, + "learning_rate": 1.841531217173602e-06, + "loss": 0.4435, + "step": 7461 + }, + { + "epoch": 0.7262287104622871, + "grad_norm": 1.3738334215293657, + "learning_rate": 1.8403095101478947e-06, + "loss": 0.2611, + "step": 7462 + }, + { + "epoch": 0.7263260340632604, + "grad_norm": 1.3400226173342547, + "learning_rate": 1.8390881171025726e-06, + "loss": 0.3252, + "step": 7463 + }, + { + "epoch": 0.7264233576642336, + "grad_norm": 1.5602768913925888, + "learning_rate": 1.8378670381590074e-06, + "loss": 0.4048, + "step": 7464 + }, + { + "epoch": 0.7265206812652069, + "grad_norm": 1.4515406677772351, + "learning_rate": 1.8366462734385398e-06, + "loss": 0.3211, + "step": 7465 + }, + { + "epoch": 0.72661800486618, + "grad_norm": 1.4969034817400986, + "learning_rate": 1.8354258230624733e-06, + "loss": 0.3098, + "step": 7466 + }, + { + "epoch": 0.7267153284671533, + "grad_norm": 1.558626944205185, + "learning_rate": 1.8342056871520898e-06, + "loss": 0.3423, + "step": 7467 + }, + { + "epoch": 0.7268126520681265, + "grad_norm": 1.6192425188460398, + "learning_rate": 1.832985865828632e-06, + "loss": 0.3948, + "step": 7468 + }, + { + "epoch": 0.7269099756690998, + "grad_norm": 1.6371171052277378, + "learning_rate": 1.8317663592133179e-06, + "loss": 0.5652, + "step": 7469 + }, + { + "epoch": 0.727007299270073, + "grad_norm": 3.6764814336280174, + "learning_rate": 1.8305471674273262e-06, + "loss": 0.4885, + "step": 7470 + }, + { + "epoch": 0.7271046228710463, + "grad_norm": 1.5729860538899962, + "learning_rate": 1.8293282905918102e-06, + "loss": 0.4786, + "step": 7471 + }, + { + "epoch": 0.7272019464720194, + "grad_norm": 1.5749943717977084, + "learning_rate": 1.8281097288278926e-06, + "loss": 0.4226, + "step": 7472 + }, + { + "epoch": 0.7272992700729927, + "grad_norm": 1.62902809620438, + "learning_rate": 1.8268914822566597e-06, + "loss": 0.1942, + "step": 7473 + }, + { + "epoch": 0.7273965936739659, + "grad_norm": 2.117448966572181, + "learning_rate": 1.8256735509991697e-06, + "loss": 0.4058, + "step": 7474 + }, + { + "epoch": 0.7274939172749392, + "grad_norm": 1.3913245852374658, + "learning_rate": 1.8244559351764507e-06, + "loss": 0.2375, + "step": 7475 + }, + { + "epoch": 0.7275912408759124, + "grad_norm": 1.5816147997880037, + "learning_rate": 1.8232386349094988e-06, + "loss": 0.2623, + "step": 7476 + }, + { + "epoch": 0.7276885644768857, + "grad_norm": 1.8743158999821843, + "learning_rate": 1.8220216503192728e-06, + "loss": 0.3471, + "step": 7477 + }, + { + "epoch": 0.7277858880778589, + "grad_norm": 1.4117193039437554, + "learning_rate": 1.8208049815267105e-06, + "loss": 0.3223, + "step": 7478 + }, + { + "epoch": 0.7278832116788321, + "grad_norm": 1.7092615567628877, + "learning_rate": 1.8195886286527132e-06, + "loss": 0.3544, + "step": 7479 + }, + { + "epoch": 0.7279805352798053, + "grad_norm": 1.6700786316228415, + "learning_rate": 1.8183725918181466e-06, + "loss": 0.4434, + "step": 7480 + }, + { + "epoch": 0.7280778588807786, + "grad_norm": 1.7136205946843208, + "learning_rate": 1.8171568711438515e-06, + "loss": 0.4275, + "step": 7481 + }, + { + "epoch": 0.7281751824817518, + "grad_norm": 1.222648352454276, + "learning_rate": 1.8159414667506342e-06, + "loss": 0.2422, + "step": 7482 + }, + { + "epoch": 0.7282725060827251, + "grad_norm": 1.4871996856914842, + "learning_rate": 1.8147263787592722e-06, + "loss": 0.2459, + "step": 7483 + }, + { + "epoch": 0.7283698296836983, + "grad_norm": 1.653302863491428, + "learning_rate": 1.8135116072905062e-06, + "loss": 0.5396, + "step": 7484 + }, + { + "epoch": 0.7284671532846715, + "grad_norm": 1.4254774632219502, + "learning_rate": 1.8122971524650501e-06, + "loss": 0.2703, + "step": 7485 + }, + { + "epoch": 0.7285644768856447, + "grad_norm": 1.3145544231317021, + "learning_rate": 1.811083014403585e-06, + "loss": 0.2153, + "step": 7486 + }, + { + "epoch": 0.728661800486618, + "grad_norm": 1.6665665675939265, + "learning_rate": 1.8098691932267615e-06, + "loss": 0.3555, + "step": 7487 + }, + { + "epoch": 0.7287591240875912, + "grad_norm": 1.3389563109981748, + "learning_rate": 1.8086556890551971e-06, + "loss": 0.3668, + "step": 7488 + }, + { + "epoch": 0.7288564476885645, + "grad_norm": 1.9533126130594556, + "learning_rate": 1.8074425020094782e-06, + "loss": 0.3223, + "step": 7489 + }, + { + "epoch": 0.7289537712895378, + "grad_norm": 1.4120579627084073, + "learning_rate": 1.8062296322101619e-06, + "loss": 0.2344, + "step": 7490 + }, + { + "epoch": 0.7290510948905109, + "grad_norm": 1.4524120099307862, + "learning_rate": 1.8050170797777682e-06, + "loss": 0.3027, + "step": 7491 + }, + { + "epoch": 0.7291484184914842, + "grad_norm": 1.317425341870494, + "learning_rate": 1.8038048448327911e-06, + "loss": 0.2884, + "step": 7492 + }, + { + "epoch": 0.7292457420924574, + "grad_norm": 1.691752691227038, + "learning_rate": 1.802592927495691e-06, + "loss": 0.3782, + "step": 7493 + }, + { + "epoch": 0.7293430656934307, + "grad_norm": 1.5613753276577542, + "learning_rate": 1.8013813278868991e-06, + "loss": 0.3915, + "step": 7494 + }, + { + "epoch": 0.7294403892944039, + "grad_norm": 1.432625425350165, + "learning_rate": 1.800170046126809e-06, + "loss": 0.2618, + "step": 7495 + }, + { + "epoch": 0.7295377128953772, + "grad_norm": 1.5721608381715664, + "learning_rate": 1.7989590823357879e-06, + "loss": 0.2403, + "step": 7496 + }, + { + "epoch": 0.7296350364963504, + "grad_norm": 1.4034461998271706, + "learning_rate": 1.7977484366341703e-06, + "loss": 0.3165, + "step": 7497 + }, + { + "epoch": 0.7297323600973236, + "grad_norm": 1.2678595232029757, + "learning_rate": 1.7965381091422585e-06, + "loss": 0.3107, + "step": 7498 + }, + { + "epoch": 0.7298296836982968, + "grad_norm": 1.6011699660681395, + "learning_rate": 1.7953280999803245e-06, + "loss": 0.4329, + "step": 7499 + }, + { + "epoch": 0.7299270072992701, + "grad_norm": 1.5497271512976851, + "learning_rate": 1.7941184092686065e-06, + "loss": 0.5849, + "step": 7500 + }, + { + "epoch": 0.7300243309002433, + "grad_norm": 1.5116501430350027, + "learning_rate": 1.7929090371273144e-06, + "loss": 0.5795, + "step": 7501 + }, + { + "epoch": 0.7301216545012166, + "grad_norm": 1.5721547721499922, + "learning_rate": 1.7916999836766207e-06, + "loss": 0.162, + "step": 7502 + }, + { + "epoch": 0.7302189781021898, + "grad_norm": 1.438460153769702, + "learning_rate": 1.7904912490366723e-06, + "loss": 0.2512, + "step": 7503 + }, + { + "epoch": 0.730316301703163, + "grad_norm": 1.779391205649882, + "learning_rate": 1.7892828333275803e-06, + "loss": 0.3669, + "step": 7504 + }, + { + "epoch": 0.7304136253041362, + "grad_norm": 1.732222793993075, + "learning_rate": 1.788074736669429e-06, + "loss": 0.4439, + "step": 7505 + }, + { + "epoch": 0.7305109489051095, + "grad_norm": 1.6617211718916, + "learning_rate": 1.7868669591822636e-06, + "loss": 0.2907, + "step": 7506 + }, + { + "epoch": 0.7306082725060827, + "grad_norm": 1.6220761918541853, + "learning_rate": 1.7856595009861028e-06, + "loss": 0.3708, + "step": 7507 + }, + { + "epoch": 0.730705596107056, + "grad_norm": 1.4481589665749108, + "learning_rate": 1.7844523622009336e-06, + "loss": 0.3621, + "step": 7508 + }, + { + "epoch": 0.7308029197080292, + "grad_norm": 1.5460014574011958, + "learning_rate": 1.7832455429467094e-06, + "loss": 0.4051, + "step": 7509 + }, + { + "epoch": 0.7309002433090024, + "grad_norm": 1.6183413752006717, + "learning_rate": 1.7820390433433525e-06, + "loss": 0.4788, + "step": 7510 + }, + { + "epoch": 0.7309975669099756, + "grad_norm": 1.890416220494734, + "learning_rate": 1.7808328635107541e-06, + "loss": 0.4241, + "step": 7511 + }, + { + "epoch": 0.7310948905109489, + "grad_norm": 1.636194258892272, + "learning_rate": 1.7796270035687747e-06, + "loss": 0.431, + "step": 7512 + }, + { + "epoch": 0.7311922141119221, + "grad_norm": 1.787451872811287, + "learning_rate": 1.778421463637237e-06, + "loss": 0.5589, + "step": 7513 + }, + { + "epoch": 0.7312895377128954, + "grad_norm": 1.466410213865472, + "learning_rate": 1.7772162438359392e-06, + "loss": 0.4736, + "step": 7514 + }, + { + "epoch": 0.7313868613138687, + "grad_norm": 1.5697728838321088, + "learning_rate": 1.7760113442846455e-06, + "loss": 0.2491, + "step": 7515 + }, + { + "epoch": 0.7314841849148418, + "grad_norm": 1.552628423566785, + "learning_rate": 1.7748067651030843e-06, + "loss": 0.3602, + "step": 7516 + }, + { + "epoch": 0.731581508515815, + "grad_norm": 1.7116474786194886, + "learning_rate": 1.7736025064109573e-06, + "loss": 0.4681, + "step": 7517 + }, + { + "epoch": 0.7316788321167883, + "grad_norm": 1.6020638030225507, + "learning_rate": 1.7723985683279327e-06, + "loss": 0.4359, + "step": 7518 + }, + { + "epoch": 0.7317761557177616, + "grad_norm": 1.6210695563659623, + "learning_rate": 1.771194950973646e-06, + "loss": 0.437, + "step": 7519 + }, + { + "epoch": 0.7318734793187348, + "grad_norm": 1.4296348020086311, + "learning_rate": 1.7699916544677015e-06, + "loss": 0.368, + "step": 7520 + }, + { + "epoch": 0.7319708029197081, + "grad_norm": 1.9888121490070614, + "learning_rate": 1.7687886789296721e-06, + "loss": 0.4976, + "step": 7521 + }, + { + "epoch": 0.7320681265206813, + "grad_norm": 1.188311450180724, + "learning_rate": 1.767586024479097e-06, + "loss": 0.1439, + "step": 7522 + }, + { + "epoch": 0.7321654501216545, + "grad_norm": 1.776266320601241, + "learning_rate": 1.7663836912354876e-06, + "loss": 0.4783, + "step": 7523 + }, + { + "epoch": 0.7322627737226277, + "grad_norm": 1.581063207517385, + "learning_rate": 1.7651816793183162e-06, + "loss": 0.4579, + "step": 7524 + }, + { + "epoch": 0.732360097323601, + "grad_norm": 1.3810397061406794, + "learning_rate": 1.7639799888470304e-06, + "loss": 0.2207, + "step": 7525 + }, + { + "epoch": 0.7324574209245742, + "grad_norm": 1.492835178917681, + "learning_rate": 1.762778619941043e-06, + "loss": 0.4312, + "step": 7526 + }, + { + "epoch": 0.7325547445255475, + "grad_norm": 1.5638536311877314, + "learning_rate": 1.7615775727197325e-06, + "loss": 0.4385, + "step": 7527 + }, + { + "epoch": 0.7326520681265207, + "grad_norm": 1.4543406006302655, + "learning_rate": 1.7603768473024497e-06, + "loss": 0.2498, + "step": 7528 + }, + { + "epoch": 0.7327493917274939, + "grad_norm": 1.6426424247503166, + "learning_rate": 1.7591764438085101e-06, + "loss": 0.2959, + "step": 7529 + }, + { + "epoch": 0.7328467153284671, + "grad_norm": 2.026922340181345, + "learning_rate": 1.7579763623571995e-06, + "loss": 0.3848, + "step": 7530 + }, + { + "epoch": 0.7329440389294404, + "grad_norm": 1.3301555594199723, + "learning_rate": 1.7567766030677703e-06, + "loss": 0.2128, + "step": 7531 + }, + { + "epoch": 0.7330413625304136, + "grad_norm": 1.506838310867784, + "learning_rate": 1.7555771660594434e-06, + "loss": 0.3936, + "step": 7532 + }, + { + "epoch": 0.7331386861313869, + "grad_norm": 2.1116800782405782, + "learning_rate": 1.7543780514514097e-06, + "loss": 0.4778, + "step": 7533 + }, + { + "epoch": 0.7332360097323601, + "grad_norm": 1.7353925726944077, + "learning_rate": 1.7531792593628215e-06, + "loss": 0.2909, + "step": 7534 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.5594006889841756, + "learning_rate": 1.7519807899128072e-06, + "loss": 0.3832, + "step": 7535 + }, + { + "epoch": 0.7334306569343065, + "grad_norm": 1.9977319493457493, + "learning_rate": 1.7507826432204571e-06, + "loss": 0.4134, + "step": 7536 + }, + { + "epoch": 0.7335279805352798, + "grad_norm": 1.8618553991909086, + "learning_rate": 1.7495848194048354e-06, + "loss": 0.3868, + "step": 7537 + }, + { + "epoch": 0.733625304136253, + "grad_norm": 1.4464306604279464, + "learning_rate": 1.7483873185849658e-06, + "loss": 0.2678, + "step": 7538 + }, + { + "epoch": 0.7337226277372263, + "grad_norm": 1.7591646546400965, + "learning_rate": 1.747190140879847e-06, + "loss": 0.5141, + "step": 7539 + }, + { + "epoch": 0.7338199513381995, + "grad_norm": 1.221324841676646, + "learning_rate": 1.7459932864084434e-06, + "loss": 0.2351, + "step": 7540 + }, + { + "epoch": 0.7339172749391728, + "grad_norm": 1.6560361472206726, + "learning_rate": 1.7447967552896872e-06, + "loss": 0.2692, + "step": 7541 + }, + { + "epoch": 0.734014598540146, + "grad_norm": 1.7879300593835639, + "learning_rate": 1.7436005476424778e-06, + "loss": 0.555, + "step": 7542 + }, + { + "epoch": 0.7341119221411192, + "grad_norm": 1.2735748216974894, + "learning_rate": 1.742404663585684e-06, + "loss": 0.1332, + "step": 7543 + }, + { + "epoch": 0.7342092457420925, + "grad_norm": 1.6865081698063993, + "learning_rate": 1.7412091032381423e-06, + "loss": 0.2862, + "step": 7544 + }, + { + "epoch": 0.7343065693430657, + "grad_norm": 1.5594714759842738, + "learning_rate": 1.7400138667186534e-06, + "loss": 0.2612, + "step": 7545 + }, + { + "epoch": 0.734403892944039, + "grad_norm": 1.6578790103035794, + "learning_rate": 1.7388189541459905e-06, + "loss": 0.3305, + "step": 7546 + }, + { + "epoch": 0.7345012165450122, + "grad_norm": 1.469227976840527, + "learning_rate": 1.7376243656388925e-06, + "loss": 0.2079, + "step": 7547 + }, + { + "epoch": 0.7345985401459854, + "grad_norm": 1.678875678180384, + "learning_rate": 1.7364301013160683e-06, + "loss": 0.3725, + "step": 7548 + }, + { + "epoch": 0.7346958637469586, + "grad_norm": 1.5137766943293849, + "learning_rate": 1.7352361612961893e-06, + "loss": 0.3867, + "step": 7549 + }, + { + "epoch": 0.7347931873479319, + "grad_norm": 1.4534359670994708, + "learning_rate": 1.7340425456978994e-06, + "loss": 0.3262, + "step": 7550 + }, + { + "epoch": 0.7348905109489051, + "grad_norm": 1.3096092724676331, + "learning_rate": 1.732849254639809e-06, + "loss": 0.2931, + "step": 7551 + }, + { + "epoch": 0.7349878345498784, + "grad_norm": 1.32350558091195, + "learning_rate": 1.7316562882404963e-06, + "loss": 0.2274, + "step": 7552 + }, + { + "epoch": 0.7350851581508516, + "grad_norm": 1.6301692027635266, + "learning_rate": 1.7304636466185077e-06, + "loss": 0.3276, + "step": 7553 + }, + { + "epoch": 0.7351824817518248, + "grad_norm": 1.968830470302516, + "learning_rate": 1.7292713298923558e-06, + "loss": 0.4546, + "step": 7554 + }, + { + "epoch": 0.735279805352798, + "grad_norm": 1.513113162385566, + "learning_rate": 1.728079338180524e-06, + "loss": 0.3915, + "step": 7555 + }, + { + "epoch": 0.7353771289537713, + "grad_norm": 1.6968343206485885, + "learning_rate": 1.7268876716014576e-06, + "loss": 0.3383, + "step": 7556 + }, + { + "epoch": 0.7354744525547445, + "grad_norm": 1.3146851832154292, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.2486, + "step": 7557 + }, + { + "epoch": 0.7355717761557178, + "grad_norm": 1.36390029274877, + "learning_rate": 1.7245053143152608e-06, + "loss": 0.2934, + "step": 7558 + }, + { + "epoch": 0.735669099756691, + "grad_norm": 1.3582948800165233, + "learning_rate": 1.7233146238448684e-06, + "loss": 0.4049, + "step": 7559 + }, + { + "epoch": 0.7357664233576642, + "grad_norm": 1.5289017713790114, + "learning_rate": 1.7221242589807141e-06, + "loss": 0.394, + "step": 7560 + }, + { + "epoch": 0.7358637469586374, + "grad_norm": 1.5658912477987932, + "learning_rate": 1.7209342198410872e-06, + "loss": 0.3807, + "step": 7561 + }, + { + "epoch": 0.7359610705596107, + "grad_norm": 1.264182080623541, + "learning_rate": 1.7197445065442419e-06, + "loss": 0.3302, + "step": 7562 + }, + { + "epoch": 0.7360583941605839, + "grad_norm": 1.7145882592143993, + "learning_rate": 1.7185551192084015e-06, + "loss": 0.2475, + "step": 7563 + }, + { + "epoch": 0.7361557177615572, + "grad_norm": 1.503189034793199, + "learning_rate": 1.717366057951756e-06, + "loss": 0.3676, + "step": 7564 + }, + { + "epoch": 0.7362530413625304, + "grad_norm": 1.4604045691467542, + "learning_rate": 1.7161773228924628e-06, + "loss": 0.4088, + "step": 7565 + }, + { + "epoch": 0.7363503649635037, + "grad_norm": 1.6683337774793787, + "learning_rate": 1.7149889141486487e-06, + "loss": 0.4217, + "step": 7566 + }, + { + "epoch": 0.7364476885644768, + "grad_norm": 1.466565394517573, + "learning_rate": 1.7138008318384042e-06, + "loss": 0.3266, + "step": 7567 + }, + { + "epoch": 0.7365450121654501, + "grad_norm": 1.7135262609405737, + "learning_rate": 1.7126130760797904e-06, + "loss": 0.233, + "step": 7568 + }, + { + "epoch": 0.7366423357664234, + "grad_norm": 1.6408056250551577, + "learning_rate": 1.711425646990838e-06, + "loss": 0.4538, + "step": 7569 + }, + { + "epoch": 0.7367396593673966, + "grad_norm": 1.6690719493947737, + "learning_rate": 1.7102385446895386e-06, + "loss": 0.5484, + "step": 7570 + }, + { + "epoch": 0.7368369829683699, + "grad_norm": 1.102867165494536, + "learning_rate": 1.7090517692938568e-06, + "loss": 0.2761, + "step": 7571 + }, + { + "epoch": 0.7369343065693431, + "grad_norm": 1.48579322924846, + "learning_rate": 1.7078653209217239e-06, + "loss": 0.3053, + "step": 7572 + }, + { + "epoch": 0.7370316301703163, + "grad_norm": 1.2819682643372348, + "learning_rate": 1.706679199691037e-06, + "loss": 0.2289, + "step": 7573 + }, + { + "epoch": 0.7371289537712895, + "grad_norm": 1.5424795413399397, + "learning_rate": 1.7054934057196626e-06, + "loss": 0.4125, + "step": 7574 + }, + { + "epoch": 0.7372262773722628, + "grad_norm": 1.299469891083765, + "learning_rate": 1.7043079391254335e-06, + "loss": 0.3498, + "step": 7575 + }, + { + "epoch": 0.737323600973236, + "grad_norm": 1.7337614469143436, + "learning_rate": 1.7031228000261502e-06, + "loss": 0.7505, + "step": 7576 + }, + { + "epoch": 0.7374209245742093, + "grad_norm": 1.7219704021741329, + "learning_rate": 1.7019379885395821e-06, + "loss": 0.3597, + "step": 7577 + }, + { + "epoch": 0.7375182481751825, + "grad_norm": 1.4112249746882883, + "learning_rate": 1.7007535047834616e-06, + "loss": 0.4297, + "step": 7578 + }, + { + "epoch": 0.7376155717761557, + "grad_norm": 1.5834918779334106, + "learning_rate": 1.6995693488754939e-06, + "loss": 0.3318, + "step": 7579 + }, + { + "epoch": 0.7377128953771289, + "grad_norm": 1.7224453820814, + "learning_rate": 1.6983855209333493e-06, + "loss": 0.3577, + "step": 7580 + }, + { + "epoch": 0.7378102189781022, + "grad_norm": 1.595701107925018, + "learning_rate": 1.6972020210746637e-06, + "loss": 0.5642, + "step": 7581 + }, + { + "epoch": 0.7379075425790754, + "grad_norm": 1.3739921170415283, + "learning_rate": 1.6960188494170433e-06, + "loss": 0.1812, + "step": 7582 + }, + { + "epoch": 0.7380048661800487, + "grad_norm": 1.576117570170836, + "learning_rate": 1.6948360060780605e-06, + "loss": 0.41, + "step": 7583 + }, + { + "epoch": 0.7381021897810219, + "grad_norm": 1.863212728140119, + "learning_rate": 1.693653491175255e-06, + "loss": 0.2813, + "step": 7584 + }, + { + "epoch": 0.7381995133819952, + "grad_norm": 1.8615703285082135, + "learning_rate": 1.692471304826135e-06, + "loss": 0.6034, + "step": 7585 + }, + { + "epoch": 0.7382968369829683, + "grad_norm": 1.466433544849947, + "learning_rate": 1.691289447148174e-06, + "loss": 0.3115, + "step": 7586 + }, + { + "epoch": 0.7383941605839416, + "grad_norm": 1.469049382641137, + "learning_rate": 1.6901079182588158e-06, + "loss": 0.4683, + "step": 7587 + }, + { + "epoch": 0.7384914841849148, + "grad_norm": 1.558934607078218, + "learning_rate": 1.6889267182754665e-06, + "loss": 0.2053, + "step": 7588 + }, + { + "epoch": 0.7385888077858881, + "grad_norm": 1.8146397029677583, + "learning_rate": 1.6877458473155045e-06, + "loss": 0.3707, + "step": 7589 + }, + { + "epoch": 0.7386861313868613, + "grad_norm": 1.3453189207271603, + "learning_rate": 1.6865653054962738e-06, + "loss": 0.2804, + "step": 7590 + }, + { + "epoch": 0.7387834549878346, + "grad_norm": 1.4134183649509062, + "learning_rate": 1.6853850929350868e-06, + "loss": 0.3104, + "step": 7591 + }, + { + "epoch": 0.7388807785888077, + "grad_norm": 1.3604830906275251, + "learning_rate": 1.684205209749219e-06, + "loss": 0.2554, + "step": 7592 + }, + { + "epoch": 0.738978102189781, + "grad_norm": 2.2343344518010015, + "learning_rate": 1.6830256560559172e-06, + "loss": 0.369, + "step": 7593 + }, + { + "epoch": 0.7390754257907542, + "grad_norm": 1.6383227952920287, + "learning_rate": 1.6818464319723948e-06, + "loss": 0.5596, + "step": 7594 + }, + { + "epoch": 0.7391727493917275, + "grad_norm": 1.5921146754268276, + "learning_rate": 1.680667537615832e-06, + "loss": 0.2823, + "step": 7595 + }, + { + "epoch": 0.7392700729927008, + "grad_norm": 1.3295444756765844, + "learning_rate": 1.6794889731033758e-06, + "loss": 0.3484, + "step": 7596 + }, + { + "epoch": 0.739367396593674, + "grad_norm": 1.808088590464363, + "learning_rate": 1.678310738552142e-06, + "loss": 0.2991, + "step": 7597 + }, + { + "epoch": 0.7394647201946472, + "grad_norm": 1.5921399829126863, + "learning_rate": 1.6771328340792131e-06, + "loss": 0.2686, + "step": 7598 + }, + { + "epoch": 0.7395620437956204, + "grad_norm": 1.8696168553942065, + "learning_rate": 1.6759552598016355e-06, + "loss": 0.4151, + "step": 7599 + }, + { + "epoch": 0.7396593673965937, + "grad_norm": 1.725162819075155, + "learning_rate": 1.6747780158364262e-06, + "loss": 0.6142, + "step": 7600 + }, + { + "epoch": 0.7397566909975669, + "grad_norm": 1.5079775705060783, + "learning_rate": 1.67360110230057e-06, + "loss": 0.1699, + "step": 7601 + }, + { + "epoch": 0.7398540145985402, + "grad_norm": 1.2340714286583592, + "learning_rate": 1.6724245193110178e-06, + "loss": 0.3668, + "step": 7602 + }, + { + "epoch": 0.7399513381995134, + "grad_norm": 1.8868236946947243, + "learning_rate": 1.6712482669846853e-06, + "loss": 0.4169, + "step": 7603 + }, + { + "epoch": 0.7400486618004867, + "grad_norm": 2.1700238359258472, + "learning_rate": 1.6700723454384582e-06, + "loss": 0.2984, + "step": 7604 + }, + { + "epoch": 0.7401459854014598, + "grad_norm": 1.6369351992729133, + "learning_rate": 1.6688967547891877e-06, + "loss": 0.3721, + "step": 7605 + }, + { + "epoch": 0.7402433090024331, + "grad_norm": 1.4748462208903876, + "learning_rate": 1.6677214951536968e-06, + "loss": 0.298, + "step": 7606 + }, + { + "epoch": 0.7403406326034063, + "grad_norm": 1.7310391900239237, + "learning_rate": 1.666546566648768e-06, + "loss": 0.3256, + "step": 7607 + }, + { + "epoch": 0.7404379562043796, + "grad_norm": 2.009052413271909, + "learning_rate": 1.6653719693911558e-06, + "loss": 0.4736, + "step": 7608 + }, + { + "epoch": 0.7405352798053528, + "grad_norm": 1.3947856254342346, + "learning_rate": 1.664197703497582e-06, + "loss": 0.3013, + "step": 7609 + }, + { + "epoch": 0.7406326034063261, + "grad_norm": 1.8475046882002093, + "learning_rate": 1.6630237690847312e-06, + "loss": 0.3941, + "step": 7610 + }, + { + "epoch": 0.7407299270072992, + "grad_norm": 1.3985738847600482, + "learning_rate": 1.6618501662692593e-06, + "loss": 0.3914, + "step": 7611 + }, + { + "epoch": 0.7408272506082725, + "grad_norm": 1.7609596393212308, + "learning_rate": 1.660676895167788e-06, + "loss": 0.3002, + "step": 7612 + }, + { + "epoch": 0.7409245742092457, + "grad_norm": 1.3910338411581922, + "learning_rate": 1.6595039558969084e-06, + "loss": 0.4256, + "step": 7613 + }, + { + "epoch": 0.741021897810219, + "grad_norm": 1.5309173845541366, + "learning_rate": 1.6583313485731722e-06, + "loss": 0.2596, + "step": 7614 + }, + { + "epoch": 0.7411192214111922, + "grad_norm": 1.7629735161776812, + "learning_rate": 1.6571590733131022e-06, + "loss": 0.2874, + "step": 7615 + }, + { + "epoch": 0.7412165450121655, + "grad_norm": 1.8241153663240228, + "learning_rate": 1.6559871302331926e-06, + "loss": 0.413, + "step": 7616 + }, + { + "epoch": 0.7413138686131386, + "grad_norm": 1.1349831666412327, + "learning_rate": 1.6548155194498961e-06, + "loss": 0.1366, + "step": 7617 + }, + { + "epoch": 0.7414111922141119, + "grad_norm": 1.3925238942505276, + "learning_rate": 1.6536442410796372e-06, + "loss": 0.3073, + "step": 7618 + }, + { + "epoch": 0.7415085158150851, + "grad_norm": 2.078454013105589, + "learning_rate": 1.6524732952388068e-06, + "loss": 0.3262, + "step": 7619 + }, + { + "epoch": 0.7416058394160584, + "grad_norm": 1.4514724194320567, + "learning_rate": 1.6513026820437645e-06, + "loss": 0.2068, + "step": 7620 + }, + { + "epoch": 0.7417031630170317, + "grad_norm": 1.4842961340582688, + "learning_rate": 1.6501324016108317e-06, + "loss": 0.4537, + "step": 7621 + }, + { + "epoch": 0.7418004866180049, + "grad_norm": 1.3325790665949664, + "learning_rate": 1.6489624540563003e-06, + "loss": 0.2784, + "step": 7622 + }, + { + "epoch": 0.741897810218978, + "grad_norm": 1.8019270489201629, + "learning_rate": 1.6477928394964298e-06, + "loss": 0.3074, + "step": 7623 + }, + { + "epoch": 0.7419951338199513, + "grad_norm": 1.7343591225387052, + "learning_rate": 1.6466235580474478e-06, + "loss": 0.5764, + "step": 7624 + }, + { + "epoch": 0.7420924574209246, + "grad_norm": 1.8564717041579797, + "learning_rate": 1.6454546098255396e-06, + "loss": 0.3685, + "step": 7625 + }, + { + "epoch": 0.7421897810218978, + "grad_norm": 1.6642387905674532, + "learning_rate": 1.644285994946871e-06, + "loss": 0.2135, + "step": 7626 + }, + { + "epoch": 0.7422871046228711, + "grad_norm": 1.6127150606561178, + "learning_rate": 1.6431177135275666e-06, + "loss": 0.439, + "step": 7627 + }, + { + "epoch": 0.7423844282238443, + "grad_norm": 1.4902498936876278, + "learning_rate": 1.6419497656837175e-06, + "loss": 0.3799, + "step": 7628 + }, + { + "epoch": 0.7424817518248176, + "grad_norm": 1.3250087305896954, + "learning_rate": 1.6407821515313838e-06, + "loss": 0.3191, + "step": 7629 + }, + { + "epoch": 0.7425790754257907, + "grad_norm": 1.5580425004659073, + "learning_rate": 1.6396148711865922e-06, + "loss": 0.3433, + "step": 7630 + }, + { + "epoch": 0.742676399026764, + "grad_norm": 2.056287483407051, + "learning_rate": 1.6384479247653384e-06, + "loss": 0.3944, + "step": 7631 + }, + { + "epoch": 0.7427737226277372, + "grad_norm": 1.7453585790222177, + "learning_rate": 1.6372813123835784e-06, + "loss": 0.4732, + "step": 7632 + }, + { + "epoch": 0.7428710462287105, + "grad_norm": 1.3505239741412312, + "learning_rate": 1.636115034157242e-06, + "loss": 0.2436, + "step": 7633 + }, + { + "epoch": 0.7429683698296837, + "grad_norm": 1.490653962627681, + "learning_rate": 1.634949090202223e-06, + "loss": 0.388, + "step": 7634 + }, + { + "epoch": 0.743065693430657, + "grad_norm": 1.4428606010352405, + "learning_rate": 1.6337834806343783e-06, + "loss": 0.2549, + "step": 7635 + }, + { + "epoch": 0.7431630170316301, + "grad_norm": 1.7525586088776801, + "learning_rate": 1.63261820556954e-06, + "loss": 0.3929, + "step": 7636 + }, + { + "epoch": 0.7432603406326034, + "grad_norm": 1.630150335903184, + "learning_rate": 1.6314532651235005e-06, + "loss": 0.3554, + "step": 7637 + }, + { + "epoch": 0.7433576642335766, + "grad_norm": 1.6461580978149422, + "learning_rate": 1.6302886594120222e-06, + "loss": 0.3676, + "step": 7638 + }, + { + "epoch": 0.7434549878345499, + "grad_norm": 1.6931891438277422, + "learning_rate": 1.629124388550829e-06, + "loss": 0.273, + "step": 7639 + }, + { + "epoch": 0.7435523114355231, + "grad_norm": 1.710106704572782, + "learning_rate": 1.627960452655617e-06, + "loss": 0.2254, + "step": 7640 + }, + { + "epoch": 0.7436496350364964, + "grad_norm": 1.506158821227075, + "learning_rate": 1.6267968518420479e-06, + "loss": 0.3648, + "step": 7641 + }, + { + "epoch": 0.7437469586374695, + "grad_norm": 1.847789154756844, + "learning_rate": 1.6256335862257506e-06, + "loss": 0.4975, + "step": 7642 + }, + { + "epoch": 0.7438442822384428, + "grad_norm": 1.5472539572407051, + "learning_rate": 1.6244706559223162e-06, + "loss": 0.374, + "step": 7643 + }, + { + "epoch": 0.743941605839416, + "grad_norm": 1.581398693828398, + "learning_rate": 1.6233080610473073e-06, + "loss": 0.2862, + "step": 7644 + }, + { + "epoch": 0.7440389294403893, + "grad_norm": 1.3979695991845247, + "learning_rate": 1.6221458017162533e-06, + "loss": 0.2492, + "step": 7645 + }, + { + "epoch": 0.7441362530413625, + "grad_norm": 1.6332450914225967, + "learning_rate": 1.6209838780446441e-06, + "loss": 0.5106, + "step": 7646 + }, + { + "epoch": 0.7442335766423358, + "grad_norm": 1.6433868413161832, + "learning_rate": 1.6198222901479454e-06, + "loss": 0.5335, + "step": 7647 + }, + { + "epoch": 0.7443309002433091, + "grad_norm": 1.8922223365909594, + "learning_rate": 1.618661038141584e-06, + "loss": 0.4617, + "step": 7648 + }, + { + "epoch": 0.7444282238442822, + "grad_norm": 1.3478227153190183, + "learning_rate": 1.6175001221409547e-06, + "loss": 0.4061, + "step": 7649 + }, + { + "epoch": 0.7445255474452555, + "grad_norm": 1.8250139288174783, + "learning_rate": 1.616339542261416e-06, + "loss": 0.4277, + "step": 7650 + }, + { + "epoch": 0.7446228710462287, + "grad_norm": 1.5887274795004565, + "learning_rate": 1.6151792986182962e-06, + "loss": 0.3729, + "step": 7651 + }, + { + "epoch": 0.744720194647202, + "grad_norm": 1.8118088818658438, + "learning_rate": 1.614019391326892e-06, + "loss": 0.374, + "step": 7652 + }, + { + "epoch": 0.7448175182481752, + "grad_norm": 1.5908479635459467, + "learning_rate": 1.6128598205024597e-06, + "loss": 0.3668, + "step": 7653 + }, + { + "epoch": 0.7449148418491485, + "grad_norm": 1.3665568613895167, + "learning_rate": 1.6117005862602297e-06, + "loss": 0.3415, + "step": 7654 + }, + { + "epoch": 0.7450121654501216, + "grad_norm": 1.8976232721345871, + "learning_rate": 1.6105416887153952e-06, + "loss": 0.38, + "step": 7655 + }, + { + "epoch": 0.7451094890510949, + "grad_norm": 1.6357965534342267, + "learning_rate": 1.6093831279831163e-06, + "loss": 0.4427, + "step": 7656 + }, + { + "epoch": 0.7452068126520681, + "grad_norm": 1.6592071333350198, + "learning_rate": 1.6082249041785196e-06, + "loss": 0.3618, + "step": 7657 + }, + { + "epoch": 0.7453041362530414, + "grad_norm": 1.6807069035976443, + "learning_rate": 1.607067017416699e-06, + "loss": 0.338, + "step": 7658 + }, + { + "epoch": 0.7454014598540146, + "grad_norm": 1.6031226769740952, + "learning_rate": 1.6059094678127147e-06, + "loss": 0.3043, + "step": 7659 + }, + { + "epoch": 0.7454987834549879, + "grad_norm": 1.9975704695358607, + "learning_rate": 1.6047522554815941e-06, + "loss": 0.4539, + "step": 7660 + }, + { + "epoch": 0.745596107055961, + "grad_norm": 1.6490037858409463, + "learning_rate": 1.6035953805383269e-06, + "loss": 0.3996, + "step": 7661 + }, + { + "epoch": 0.7456934306569343, + "grad_norm": 1.3804861209037687, + "learning_rate": 1.602438843097875e-06, + "loss": 0.3039, + "step": 7662 + }, + { + "epoch": 0.7457907542579075, + "grad_norm": 1.5086460477830628, + "learning_rate": 1.6012826432751649e-06, + "loss": 0.275, + "step": 7663 + }, + { + "epoch": 0.7458880778588808, + "grad_norm": 2.1957477396063347, + "learning_rate": 1.6001267811850856e-06, + "loss": 0.3519, + "step": 7664 + }, + { + "epoch": 0.745985401459854, + "grad_norm": 1.6406754985485326, + "learning_rate": 1.5989712569424982e-06, + "loss": 0.4887, + "step": 7665 + }, + { + "epoch": 0.7460827250608273, + "grad_norm": 1.5592833406040738, + "learning_rate": 1.5978160706622275e-06, + "loss": 0.1848, + "step": 7666 + }, + { + "epoch": 0.7461800486618004, + "grad_norm": 1.6364467640675626, + "learning_rate": 1.5966612224590644e-06, + "loss": 0.3854, + "step": 7667 + }, + { + "epoch": 0.7462773722627737, + "grad_norm": 1.7010586303438184, + "learning_rate": 1.595506712447768e-06, + "loss": 0.2725, + "step": 7668 + }, + { + "epoch": 0.7463746958637469, + "grad_norm": 1.707267762304431, + "learning_rate": 1.5943525407430621e-06, + "loss": 0.3875, + "step": 7669 + }, + { + "epoch": 0.7464720194647202, + "grad_norm": 1.823143456293372, + "learning_rate": 1.593198707459639e-06, + "loss": 0.4733, + "step": 7670 + }, + { + "epoch": 0.7465693430656934, + "grad_norm": 1.6660951190689932, + "learning_rate": 1.5920452127121528e-06, + "loss": 0.4543, + "step": 7671 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 1.3947523780552225, + "learning_rate": 1.5908920566152287e-06, + "loss": 0.232, + "step": 7672 + }, + { + "epoch": 0.74676399026764, + "grad_norm": 1.44548660466873, + "learning_rate": 1.589739239283456e-06, + "loss": 0.3857, + "step": 7673 + }, + { + "epoch": 0.7468613138686131, + "grad_norm": 3.2312030073102314, + "learning_rate": 1.5885867608313927e-06, + "loss": 0.309, + "step": 7674 + }, + { + "epoch": 0.7469586374695864, + "grad_norm": 1.3358025287464308, + "learning_rate": 1.5874346213735586e-06, + "loss": 0.2462, + "step": 7675 + }, + { + "epoch": 0.7470559610705596, + "grad_norm": 1.5757281542690051, + "learning_rate": 1.5862828210244435e-06, + "loss": 0.4119, + "step": 7676 + }, + { + "epoch": 0.7471532846715329, + "grad_norm": 1.7551062202369905, + "learning_rate": 1.585131359898503e-06, + "loss": 0.6434, + "step": 7677 + }, + { + "epoch": 0.7472506082725061, + "grad_norm": 1.651025557414346, + "learning_rate": 1.5839802381101576e-06, + "loss": 0.2304, + "step": 7678 + }, + { + "epoch": 0.7473479318734794, + "grad_norm": 1.698034395532271, + "learning_rate": 1.582829455773796e-06, + "loss": 0.4483, + "step": 7679 + }, + { + "epoch": 0.7474452554744525, + "grad_norm": 1.7155819218686994, + "learning_rate": 1.5816790130037719e-06, + "loss": 0.5339, + "step": 7680 + }, + { + "epoch": 0.7475425790754258, + "grad_norm": 1.621352650753709, + "learning_rate": 1.5805289099144067e-06, + "loss": 0.4906, + "step": 7681 + }, + { + "epoch": 0.747639902676399, + "grad_norm": 1.7097912436769425, + "learning_rate": 1.5793791466199837e-06, + "loss": 0.2924, + "step": 7682 + }, + { + "epoch": 0.7477372262773723, + "grad_norm": 1.6257039526070596, + "learning_rate": 1.5782297232347576e-06, + "loss": 0.604, + "step": 7683 + }, + { + "epoch": 0.7478345498783455, + "grad_norm": 1.9438103080789404, + "learning_rate": 1.5770806398729472e-06, + "loss": 0.5511, + "step": 7684 + }, + { + "epoch": 0.7479318734793188, + "grad_norm": 1.4857736523263783, + "learning_rate": 1.575931896648739e-06, + "loss": 0.3206, + "step": 7685 + }, + { + "epoch": 0.7480291970802919, + "grad_norm": 1.2836953529828308, + "learning_rate": 1.5747834936762812e-06, + "loss": 0.206, + "step": 7686 + }, + { + "epoch": 0.7481265206812652, + "grad_norm": 1.573265752728345, + "learning_rate": 1.5736354310696928e-06, + "loss": 0.3399, + "step": 7687 + }, + { + "epoch": 0.7482238442822384, + "grad_norm": 1.7078775712832672, + "learning_rate": 1.572487708943058e-06, + "loss": 0.4871, + "step": 7688 + }, + { + "epoch": 0.7483211678832117, + "grad_norm": 1.5140392226879476, + "learning_rate": 1.5713403274104262e-06, + "loss": 0.3618, + "step": 7689 + }, + { + "epoch": 0.7484184914841849, + "grad_norm": 1.4040952492653052, + "learning_rate": 1.5701932865858133e-06, + "loss": 0.3111, + "step": 7690 + }, + { + "epoch": 0.7485158150851582, + "grad_norm": 1.4355312632016743, + "learning_rate": 1.5690465865832016e-06, + "loss": 0.3534, + "step": 7691 + }, + { + "epoch": 0.7486131386861314, + "grad_norm": 1.7049608048215605, + "learning_rate": 1.5679002275165412e-06, + "loss": 0.2442, + "step": 7692 + }, + { + "epoch": 0.7487104622871046, + "grad_norm": 1.5620858215708369, + "learning_rate": 1.5667542094997429e-06, + "loss": 0.2689, + "step": 7693 + }, + { + "epoch": 0.7488077858880778, + "grad_norm": 1.5511715491512343, + "learning_rate": 1.565608532646689e-06, + "loss": 0.3409, + "step": 7694 + }, + { + "epoch": 0.7489051094890511, + "grad_norm": 1.4946891226690897, + "learning_rate": 1.5644631970712266e-06, + "loss": 0.371, + "step": 7695 + }, + { + "epoch": 0.7490024330900243, + "grad_norm": 1.7659614715002256, + "learning_rate": 1.563318202887169e-06, + "loss": 0.5009, + "step": 7696 + }, + { + "epoch": 0.7490997566909976, + "grad_norm": 1.2751739458095641, + "learning_rate": 1.562173550208293e-06, + "loss": 0.1691, + "step": 7697 + }, + { + "epoch": 0.7491970802919709, + "grad_norm": 1.5343647632140858, + "learning_rate": 1.5610292391483439e-06, + "loss": 0.3317, + "step": 7698 + }, + { + "epoch": 0.749294403892944, + "grad_norm": 1.5568551139526292, + "learning_rate": 1.5598852698210332e-06, + "loss": 0.2162, + "step": 7699 + }, + { + "epoch": 0.7493917274939172, + "grad_norm": 1.6717631935453563, + "learning_rate": 1.5587416423400376e-06, + "loss": 0.2748, + "step": 7700 + }, + { + "epoch": 0.7494890510948905, + "grad_norm": 1.426816548751413, + "learning_rate": 1.557598356819e-06, + "loss": 0.3605, + "step": 7701 + }, + { + "epoch": 0.7495863746958638, + "grad_norm": 1.590093044821458, + "learning_rate": 1.5564554133715304e-06, + "loss": 0.3073, + "step": 7702 + }, + { + "epoch": 0.749683698296837, + "grad_norm": 1.8907602435203605, + "learning_rate": 1.5553128121112038e-06, + "loss": 0.4927, + "step": 7703 + }, + { + "epoch": 0.7497810218978103, + "grad_norm": 1.4438847268395891, + "learning_rate": 1.5541705531515589e-06, + "loss": 0.3822, + "step": 7704 + }, + { + "epoch": 0.7498783454987834, + "grad_norm": 1.3926814443559916, + "learning_rate": 1.5530286366061036e-06, + "loss": 0.3144, + "step": 7705 + }, + { + "epoch": 0.7499756690997567, + "grad_norm": 1.4859703726195965, + "learning_rate": 1.5518870625883137e-06, + "loss": 0.4979, + "step": 7706 + }, + { + "epoch": 0.7500729927007299, + "grad_norm": 1.6052715245342355, + "learning_rate": 1.5507458312116241e-06, + "loss": 0.3149, + "step": 7707 + }, + { + "epoch": 0.7501703163017032, + "grad_norm": 1.7112405586957498, + "learning_rate": 1.549604942589441e-06, + "loss": 0.2468, + "step": 7708 + }, + { + "epoch": 0.7502676399026764, + "grad_norm": 1.6987859766773636, + "learning_rate": 1.5484643968351358e-06, + "loss": 0.3448, + "step": 7709 + }, + { + "epoch": 0.7503649635036497, + "grad_norm": 1.516026158678338, + "learning_rate": 1.5473241940620448e-06, + "loss": 0.4697, + "step": 7710 + }, + { + "epoch": 0.7504622871046228, + "grad_norm": 1.7092155954073969, + "learning_rate": 1.546184334383471e-06, + "loss": 0.3842, + "step": 7711 + }, + { + "epoch": 0.7505596107055961, + "grad_norm": 2.6302789559026354, + "learning_rate": 1.5450448179126825e-06, + "loss": 0.2703, + "step": 7712 + }, + { + "epoch": 0.7506569343065693, + "grad_norm": 1.6160101543995058, + "learning_rate": 1.5439056447629142e-06, + "loss": 0.3518, + "step": 7713 + }, + { + "epoch": 0.7507542579075426, + "grad_norm": 1.3607430863380885, + "learning_rate": 1.5427668150473679e-06, + "loss": 0.2555, + "step": 7714 + }, + { + "epoch": 0.7508515815085158, + "grad_norm": 1.4936966379761554, + "learning_rate": 1.541628328879206e-06, + "loss": 0.3988, + "step": 7715 + }, + { + "epoch": 0.7509489051094891, + "grad_norm": 1.4337946363849223, + "learning_rate": 1.5404901863715638e-06, + "loss": 0.3781, + "step": 7716 + }, + { + "epoch": 0.7510462287104623, + "grad_norm": 1.2640648633182723, + "learning_rate": 1.5393523876375388e-06, + "loss": 0.2298, + "step": 7717 + }, + { + "epoch": 0.7511435523114355, + "grad_norm": 1.6688350241499577, + "learning_rate": 1.538214932790193e-06, + "loss": 0.4842, + "step": 7718 + }, + { + "epoch": 0.7512408759124087, + "grad_norm": 1.4017987480633665, + "learning_rate": 1.5370778219425569e-06, + "loss": 0.3277, + "step": 7719 + }, + { + "epoch": 0.751338199513382, + "grad_norm": 1.4628502874580964, + "learning_rate": 1.5359410552076266e-06, + "loss": 0.3148, + "step": 7720 + }, + { + "epoch": 0.7514355231143552, + "grad_norm": 1.3474887467376244, + "learning_rate": 1.5348046326983623e-06, + "loss": 0.3596, + "step": 7721 + }, + { + "epoch": 0.7515328467153285, + "grad_norm": 1.43504015225051, + "learning_rate": 1.5336685545276919e-06, + "loss": 0.3089, + "step": 7722 + }, + { + "epoch": 0.7516301703163017, + "grad_norm": 1.5041154988357412, + "learning_rate": 1.5325328208085078e-06, + "loss": 0.4589, + "step": 7723 + }, + { + "epoch": 0.7517274939172749, + "grad_norm": 1.3804764493199713, + "learning_rate": 1.5313974316536684e-06, + "loss": 0.4304, + "step": 7724 + }, + { + "epoch": 0.7518248175182481, + "grad_norm": 1.5480094371872704, + "learning_rate": 1.5302623871759997e-06, + "loss": 0.4146, + "step": 7725 + }, + { + "epoch": 0.7519221411192214, + "grad_norm": 1.485336634694465, + "learning_rate": 1.5291276874882887e-06, + "loss": 0.3153, + "step": 7726 + }, + { + "epoch": 0.7520194647201947, + "grad_norm": 1.1788276418639037, + "learning_rate": 1.5279933327032925e-06, + "loss": 0.2036, + "step": 7727 + }, + { + "epoch": 0.7521167883211679, + "grad_norm": 1.398072264303022, + "learning_rate": 1.5268593229337352e-06, + "loss": 0.2677, + "step": 7728 + }, + { + "epoch": 0.7522141119221412, + "grad_norm": 1.457564759950467, + "learning_rate": 1.5257256582922996e-06, + "loss": 0.3132, + "step": 7729 + }, + { + "epoch": 0.7523114355231143, + "grad_norm": 1.8623914853429002, + "learning_rate": 1.5245923388916412e-06, + "loss": 0.3941, + "step": 7730 + }, + { + "epoch": 0.7524087591240876, + "grad_norm": 1.597171700641467, + "learning_rate": 1.523459364844378e-06, + "loss": 0.3924, + "step": 7731 + }, + { + "epoch": 0.7525060827250608, + "grad_norm": 1.4900682989250613, + "learning_rate": 1.5223267362630944e-06, + "loss": 0.4061, + "step": 7732 + }, + { + "epoch": 0.7526034063260341, + "grad_norm": 1.5271589162457029, + "learning_rate": 1.5211944532603407e-06, + "loss": 0.3329, + "step": 7733 + }, + { + "epoch": 0.7527007299270073, + "grad_norm": 1.4261146469148334, + "learning_rate": 1.5200625159486322e-06, + "loss": 0.3223, + "step": 7734 + }, + { + "epoch": 0.7527980535279806, + "grad_norm": 1.460979930241132, + "learning_rate": 1.5189309244404522e-06, + "loss": 0.276, + "step": 7735 + }, + { + "epoch": 0.7528953771289538, + "grad_norm": 1.5864520060495935, + "learning_rate": 1.5177996788482446e-06, + "loss": 0.5439, + "step": 7736 + }, + { + "epoch": 0.752992700729927, + "grad_norm": 1.7388034679054445, + "learning_rate": 1.516668779284423e-06, + "loss": 0.3311, + "step": 7737 + }, + { + "epoch": 0.7530900243309002, + "grad_norm": 1.4848428892261378, + "learning_rate": 1.5155382258613654e-06, + "loss": 0.3977, + "step": 7738 + }, + { + "epoch": 0.7531873479318735, + "grad_norm": 1.7636963429660022, + "learning_rate": 1.5144080186914183e-06, + "loss": 0.5733, + "step": 7739 + }, + { + "epoch": 0.7532846715328467, + "grad_norm": 1.621384854204493, + "learning_rate": 1.5132781578868872e-06, + "loss": 0.3277, + "step": 7740 + }, + { + "epoch": 0.75338199513382, + "grad_norm": 1.954521107471598, + "learning_rate": 1.512148643560048e-06, + "loss": 0.3937, + "step": 7741 + }, + { + "epoch": 0.7534793187347932, + "grad_norm": 1.5443633525166796, + "learning_rate": 1.5110194758231422e-06, + "loss": 0.4343, + "step": 7742 + }, + { + "epoch": 0.7535766423357664, + "grad_norm": 1.334911108307868, + "learning_rate": 1.5098906547883756e-06, + "loss": 0.3663, + "step": 7743 + }, + { + "epoch": 0.7536739659367396, + "grad_norm": 1.167130798024744, + "learning_rate": 1.5087621805679204e-06, + "loss": 0.188, + "step": 7744 + }, + { + "epoch": 0.7537712895377129, + "grad_norm": 1.5718451464872494, + "learning_rate": 1.5076340532739126e-06, + "loss": 0.3562, + "step": 7745 + }, + { + "epoch": 0.7538686131386861, + "grad_norm": 1.5656389369077786, + "learning_rate": 1.5065062730184572e-06, + "loss": 0.2976, + "step": 7746 + }, + { + "epoch": 0.7539659367396594, + "grad_norm": 1.553505683159232, + "learning_rate": 1.5053788399136189e-06, + "loss": 0.3687, + "step": 7747 + }, + { + "epoch": 0.7540632603406326, + "grad_norm": 1.522593650881599, + "learning_rate": 1.5042517540714335e-06, + "loss": 0.2421, + "step": 7748 + }, + { + "epoch": 0.7541605839416058, + "grad_norm": 1.3375795322907895, + "learning_rate": 1.5031250156039e-06, + "loss": 0.1887, + "step": 7749 + }, + { + "epoch": 0.754257907542579, + "grad_norm": 1.7123293158125326, + "learning_rate": 1.5019986246229845e-06, + "loss": 0.516, + "step": 7750 + }, + { + "epoch": 0.7543552311435523, + "grad_norm": 1.541828748413654, + "learning_rate": 1.5008725812406144e-06, + "loss": 0.4097, + "step": 7751 + }, + { + "epoch": 0.7544525547445255, + "grad_norm": 1.5517137159896413, + "learning_rate": 1.4997468855686864e-06, + "loss": 0.3209, + "step": 7752 + }, + { + "epoch": 0.7545498783454988, + "grad_norm": 1.4015948067538406, + "learning_rate": 1.4986215377190615e-06, + "loss": 0.2821, + "step": 7753 + }, + { + "epoch": 0.7546472019464721, + "grad_norm": 1.8706840432915013, + "learning_rate": 1.4974965378035671e-06, + "loss": 0.3001, + "step": 7754 + }, + { + "epoch": 0.7547445255474453, + "grad_norm": 1.627889778000414, + "learning_rate": 1.4963718859339944e-06, + "loss": 0.375, + "step": 7755 + }, + { + "epoch": 0.7548418491484185, + "grad_norm": 1.3706399976446444, + "learning_rate": 1.4952475822221008e-06, + "loss": 0.2959, + "step": 7756 + }, + { + "epoch": 0.7549391727493917, + "grad_norm": 1.5017620704283912, + "learning_rate": 1.4941236267796106e-06, + "loss": 0.4402, + "step": 7757 + }, + { + "epoch": 0.755036496350365, + "grad_norm": 1.3795390203639275, + "learning_rate": 1.4930000197182087e-06, + "loss": 0.1975, + "step": 7758 + }, + { + "epoch": 0.7551338199513382, + "grad_norm": 1.5252436959086833, + "learning_rate": 1.491876761149551e-06, + "loss": 0.47, + "step": 7759 + }, + { + "epoch": 0.7552311435523115, + "grad_norm": 1.5503825484700031, + "learning_rate": 1.4907538511852554e-06, + "loss": 0.4096, + "step": 7760 + }, + { + "epoch": 0.7553284671532847, + "grad_norm": 1.6692006477760577, + "learning_rate": 1.4896312899369086e-06, + "loss": 0.2989, + "step": 7761 + }, + { + "epoch": 0.7554257907542579, + "grad_norm": 1.3378022324814762, + "learning_rate": 1.488509077516056e-06, + "loss": 0.1902, + "step": 7762 + }, + { + "epoch": 0.7555231143552311, + "grad_norm": 1.637094095022718, + "learning_rate": 1.487387214034215e-06, + "loss": 0.4521, + "step": 7763 + }, + { + "epoch": 0.7556204379562044, + "grad_norm": 1.4116575750323157, + "learning_rate": 1.4862656996028658e-06, + "loss": 0.2515, + "step": 7764 + }, + { + "epoch": 0.7557177615571776, + "grad_norm": 1.4036299139003567, + "learning_rate": 1.4851445343334531e-06, + "loss": 0.3296, + "step": 7765 + }, + { + "epoch": 0.7558150851581509, + "grad_norm": 1.3034225087663027, + "learning_rate": 1.484023718337389e-06, + "loss": 0.3932, + "step": 7766 + }, + { + "epoch": 0.7559124087591241, + "grad_norm": 1.3644335667005367, + "learning_rate": 1.482903251726049e-06, + "loss": 0.2173, + "step": 7767 + }, + { + "epoch": 0.7560097323600973, + "grad_norm": 1.5486147566421244, + "learning_rate": 1.481783134610776e-06, + "loss": 0.3193, + "step": 7768 + }, + { + "epoch": 0.7561070559610705, + "grad_norm": 1.339322536507269, + "learning_rate": 1.4806633671028741e-06, + "loss": 0.2386, + "step": 7769 + }, + { + "epoch": 0.7562043795620438, + "grad_norm": 1.6670332346568377, + "learning_rate": 1.4795439493136165e-06, + "loss": 0.3451, + "step": 7770 + }, + { + "epoch": 0.756301703163017, + "grad_norm": 1.480209609730667, + "learning_rate": 1.4784248813542417e-06, + "loss": 0.4038, + "step": 7771 + }, + { + "epoch": 0.7563990267639903, + "grad_norm": 1.5778993501897463, + "learning_rate": 1.4773061633359498e-06, + "loss": 0.471, + "step": 7772 + }, + { + "epoch": 0.7564963503649635, + "grad_norm": 1.64022865957603, + "learning_rate": 1.4761877953699095e-06, + "loss": 0.4942, + "step": 7773 + }, + { + "epoch": 0.7565936739659367, + "grad_norm": 1.648140107578535, + "learning_rate": 1.4750697775672523e-06, + "loss": 0.4962, + "step": 7774 + }, + { + "epoch": 0.7566909975669099, + "grad_norm": 1.4791264953889414, + "learning_rate": 1.4739521100390814e-06, + "loss": 0.4463, + "step": 7775 + }, + { + "epoch": 0.7567883211678832, + "grad_norm": 1.743257613035926, + "learning_rate": 1.4728347928964549e-06, + "loss": 0.4083, + "step": 7776 + }, + { + "epoch": 0.7568856447688564, + "grad_norm": 1.492848035402606, + "learning_rate": 1.4717178262504027e-06, + "loss": 0.3325, + "step": 7777 + }, + { + "epoch": 0.7569829683698297, + "grad_norm": 1.4576709151125766, + "learning_rate": 1.4706012102119189e-06, + "loss": 0.326, + "step": 7778 + }, + { + "epoch": 0.757080291970803, + "grad_norm": 1.756076752612389, + "learning_rate": 1.4694849448919635e-06, + "loss": 0.4215, + "step": 7779 + }, + { + "epoch": 0.7571776155717762, + "grad_norm": 1.5993256369661717, + "learning_rate": 1.4683690304014581e-06, + "loss": 0.2829, + "step": 7780 + }, + { + "epoch": 0.7572749391727494, + "grad_norm": 1.6491858062995666, + "learning_rate": 1.4672534668512928e-06, + "loss": 0.6126, + "step": 7781 + }, + { + "epoch": 0.7573722627737226, + "grad_norm": 1.493631113979562, + "learning_rate": 1.4661382543523228e-06, + "loss": 0.3886, + "step": 7782 + }, + { + "epoch": 0.7574695863746959, + "grad_norm": 1.6669771699949654, + "learning_rate": 1.4650233930153656e-06, + "loss": 0.5035, + "step": 7783 + }, + { + "epoch": 0.7575669099756691, + "grad_norm": 1.4375477658504299, + "learning_rate": 1.463908882951205e-06, + "loss": 0.389, + "step": 7784 + }, + { + "epoch": 0.7576642335766424, + "grad_norm": 1.4977856026488154, + "learning_rate": 1.4627947242705937e-06, + "loss": 0.3199, + "step": 7785 + }, + { + "epoch": 0.7577615571776156, + "grad_norm": 1.377532880221045, + "learning_rate": 1.4616809170842461e-06, + "loss": 0.3545, + "step": 7786 + }, + { + "epoch": 0.7578588807785888, + "grad_norm": 1.7520466144840299, + "learning_rate": 1.4605674615028393e-06, + "loss": 0.5094, + "step": 7787 + }, + { + "epoch": 0.757956204379562, + "grad_norm": 1.4413705216608568, + "learning_rate": 1.459454357637019e-06, + "loss": 0.451, + "step": 7788 + }, + { + "epoch": 0.7580535279805353, + "grad_norm": 1.6786532744423126, + "learning_rate": 1.4583416055973976e-06, + "loss": 0.3564, + "step": 7789 + }, + { + "epoch": 0.7581508515815085, + "grad_norm": 1.7640480493675037, + "learning_rate": 1.4572292054945452e-06, + "loss": 0.3879, + "step": 7790 + }, + { + "epoch": 0.7582481751824818, + "grad_norm": 1.895215097866571, + "learning_rate": 1.456117157439005e-06, + "loss": 0.5286, + "step": 7791 + }, + { + "epoch": 0.758345498783455, + "grad_norm": 1.465078675823223, + "learning_rate": 1.4550054615412812e-06, + "loss": 0.2829, + "step": 7792 + }, + { + "epoch": 0.7584428223844282, + "grad_norm": 1.8386053951410422, + "learning_rate": 1.4538941179118442e-06, + "loss": 0.4749, + "step": 7793 + }, + { + "epoch": 0.7585401459854014, + "grad_norm": 1.5042801033141018, + "learning_rate": 1.4527831266611264e-06, + "loss": 0.3377, + "step": 7794 + }, + { + "epoch": 0.7586374695863747, + "grad_norm": 1.2893867778378834, + "learning_rate": 1.4516724878995304e-06, + "loss": 0.2356, + "step": 7795 + }, + { + "epoch": 0.7587347931873479, + "grad_norm": 1.398051714828824, + "learning_rate": 1.4505622017374205e-06, + "loss": 0.4027, + "step": 7796 + }, + { + "epoch": 0.7588321167883212, + "grad_norm": 1.5210859881390626, + "learning_rate": 1.4494522682851275e-06, + "loss": 0.272, + "step": 7797 + }, + { + "epoch": 0.7589294403892944, + "grad_norm": 1.373616649760947, + "learning_rate": 1.448342687652944e-06, + "loss": 0.3663, + "step": 7798 + }, + { + "epoch": 0.7590267639902677, + "grad_norm": 1.4166560546627132, + "learning_rate": 1.4472334599511301e-06, + "loss": 0.2896, + "step": 7799 + }, + { + "epoch": 0.7591240875912408, + "grad_norm": 1.459583409721655, + "learning_rate": 1.446124585289913e-06, + "loss": 0.3518, + "step": 7800 + }, + { + "epoch": 0.7592214111922141, + "grad_norm": 1.3050258905758316, + "learning_rate": 1.4450160637794785e-06, + "loss": 0.313, + "step": 7801 + }, + { + "epoch": 0.7593187347931873, + "grad_norm": 1.4883062405628225, + "learning_rate": 1.4439078955299824e-06, + "loss": 0.2974, + "step": 7802 + }, + { + "epoch": 0.7594160583941606, + "grad_norm": 1.6446042441072921, + "learning_rate": 1.4428000806515452e-06, + "loss": 0.3362, + "step": 7803 + }, + { + "epoch": 0.7595133819951339, + "grad_norm": 1.1791736418357126, + "learning_rate": 1.4416926192542496e-06, + "loss": 0.1959, + "step": 7804 + }, + { + "epoch": 0.7596107055961071, + "grad_norm": 1.3456307047055482, + "learning_rate": 1.4405855114481466e-06, + "loss": 0.3606, + "step": 7805 + }, + { + "epoch": 0.7597080291970802, + "grad_norm": 1.2345304753376296, + "learning_rate": 1.4394787573432483e-06, + "loss": 0.1829, + "step": 7806 + }, + { + "epoch": 0.7598053527980535, + "grad_norm": 1.3927810756466952, + "learning_rate": 1.4383723570495362e-06, + "loss": 0.3062, + "step": 7807 + }, + { + "epoch": 0.7599026763990268, + "grad_norm": 1.6867380187933658, + "learning_rate": 1.4372663106769502e-06, + "loss": 0.3229, + "step": 7808 + }, + { + "epoch": 0.76, + "grad_norm": 1.5605568055945764, + "learning_rate": 1.4361606183354009e-06, + "loss": 0.5085, + "step": 7809 + }, + { + "epoch": 0.7600973236009733, + "grad_norm": 1.2578921085541062, + "learning_rate": 1.4350552801347617e-06, + "loss": 0.3916, + "step": 7810 + }, + { + "epoch": 0.7601946472019465, + "grad_norm": 1.7369681552917309, + "learning_rate": 1.4339502961848722e-06, + "loss": 0.369, + "step": 7811 + }, + { + "epoch": 0.7602919708029197, + "grad_norm": 1.3829141353175225, + "learning_rate": 1.4328456665955314e-06, + "loss": 0.2856, + "step": 7812 + }, + { + "epoch": 0.7603892944038929, + "grad_norm": 1.488430065814828, + "learning_rate": 1.43174139147651e-06, + "loss": 0.3318, + "step": 7813 + }, + { + "epoch": 0.7604866180048662, + "grad_norm": 1.671026852531797, + "learning_rate": 1.4306374709375391e-06, + "loss": 0.3974, + "step": 7814 + }, + { + "epoch": 0.7605839416058394, + "grad_norm": 2.0242786205747025, + "learning_rate": 1.4295339050883173e-06, + "loss": 0.4058, + "step": 7815 + }, + { + "epoch": 0.7606812652068127, + "grad_norm": 1.5431131416072197, + "learning_rate": 1.4284306940385056e-06, + "loss": 0.3425, + "step": 7816 + }, + { + "epoch": 0.7607785888077859, + "grad_norm": 1.58825289076571, + "learning_rate": 1.4273278378977312e-06, + "loss": 0.4379, + "step": 7817 + }, + { + "epoch": 0.7608759124087591, + "grad_norm": 1.4227612426828402, + "learning_rate": 1.4262253367755868e-06, + "loss": 0.3118, + "step": 7818 + }, + { + "epoch": 0.7609732360097323, + "grad_norm": 1.6009700516729723, + "learning_rate": 1.4251231907816259e-06, + "loss": 0.1862, + "step": 7819 + }, + { + "epoch": 0.7610705596107056, + "grad_norm": 1.4408050358414743, + "learning_rate": 1.4240214000253705e-06, + "loss": 0.3878, + "step": 7820 + }, + { + "epoch": 0.7611678832116788, + "grad_norm": 1.589081602628774, + "learning_rate": 1.4229199646163073e-06, + "loss": 0.3446, + "step": 7821 + }, + { + "epoch": 0.7612652068126521, + "grad_norm": 1.7189046616759744, + "learning_rate": 1.4218188846638864e-06, + "loss": 0.6199, + "step": 7822 + }, + { + "epoch": 0.7613625304136253, + "grad_norm": 1.3968952561516328, + "learning_rate": 1.4207181602775217e-06, + "loss": 0.3105, + "step": 7823 + }, + { + "epoch": 0.7614598540145986, + "grad_norm": 1.8200558984996396, + "learning_rate": 1.4196177915665926e-06, + "loss": 0.3522, + "step": 7824 + }, + { + "epoch": 0.7615571776155717, + "grad_norm": 1.4165994216816513, + "learning_rate": 1.4185177786404447e-06, + "loss": 0.3545, + "step": 7825 + }, + { + "epoch": 0.761654501216545, + "grad_norm": 1.5425284614595745, + "learning_rate": 1.4174181216083865e-06, + "loss": 0.3711, + "step": 7826 + }, + { + "epoch": 0.7617518248175182, + "grad_norm": 1.576025898163907, + "learning_rate": 1.4163188205796913e-06, + "loss": 0.4037, + "step": 7827 + }, + { + "epoch": 0.7618491484184915, + "grad_norm": 1.499544233381407, + "learning_rate": 1.4152198756635982e-06, + "loss": 0.4372, + "step": 7828 + }, + { + "epoch": 0.7619464720194647, + "grad_norm": 1.304424419262004, + "learning_rate": 1.4141212869693105e-06, + "loss": 0.3408, + "step": 7829 + }, + { + "epoch": 0.762043795620438, + "grad_norm": 1.287886944743282, + "learning_rate": 1.413023054605993e-06, + "loss": 0.3068, + "step": 7830 + }, + { + "epoch": 0.7621411192214111, + "grad_norm": 1.5897269241186747, + "learning_rate": 1.4119251786827793e-06, + "loss": 0.3535, + "step": 7831 + }, + { + "epoch": 0.7622384428223844, + "grad_norm": 1.7820270917911674, + "learning_rate": 1.4108276593087661e-06, + "loss": 0.3465, + "step": 7832 + }, + { + "epoch": 0.7623357664233577, + "grad_norm": 1.597271264286213, + "learning_rate": 1.409730496593016e-06, + "loss": 0.4709, + "step": 7833 + }, + { + "epoch": 0.7624330900243309, + "grad_norm": 1.261265818077985, + "learning_rate": 1.4086336906445518e-06, + "loss": 0.2795, + "step": 7834 + }, + { + "epoch": 0.7625304136253042, + "grad_norm": 1.4768601374901407, + "learning_rate": 1.407537241572365e-06, + "loss": 0.2749, + "step": 7835 + }, + { + "epoch": 0.7626277372262774, + "grad_norm": 1.5545240153325408, + "learning_rate": 1.4064411494854107e-06, + "loss": 0.308, + "step": 7836 + }, + { + "epoch": 0.7627250608272506, + "grad_norm": 1.4090072727272875, + "learning_rate": 1.4053454144926082e-06, + "loss": 0.2564, + "step": 7837 + }, + { + "epoch": 0.7628223844282238, + "grad_norm": 1.7237642146265113, + "learning_rate": 1.4042500367028411e-06, + "loss": 0.4133, + "step": 7838 + }, + { + "epoch": 0.7629197080291971, + "grad_norm": 1.8601904611593756, + "learning_rate": 1.4031550162249584e-06, + "loss": 0.4317, + "step": 7839 + }, + { + "epoch": 0.7630170316301703, + "grad_norm": 1.486526375781323, + "learning_rate": 1.4020603531677735e-06, + "loss": 0.3913, + "step": 7840 + }, + { + "epoch": 0.7631143552311436, + "grad_norm": 1.8767576244845232, + "learning_rate": 1.4009660476400617e-06, + "loss": 0.3976, + "step": 7841 + }, + { + "epoch": 0.7632116788321168, + "grad_norm": 1.4411281741316846, + "learning_rate": 1.3998720997505655e-06, + "loss": 0.3178, + "step": 7842 + }, + { + "epoch": 0.7633090024330901, + "grad_norm": 1.6761394553604179, + "learning_rate": 1.3987785096079936e-06, + "loss": 0.4988, + "step": 7843 + }, + { + "epoch": 0.7634063260340632, + "grad_norm": 1.5744253169761186, + "learning_rate": 1.3976852773210126e-06, + "loss": 0.4224, + "step": 7844 + }, + { + "epoch": 0.7635036496350365, + "grad_norm": 1.538350516678272, + "learning_rate": 1.3965924029982602e-06, + "loss": 0.5032, + "step": 7845 + }, + { + "epoch": 0.7636009732360097, + "grad_norm": 1.561645808384256, + "learning_rate": 1.3954998867483354e-06, + "loss": 0.3841, + "step": 7846 + }, + { + "epoch": 0.763698296836983, + "grad_norm": 1.5956655472214338, + "learning_rate": 1.3944077286798024e-06, + "loss": 0.4405, + "step": 7847 + }, + { + "epoch": 0.7637956204379562, + "grad_norm": 1.4960967778506404, + "learning_rate": 1.3933159289011893e-06, + "loss": 0.3865, + "step": 7848 + }, + { + "epoch": 0.7638929440389295, + "grad_norm": 1.6972202518838093, + "learning_rate": 1.3922244875209894e-06, + "loss": 0.4726, + "step": 7849 + }, + { + "epoch": 0.7639902676399026, + "grad_norm": 1.6894401417492455, + "learning_rate": 1.39113340464766e-06, + "loss": 0.3765, + "step": 7850 + }, + { + "epoch": 0.7640875912408759, + "grad_norm": 1.3053161111808653, + "learning_rate": 1.3900426803896234e-06, + "loss": 0.1602, + "step": 7851 + }, + { + "epoch": 0.7641849148418491, + "grad_norm": 1.7972190195900501, + "learning_rate": 1.3889523148552631e-06, + "loss": 0.3869, + "step": 7852 + }, + { + "epoch": 0.7642822384428224, + "grad_norm": 1.6204099384906139, + "learning_rate": 1.3878623081529307e-06, + "loss": 0.381, + "step": 7853 + }, + { + "epoch": 0.7643795620437956, + "grad_norm": 1.9378258831003112, + "learning_rate": 1.3867726603909427e-06, + "loss": 0.3087, + "step": 7854 + }, + { + "epoch": 0.7644768856447689, + "grad_norm": 1.51676245411365, + "learning_rate": 1.3856833716775747e-06, + "loss": 0.373, + "step": 7855 + }, + { + "epoch": 0.764574209245742, + "grad_norm": 1.7112315722100961, + "learning_rate": 1.3845944421210717e-06, + "loss": 0.5464, + "step": 7856 + }, + { + "epoch": 0.7646715328467153, + "grad_norm": 1.7847025689190414, + "learning_rate": 1.3835058718296412e-06, + "loss": 0.4499, + "step": 7857 + }, + { + "epoch": 0.7647688564476885, + "grad_norm": 2.039767673203135, + "learning_rate": 1.3824176609114549e-06, + "loss": 0.7068, + "step": 7858 + }, + { + "epoch": 0.7648661800486618, + "grad_norm": 1.7629770323262892, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.3078, + "step": 7859 + }, + { + "epoch": 0.7649635036496351, + "grad_norm": 1.6502215959115287, + "learning_rate": 1.3802423176273245e-06, + "loss": 0.4482, + "step": 7860 + }, + { + "epoch": 0.7650608272506083, + "grad_norm": 1.545163681840995, + "learning_rate": 1.379155185477546e-06, + "loss": 0.6092, + "step": 7861 + }, + { + "epoch": 0.7651581508515815, + "grad_norm": 1.5158719520340032, + "learning_rate": 1.3780684131333432e-06, + "loss": 0.325, + "step": 7862 + }, + { + "epoch": 0.7652554744525547, + "grad_norm": 1.5411986422432038, + "learning_rate": 1.3769820007027075e-06, + "loss": 0.3506, + "step": 7863 + }, + { + "epoch": 0.765352798053528, + "grad_norm": 1.6676478200016922, + "learning_rate": 1.3758959482935964e-06, + "loss": 0.4474, + "step": 7864 + }, + { + "epoch": 0.7654501216545012, + "grad_norm": 1.855395763367132, + "learning_rate": 1.3748102560139348e-06, + "loss": 0.5629, + "step": 7865 + }, + { + "epoch": 0.7655474452554745, + "grad_norm": 1.7443690305284454, + "learning_rate": 1.3737249239716043e-06, + "loss": 0.5082, + "step": 7866 + }, + { + "epoch": 0.7656447688564477, + "grad_norm": 1.854639385984423, + "learning_rate": 1.3726399522744572e-06, + "loss": 0.6939, + "step": 7867 + }, + { + "epoch": 0.765742092457421, + "grad_norm": 1.7386780703795695, + "learning_rate": 1.3715553410303073e-06, + "loss": 0.3926, + "step": 7868 + }, + { + "epoch": 0.7658394160583941, + "grad_norm": 1.6307682948462758, + "learning_rate": 1.3704710903469337e-06, + "loss": 0.2702, + "step": 7869 + }, + { + "epoch": 0.7659367396593674, + "grad_norm": 1.5942180918514872, + "learning_rate": 1.3693872003320785e-06, + "loss": 0.4001, + "step": 7870 + }, + { + "epoch": 0.7660340632603406, + "grad_norm": 1.761263431148608, + "learning_rate": 1.3683036710934495e-06, + "loss": 0.4441, + "step": 7871 + }, + { + "epoch": 0.7661313868613139, + "grad_norm": 1.6133022353167517, + "learning_rate": 1.3672205027387176e-06, + "loss": 0.3181, + "step": 7872 + }, + { + "epoch": 0.7662287104622871, + "grad_norm": 1.7916159585681903, + "learning_rate": 1.3661376953755157e-06, + "loss": 0.447, + "step": 7873 + }, + { + "epoch": 0.7663260340632604, + "grad_norm": 1.5488561408839132, + "learning_rate": 1.365055249111445e-06, + "loss": 0.3443, + "step": 7874 + }, + { + "epoch": 0.7664233576642335, + "grad_norm": 1.5946271389992275, + "learning_rate": 1.3639731640540683e-06, + "loss": 0.4798, + "step": 7875 + }, + { + "epoch": 0.7665206812652068, + "grad_norm": 1.5428916429012827, + "learning_rate": 1.3628914403109144e-06, + "loss": 0.4648, + "step": 7876 + }, + { + "epoch": 0.76661800486618, + "grad_norm": 1.4108119296173862, + "learning_rate": 1.3618100779894728e-06, + "loss": 0.3672, + "step": 7877 + }, + { + "epoch": 0.7667153284671533, + "grad_norm": 1.578416098761305, + "learning_rate": 1.3607290771971993e-06, + "loss": 0.2838, + "step": 7878 + }, + { + "epoch": 0.7668126520681265, + "grad_norm": 1.6785686226215506, + "learning_rate": 1.3596484380415142e-06, + "loss": 0.3681, + "step": 7879 + }, + { + "epoch": 0.7669099756690998, + "grad_norm": 1.601051137280256, + "learning_rate": 1.3585681606298017e-06, + "loss": 0.3028, + "step": 7880 + }, + { + "epoch": 0.7670072992700729, + "grad_norm": 1.4241479166551427, + "learning_rate": 1.3574882450694094e-06, + "loss": 0.4884, + "step": 7881 + }, + { + "epoch": 0.7671046228710462, + "grad_norm": 1.5174550285148696, + "learning_rate": 1.356408691467649e-06, + "loss": 0.3915, + "step": 7882 + }, + { + "epoch": 0.7672019464720194, + "grad_norm": 2.041792988923985, + "learning_rate": 1.3553294999317974e-06, + "loss": 0.3818, + "step": 7883 + }, + { + "epoch": 0.7672992700729927, + "grad_norm": 1.525606850747028, + "learning_rate": 1.3542506705690927e-06, + "loss": 0.2335, + "step": 7884 + }, + { + "epoch": 0.767396593673966, + "grad_norm": 1.3961979498933004, + "learning_rate": 1.3531722034867395e-06, + "loss": 0.3052, + "step": 7885 + }, + { + "epoch": 0.7674939172749392, + "grad_norm": 1.4639885198737117, + "learning_rate": 1.3520940987919056e-06, + "loss": 0.3626, + "step": 7886 + }, + { + "epoch": 0.7675912408759125, + "grad_norm": 1.495042476369582, + "learning_rate": 1.3510163565917257e-06, + "loss": 0.3448, + "step": 7887 + }, + { + "epoch": 0.7676885644768856, + "grad_norm": 0.9228719073278262, + "learning_rate": 1.3499389769932909e-06, + "loss": 0.1899, + "step": 7888 + }, + { + "epoch": 0.7677858880778589, + "grad_norm": 1.5920309631844594, + "learning_rate": 1.348861960103664e-06, + "loss": 0.3862, + "step": 7889 + }, + { + "epoch": 0.7678832116788321, + "grad_norm": 1.8311121735767772, + "learning_rate": 1.347785306029868e-06, + "loss": 0.4002, + "step": 7890 + }, + { + "epoch": 0.7679805352798054, + "grad_norm": 1.4257687920849145, + "learning_rate": 1.3467090148788914e-06, + "loss": 0.1932, + "step": 7891 + }, + { + "epoch": 0.7680778588807786, + "grad_norm": 1.492613726552967, + "learning_rate": 1.345633086757685e-06, + "loss": 0.4393, + "step": 7892 + }, + { + "epoch": 0.7681751824817519, + "grad_norm": 1.2505367080504926, + "learning_rate": 1.3445575217731649e-06, + "loss": 0.2361, + "step": 7893 + }, + { + "epoch": 0.768272506082725, + "grad_norm": 1.4768120287410489, + "learning_rate": 1.3434823200322122e-06, + "loss": 0.2951, + "step": 7894 + }, + { + "epoch": 0.7683698296836983, + "grad_norm": 1.614513453130117, + "learning_rate": 1.3424074816416677e-06, + "loss": 0.3146, + "step": 7895 + }, + { + "epoch": 0.7684671532846715, + "grad_norm": 1.3904464264130176, + "learning_rate": 1.3413330067083396e-06, + "loss": 0.2444, + "step": 7896 + }, + { + "epoch": 0.7685644768856448, + "grad_norm": 1.5225882486158226, + "learning_rate": 1.3402588953389995e-06, + "loss": 0.4455, + "step": 7897 + }, + { + "epoch": 0.768661800486618, + "grad_norm": 1.586214839945439, + "learning_rate": 1.3391851476403838e-06, + "loss": 0.4929, + "step": 7898 + }, + { + "epoch": 0.7687591240875913, + "grad_norm": 1.4154122165313887, + "learning_rate": 1.3381117637191887e-06, + "loss": 0.3174, + "step": 7899 + }, + { + "epoch": 0.7688564476885644, + "grad_norm": 1.4414087311017771, + "learning_rate": 1.3370387436820787e-06, + "loss": 0.3853, + "step": 7900 + }, + { + "epoch": 0.7689537712895377, + "grad_norm": 1.4998612339682924, + "learning_rate": 1.3359660876356801e-06, + "loss": 0.4182, + "step": 7901 + }, + { + "epoch": 0.7690510948905109, + "grad_norm": 1.5230925340600847, + "learning_rate": 1.3348937956865833e-06, + "loss": 0.4115, + "step": 7902 + }, + { + "epoch": 0.7691484184914842, + "grad_norm": 1.4115192874073097, + "learning_rate": 1.3338218679413423e-06, + "loss": 0.2342, + "step": 7903 + }, + { + "epoch": 0.7692457420924574, + "grad_norm": 1.440382762079926, + "learning_rate": 1.332750304506476e-06, + "loss": 0.2579, + "step": 7904 + }, + { + "epoch": 0.7693430656934307, + "grad_norm": 1.7989096359026238, + "learning_rate": 1.3316791054884676e-06, + "loss": 0.162, + "step": 7905 + }, + { + "epoch": 0.769440389294404, + "grad_norm": 1.481137811460323, + "learning_rate": 1.3306082709937602e-06, + "loss": 0.5439, + "step": 7906 + }, + { + "epoch": 0.7695377128953771, + "grad_norm": 1.2572382214218802, + "learning_rate": 1.3295378011287636e-06, + "loss": 0.212, + "step": 7907 + }, + { + "epoch": 0.7696350364963503, + "grad_norm": 1.4760885941451831, + "learning_rate": 1.3284676959998539e-06, + "loss": 0.3592, + "step": 7908 + }, + { + "epoch": 0.7697323600973236, + "grad_norm": 1.2650407926292302, + "learning_rate": 1.3273979557133636e-06, + "loss": 0.2517, + "step": 7909 + }, + { + "epoch": 0.7698296836982969, + "grad_norm": 1.391262893959748, + "learning_rate": 1.3263285803755965e-06, + "loss": 0.2073, + "step": 7910 + }, + { + "epoch": 0.7699270072992701, + "grad_norm": 1.5035507772870098, + "learning_rate": 1.325259570092816e-06, + "loss": 0.3162, + "step": 7911 + }, + { + "epoch": 0.7700243309002434, + "grad_norm": 1.599355227969432, + "learning_rate": 1.3241909249712508e-06, + "loss": 0.595, + "step": 7912 + }, + { + "epoch": 0.7701216545012165, + "grad_norm": 1.6979665066321694, + "learning_rate": 1.3231226451170932e-06, + "loss": 0.4382, + "step": 7913 + }, + { + "epoch": 0.7702189781021898, + "grad_norm": 1.5929669719272659, + "learning_rate": 1.3220547306364977e-06, + "loss": 0.3768, + "step": 7914 + }, + { + "epoch": 0.770316301703163, + "grad_norm": 1.8145439199144737, + "learning_rate": 1.3209871816355846e-06, + "loss": 0.6182, + "step": 7915 + }, + { + "epoch": 0.7704136253041363, + "grad_norm": 1.4754303320605837, + "learning_rate": 1.3199199982204375e-06, + "loss": 0.3324, + "step": 7916 + }, + { + "epoch": 0.7705109489051095, + "grad_norm": 1.3790560632862776, + "learning_rate": 1.3188531804971012e-06, + "loss": 0.3002, + "step": 7917 + }, + { + "epoch": 0.7706082725060828, + "grad_norm": 1.6775435848799138, + "learning_rate": 1.3177867285715868e-06, + "loss": 0.4678, + "step": 7918 + }, + { + "epoch": 0.7707055961070559, + "grad_norm": 1.468909559308031, + "learning_rate": 1.3167206425498702e-06, + "loss": 0.3819, + "step": 7919 + }, + { + "epoch": 0.7708029197080292, + "grad_norm": 1.5287200892369317, + "learning_rate": 1.3156549225378855e-06, + "loss": 0.4261, + "step": 7920 + }, + { + "epoch": 0.7709002433090024, + "grad_norm": 1.720639871740175, + "learning_rate": 1.3145895686415355e-06, + "loss": 0.2768, + "step": 7921 + }, + { + "epoch": 0.7709975669099757, + "grad_norm": 1.6207703049408864, + "learning_rate": 1.3135245809666842e-06, + "loss": 0.5777, + "step": 7922 + }, + { + "epoch": 0.7710948905109489, + "grad_norm": 1.233996466045791, + "learning_rate": 1.312459959619164e-06, + "loss": 0.1403, + "step": 7923 + }, + { + "epoch": 0.7711922141119222, + "grad_norm": 1.44915919425724, + "learning_rate": 1.3113957047047627e-06, + "loss": 0.373, + "step": 7924 + }, + { + "epoch": 0.7712895377128953, + "grad_norm": 1.7193201679900614, + "learning_rate": 1.3103318163292379e-06, + "loss": 0.4619, + "step": 7925 + }, + { + "epoch": 0.7713868613138686, + "grad_norm": 1.5739673196000434, + "learning_rate": 1.3092682945983092e-06, + "loss": 0.2611, + "step": 7926 + }, + { + "epoch": 0.7714841849148418, + "grad_norm": 1.6511175416986905, + "learning_rate": 1.308205139617657e-06, + "loss": 0.3991, + "step": 7927 + }, + { + "epoch": 0.7715815085158151, + "grad_norm": 1.41146772693544, + "learning_rate": 1.3071423514929293e-06, + "loss": 0.297, + "step": 7928 + }, + { + "epoch": 0.7716788321167883, + "grad_norm": 1.86499865575338, + "learning_rate": 1.3060799303297367e-06, + "loss": 0.33, + "step": 7929 + }, + { + "epoch": 0.7717761557177616, + "grad_norm": 1.5355940721042187, + "learning_rate": 1.3050178762336524e-06, + "loss": 0.3848, + "step": 7930 + }, + { + "epoch": 0.7718734793187348, + "grad_norm": 1.5958842031207991, + "learning_rate": 1.303956189310212e-06, + "loss": 0.5061, + "step": 7931 + }, + { + "epoch": 0.771970802919708, + "grad_norm": 1.4531090027943792, + "learning_rate": 1.3028948696649164e-06, + "loss": 0.3583, + "step": 7932 + }, + { + "epoch": 0.7720681265206812, + "grad_norm": 1.2544158185353156, + "learning_rate": 1.301833917403229e-06, + "loss": 0.2189, + "step": 7933 + }, + { + "epoch": 0.7721654501216545, + "grad_norm": 1.5100489026165533, + "learning_rate": 1.300773332630581e-06, + "loss": 0.385, + "step": 7934 + }, + { + "epoch": 0.7722627737226277, + "grad_norm": 1.3970155359358676, + "learning_rate": 1.299713115452359e-06, + "loss": 0.2527, + "step": 7935 + }, + { + "epoch": 0.772360097323601, + "grad_norm": 1.5473430195255553, + "learning_rate": 1.2986532659739193e-06, + "loss": 0.4161, + "step": 7936 + }, + { + "epoch": 0.7724574209245743, + "grad_norm": 1.764788944496118, + "learning_rate": 1.2975937843005815e-06, + "loss": 0.579, + "step": 7937 + }, + { + "epoch": 0.7725547445255474, + "grad_norm": 1.4491692300649166, + "learning_rate": 1.296534670537623e-06, + "loss": 0.2159, + "step": 7938 + }, + { + "epoch": 0.7726520681265207, + "grad_norm": 1.5103122368150699, + "learning_rate": 1.2954759247902905e-06, + "loss": 0.3466, + "step": 7939 + }, + { + "epoch": 0.7727493917274939, + "grad_norm": 1.7823858822196685, + "learning_rate": 1.2944175471637927e-06, + "loss": 0.3276, + "step": 7940 + }, + { + "epoch": 0.7728467153284672, + "grad_norm": 1.799922846094268, + "learning_rate": 1.2933595377633023e-06, + "loss": 0.3348, + "step": 7941 + }, + { + "epoch": 0.7729440389294404, + "grad_norm": 1.6302377943925448, + "learning_rate": 1.292301896693951e-06, + "loss": 0.3943, + "step": 7942 + }, + { + "epoch": 0.7730413625304137, + "grad_norm": 1.6093218155748727, + "learning_rate": 1.2912446240608385e-06, + "loss": 0.419, + "step": 7943 + }, + { + "epoch": 0.7731386861313868, + "grad_norm": 1.4107342747932858, + "learning_rate": 1.2901877199690294e-06, + "loss": 0.4163, + "step": 7944 + }, + { + "epoch": 0.7732360097323601, + "grad_norm": 1.7619285712296502, + "learning_rate": 1.289131184523546e-06, + "loss": 0.3296, + "step": 7945 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 1.501127613932862, + "learning_rate": 1.288075017829377e-06, + "loss": 0.3592, + "step": 7946 + }, + { + "epoch": 0.7734306569343066, + "grad_norm": 1.5880491733576618, + "learning_rate": 1.2870192199914754e-06, + "loss": 0.3717, + "step": 7947 + }, + { + "epoch": 0.7735279805352798, + "grad_norm": 1.7300785218869492, + "learning_rate": 1.2859637911147576e-06, + "loss": 0.4578, + "step": 7948 + }, + { + "epoch": 0.7736253041362531, + "grad_norm": 1.6364871933316798, + "learning_rate": 1.2849087313040992e-06, + "loss": 0.3168, + "step": 7949 + }, + { + "epoch": 0.7737226277372263, + "grad_norm": 1.8650216665364483, + "learning_rate": 1.283854040664344e-06, + "loss": 0.3454, + "step": 7950 + }, + { + "epoch": 0.7738199513381995, + "grad_norm": 1.6512436688032501, + "learning_rate": 1.2827997193002966e-06, + "loss": 0.4521, + "step": 7951 + }, + { + "epoch": 0.7739172749391727, + "grad_norm": 1.6172163956129242, + "learning_rate": 1.2817457673167273e-06, + "loss": 0.4285, + "step": 7952 + }, + { + "epoch": 0.774014598540146, + "grad_norm": 1.658855619884036, + "learning_rate": 1.2806921848183635e-06, + "loss": 0.4285, + "step": 7953 + }, + { + "epoch": 0.7741119221411192, + "grad_norm": 1.7924251355614291, + "learning_rate": 1.279638971909905e-06, + "loss": 0.4577, + "step": 7954 + }, + { + "epoch": 0.7742092457420925, + "grad_norm": 1.920043196390925, + "learning_rate": 1.27858612869601e-06, + "loss": 0.2457, + "step": 7955 + }, + { + "epoch": 0.7743065693430657, + "grad_norm": 1.8335508376481173, + "learning_rate": 1.2775336552812972e-06, + "loss": 0.3851, + "step": 7956 + }, + { + "epoch": 0.7744038929440389, + "grad_norm": 1.4542453507756816, + "learning_rate": 1.2764815517703526e-06, + "loss": 0.423, + "step": 7957 + }, + { + "epoch": 0.7745012165450121, + "grad_norm": 1.547732106120922, + "learning_rate": 1.2754298182677256e-06, + "loss": 0.3262, + "step": 7958 + }, + { + "epoch": 0.7745985401459854, + "grad_norm": 1.2077361088883485, + "learning_rate": 1.274378454877928e-06, + "loss": 0.3275, + "step": 7959 + }, + { + "epoch": 0.7746958637469586, + "grad_norm": 1.922826019938908, + "learning_rate": 1.273327461705432e-06, + "loss": 0.3291, + "step": 7960 + }, + { + "epoch": 0.7747931873479319, + "grad_norm": 1.4497732248223738, + "learning_rate": 1.272276838854677e-06, + "loss": 0.3398, + "step": 7961 + }, + { + "epoch": 0.7748905109489052, + "grad_norm": 1.834323124582585, + "learning_rate": 1.2712265864300643e-06, + "loss": 0.3874, + "step": 7962 + }, + { + "epoch": 0.7749878345498783, + "grad_norm": 1.6295120978759328, + "learning_rate": 1.2701767045359553e-06, + "loss": 0.3265, + "step": 7963 + }, + { + "epoch": 0.7750851581508515, + "grad_norm": 1.4943415089120586, + "learning_rate": 1.2691271932766813e-06, + "loss": 0.2417, + "step": 7964 + }, + { + "epoch": 0.7751824817518248, + "grad_norm": 1.7435244916897348, + "learning_rate": 1.2680780527565313e-06, + "loss": 0.6215, + "step": 7965 + }, + { + "epoch": 0.7752798053527981, + "grad_norm": 1.2675653818486237, + "learning_rate": 1.2670292830797604e-06, + "loss": 0.2425, + "step": 7966 + }, + { + "epoch": 0.7753771289537713, + "grad_norm": 2.3643454433432023, + "learning_rate": 1.2659808843505822e-06, + "loss": 0.5044, + "step": 7967 + }, + { + "epoch": 0.7754744525547446, + "grad_norm": 1.53184015228964, + "learning_rate": 1.264932856673179e-06, + "loss": 0.2696, + "step": 7968 + }, + { + "epoch": 0.7755717761557177, + "grad_norm": 1.6727865710886582, + "learning_rate": 1.2638852001516932e-06, + "loss": 0.5426, + "step": 7969 + }, + { + "epoch": 0.775669099756691, + "grad_norm": 1.553958741188976, + "learning_rate": 1.2628379148902336e-06, + "loss": 0.4019, + "step": 7970 + }, + { + "epoch": 0.7757664233576642, + "grad_norm": 1.6042204596017222, + "learning_rate": 1.2617910009928659e-06, + "loss": 0.5246, + "step": 7971 + }, + { + "epoch": 0.7758637469586375, + "grad_norm": 1.7279109608439713, + "learning_rate": 1.2607444585636235e-06, + "loss": 0.4261, + "step": 7972 + }, + { + "epoch": 0.7759610705596107, + "grad_norm": 1.627177979683393, + "learning_rate": 1.2596982877065028e-06, + "loss": 0.2527, + "step": 7973 + }, + { + "epoch": 0.776058394160584, + "grad_norm": 1.3765880777188844, + "learning_rate": 1.2586524885254625e-06, + "loss": 0.2785, + "step": 7974 + }, + { + "epoch": 0.7761557177615572, + "grad_norm": 1.5227665915517687, + "learning_rate": 1.257607061124424e-06, + "loss": 0.3853, + "step": 7975 + }, + { + "epoch": 0.7762530413625304, + "grad_norm": 1.4206833089936424, + "learning_rate": 1.256562005607272e-06, + "loss": 0.3306, + "step": 7976 + }, + { + "epoch": 0.7763503649635036, + "grad_norm": 1.49817944354141, + "learning_rate": 1.2555173220778555e-06, + "loss": 0.3496, + "step": 7977 + }, + { + "epoch": 0.7764476885644769, + "grad_norm": 1.4108983250921605, + "learning_rate": 1.2544730106399833e-06, + "loss": 0.3183, + "step": 7978 + }, + { + "epoch": 0.7765450121654501, + "grad_norm": 1.4680199939499186, + "learning_rate": 1.25342907139743e-06, + "loss": 0.2122, + "step": 7979 + }, + { + "epoch": 0.7766423357664234, + "grad_norm": 1.450915389672003, + "learning_rate": 1.252385504453933e-06, + "loss": 0.1901, + "step": 7980 + }, + { + "epoch": 0.7767396593673966, + "grad_norm": 1.944774997709897, + "learning_rate": 1.2513423099131926e-06, + "loss": 0.4237, + "step": 7981 + }, + { + "epoch": 0.7768369829683698, + "grad_norm": 1.5308960486066336, + "learning_rate": 1.2502994878788704e-06, + "loss": 0.3784, + "step": 7982 + }, + { + "epoch": 0.776934306569343, + "grad_norm": 1.3381063245657632, + "learning_rate": 1.2492570384545927e-06, + "loss": 0.2809, + "step": 7983 + }, + { + "epoch": 0.7770316301703163, + "grad_norm": 1.502570492936247, + "learning_rate": 1.2482149617439481e-06, + "loss": 0.4767, + "step": 7984 + }, + { + "epoch": 0.7771289537712895, + "grad_norm": 1.5801319919452912, + "learning_rate": 1.2471732578504886e-06, + "loss": 0.2829, + "step": 7985 + }, + { + "epoch": 0.7772262773722628, + "grad_norm": 1.6245317518067086, + "learning_rate": 1.2461319268777294e-06, + "loss": 0.4292, + "step": 7986 + }, + { + "epoch": 0.777323600973236, + "grad_norm": 1.4848782940964584, + "learning_rate": 1.245090968929148e-06, + "loss": 0.1644, + "step": 7987 + }, + { + "epoch": 0.7774209245742092, + "grad_norm": 1.4583890177676826, + "learning_rate": 1.2440503841081863e-06, + "loss": 0.374, + "step": 7988 + }, + { + "epoch": 0.7775182481751824, + "grad_norm": 1.2261059148133207, + "learning_rate": 1.2430101725182453e-06, + "loss": 0.279, + "step": 7989 + }, + { + "epoch": 0.7776155717761557, + "grad_norm": 1.7131218748679982, + "learning_rate": 1.2419703342626925e-06, + "loss": 0.2968, + "step": 7990 + }, + { + "epoch": 0.777712895377129, + "grad_norm": 1.5447437747966324, + "learning_rate": 1.240930869444859e-06, + "loss": 0.4084, + "step": 7991 + }, + { + "epoch": 0.7778102189781022, + "grad_norm": 1.7455519960122845, + "learning_rate": 1.2398917781680342e-06, + "loss": 0.4937, + "step": 7992 + }, + { + "epoch": 0.7779075425790755, + "grad_norm": 1.7308898826670602, + "learning_rate": 1.2388530605354742e-06, + "loss": 0.4086, + "step": 7993 + }, + { + "epoch": 0.7780048661800487, + "grad_norm": 1.6835369622727692, + "learning_rate": 1.237814716650398e-06, + "loss": 0.3949, + "step": 7994 + }, + { + "epoch": 0.7781021897810219, + "grad_norm": 1.5331711686807037, + "learning_rate": 1.2367767466159853e-06, + "loss": 0.4158, + "step": 7995 + }, + { + "epoch": 0.7781995133819951, + "grad_norm": 1.4941267424293787, + "learning_rate": 1.2357391505353799e-06, + "loss": 0.2932, + "step": 7996 + }, + { + "epoch": 0.7782968369829684, + "grad_norm": 1.5143531103122556, + "learning_rate": 1.2347019285116897e-06, + "loss": 0.3148, + "step": 7997 + }, + { + "epoch": 0.7783941605839416, + "grad_norm": 1.4189651393358438, + "learning_rate": 1.2336650806479826e-06, + "loss": 0.3945, + "step": 7998 + }, + { + "epoch": 0.7784914841849149, + "grad_norm": 1.3358253744264663, + "learning_rate": 1.2326286070472932e-06, + "loss": 0.3246, + "step": 7999 + }, + { + "epoch": 0.7785888077858881, + "grad_norm": 1.6203173145307268, + "learning_rate": 1.2315925078126134e-06, + "loss": 0.2273, + "step": 8000 + }, + { + "epoch": 0.7786861313868613, + "grad_norm": 1.5546295308562903, + "learning_rate": 1.2305567830469016e-06, + "loss": 0.4466, + "step": 8001 + }, + { + "epoch": 0.7787834549878345, + "grad_norm": 1.4270934872602479, + "learning_rate": 1.2295214328530813e-06, + "loss": 0.3937, + "step": 8002 + }, + { + "epoch": 0.7788807785888078, + "grad_norm": 2.5353515737845216, + "learning_rate": 1.2284864573340322e-06, + "loss": 0.2474, + "step": 8003 + }, + { + "epoch": 0.778978102189781, + "grad_norm": 1.6056821361100397, + "learning_rate": 1.227451856592602e-06, + "loss": 0.2712, + "step": 8004 + }, + { + "epoch": 0.7790754257907543, + "grad_norm": 1.6203804377086684, + "learning_rate": 1.2264176307315995e-06, + "loss": 0.3325, + "step": 8005 + }, + { + "epoch": 0.7791727493917275, + "grad_norm": 1.8788726232616566, + "learning_rate": 1.2253837798537966e-06, + "loss": 0.5098, + "step": 8006 + }, + { + "epoch": 0.7792700729927007, + "grad_norm": 1.4545499316601775, + "learning_rate": 1.224350304061927e-06, + "loss": 0.3515, + "step": 8007 + }, + { + "epoch": 0.7793673965936739, + "grad_norm": 1.4250792564971901, + "learning_rate": 1.2233172034586887e-06, + "loss": 0.3441, + "step": 8008 + }, + { + "epoch": 0.7794647201946472, + "grad_norm": 1.9250344459127062, + "learning_rate": 1.222284478146743e-06, + "loss": 0.4836, + "step": 8009 + }, + { + "epoch": 0.7795620437956204, + "grad_norm": 1.6147609325888719, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.5422, + "step": 8010 + }, + { + "epoch": 0.7796593673965937, + "grad_norm": 1.4610419414539406, + "learning_rate": 1.220220153807174e-06, + "loss": 0.3588, + "step": 8011 + }, + { + "epoch": 0.779756690997567, + "grad_norm": 1.5543411865974044, + "learning_rate": 1.219188554984685e-06, + "loss": 0.2768, + "step": 8012 + }, + { + "epoch": 0.7798540145985401, + "grad_norm": 1.6054582061792608, + "learning_rate": 1.2181573318637546e-06, + "loss": 0.3668, + "step": 8013 + }, + { + "epoch": 0.7799513381995133, + "grad_norm": 1.6393052378684185, + "learning_rate": 1.2171264845468533e-06, + "loss": 0.3443, + "step": 8014 + }, + { + "epoch": 0.7800486618004866, + "grad_norm": 1.4765436080450656, + "learning_rate": 1.2160960131364186e-06, + "loss": 0.2828, + "step": 8015 + }, + { + "epoch": 0.7801459854014599, + "grad_norm": 1.8766210542130564, + "learning_rate": 1.2150659177348489e-06, + "loss": 0.3658, + "step": 8016 + }, + { + "epoch": 0.7802433090024331, + "grad_norm": 1.7191226208521349, + "learning_rate": 1.2140361984445048e-06, + "loss": 0.4229, + "step": 8017 + }, + { + "epoch": 0.7803406326034064, + "grad_norm": 1.8684570593224075, + "learning_rate": 1.213006855367711e-06, + "loss": 0.3817, + "step": 8018 + }, + { + "epoch": 0.7804379562043796, + "grad_norm": 1.4600234002367536, + "learning_rate": 1.211977888606754e-06, + "loss": 0.2087, + "step": 8019 + }, + { + "epoch": 0.7805352798053528, + "grad_norm": 1.6860837998431109, + "learning_rate": 1.2109492982638838e-06, + "loss": 0.4425, + "step": 8020 + }, + { + "epoch": 0.780632603406326, + "grad_norm": 1.766303075707019, + "learning_rate": 1.209921084441309e-06, + "loss": 0.3954, + "step": 8021 + }, + { + "epoch": 0.7807299270072993, + "grad_norm": 1.5446852781359672, + "learning_rate": 1.2088932472412057e-06, + "loss": 0.332, + "step": 8022 + }, + { + "epoch": 0.7808272506082725, + "grad_norm": 1.420104465000044, + "learning_rate": 1.2078657867657107e-06, + "loss": 0.2502, + "step": 8023 + }, + { + "epoch": 0.7809245742092458, + "grad_norm": 1.4736275543260646, + "learning_rate": 1.2068387031169248e-06, + "loss": 0.344, + "step": 8024 + }, + { + "epoch": 0.781021897810219, + "grad_norm": 1.6352215385252125, + "learning_rate": 1.2058119963969072e-06, + "loss": 0.3472, + "step": 8025 + }, + { + "epoch": 0.7811192214111922, + "grad_norm": 1.5341555995980578, + "learning_rate": 1.2047856667076835e-06, + "loss": 0.3093, + "step": 8026 + }, + { + "epoch": 0.7812165450121654, + "grad_norm": 1.8095254496218005, + "learning_rate": 1.2037597141512403e-06, + "loss": 0.5334, + "step": 8027 + }, + { + "epoch": 0.7813138686131387, + "grad_norm": 1.4914088589389176, + "learning_rate": 1.2027341388295283e-06, + "loss": 0.3996, + "step": 8028 + }, + { + "epoch": 0.7814111922141119, + "grad_norm": 1.3757463510344643, + "learning_rate": 1.2017089408444587e-06, + "loss": 0.3198, + "step": 8029 + }, + { + "epoch": 0.7815085158150852, + "grad_norm": 1.7849985132291357, + "learning_rate": 1.2006841202979057e-06, + "loss": 0.5423, + "step": 8030 + }, + { + "epoch": 0.7816058394160584, + "grad_norm": 1.593816718406655, + "learning_rate": 1.1996596772917091e-06, + "loss": 0.4319, + "step": 8031 + }, + { + "epoch": 0.7817031630170316, + "grad_norm": 1.731251627787021, + "learning_rate": 1.198635611927665e-06, + "loss": 0.542, + "step": 8032 + }, + { + "epoch": 0.7818004866180048, + "grad_norm": 1.2907197414104656, + "learning_rate": 1.1976119243075357e-06, + "loss": 0.3468, + "step": 8033 + }, + { + "epoch": 0.7818978102189781, + "grad_norm": 1.4988732874719506, + "learning_rate": 1.1965886145330475e-06, + "loss": 0.2441, + "step": 8034 + }, + { + "epoch": 0.7819951338199513, + "grad_norm": 1.552285490589026, + "learning_rate": 1.1955656827058877e-06, + "loss": 0.2507, + "step": 8035 + }, + { + "epoch": 0.7820924574209246, + "grad_norm": 1.4855226597503774, + "learning_rate": 1.1945431289277026e-06, + "loss": 0.3212, + "step": 8036 + }, + { + "epoch": 0.7821897810218978, + "grad_norm": 1.3533866347463486, + "learning_rate": 1.1935209533001057e-06, + "loss": 0.2935, + "step": 8037 + }, + { + "epoch": 0.7822871046228711, + "grad_norm": 1.4199810619165956, + "learning_rate": 1.1924991559246718e-06, + "loss": 0.2156, + "step": 8038 + }, + { + "epoch": 0.7823844282238442, + "grad_norm": 1.82290229428393, + "learning_rate": 1.1914777369029368e-06, + "loss": 0.3301, + "step": 8039 + }, + { + "epoch": 0.7824817518248175, + "grad_norm": 1.560588583072571, + "learning_rate": 1.1904566963364e-06, + "loss": 0.4295, + "step": 8040 + }, + { + "epoch": 0.7825790754257907, + "grad_norm": 1.4340165265832168, + "learning_rate": 1.1894360343265226e-06, + "loss": 0.2228, + "step": 8041 + }, + { + "epoch": 0.782676399026764, + "grad_norm": 1.4549755495514283, + "learning_rate": 1.1884157509747307e-06, + "loss": 0.2696, + "step": 8042 + }, + { + "epoch": 0.7827737226277373, + "grad_norm": 1.777563127665598, + "learning_rate": 1.1873958463824064e-06, + "loss": 0.4019, + "step": 8043 + }, + { + "epoch": 0.7828710462287105, + "grad_norm": 1.510922165391459, + "learning_rate": 1.1863763206509004e-06, + "loss": 0.3629, + "step": 8044 + }, + { + "epoch": 0.7829683698296837, + "grad_norm": 1.7007586637582162, + "learning_rate": 1.185357173881525e-06, + "loss": 0.3667, + "step": 8045 + }, + { + "epoch": 0.7830656934306569, + "grad_norm": 1.63381087886566, + "learning_rate": 1.1843384061755503e-06, + "loss": 0.4466, + "step": 8046 + }, + { + "epoch": 0.7831630170316302, + "grad_norm": 1.6295364587854009, + "learning_rate": 1.1833200176342136e-06, + "loss": 0.5516, + "step": 8047 + }, + { + "epoch": 0.7832603406326034, + "grad_norm": 1.8901840830776038, + "learning_rate": 1.1823020083587127e-06, + "loss": 0.4597, + "step": 8048 + }, + { + "epoch": 0.7833576642335767, + "grad_norm": 2.104427950912037, + "learning_rate": 1.1812843784502076e-06, + "loss": 0.4401, + "step": 8049 + }, + { + "epoch": 0.7834549878345499, + "grad_norm": 1.854203220250001, + "learning_rate": 1.180267128009821e-06, + "loss": 0.5456, + "step": 8050 + }, + { + "epoch": 0.7835523114355231, + "grad_norm": 1.445688064855444, + "learning_rate": 1.1792502571386383e-06, + "loss": 0.2863, + "step": 8051 + }, + { + "epoch": 0.7836496350364963, + "grad_norm": 1.5988757415547878, + "learning_rate": 1.1782337659377053e-06, + "loss": 0.3978, + "step": 8052 + }, + { + "epoch": 0.7837469586374696, + "grad_norm": 1.589488896689044, + "learning_rate": 1.1772176545080332e-06, + "loss": 0.3773, + "step": 8053 + }, + { + "epoch": 0.7838442822384428, + "grad_norm": 1.997668098948654, + "learning_rate": 1.1762019229505917e-06, + "loss": 0.437, + "step": 8054 + }, + { + "epoch": 0.7839416058394161, + "grad_norm": 1.5126813637944283, + "learning_rate": 1.1751865713663152e-06, + "loss": 0.4478, + "step": 8055 + }, + { + "epoch": 0.7840389294403893, + "grad_norm": 1.47157114055152, + "learning_rate": 1.1741715998561016e-06, + "loss": 0.249, + "step": 8056 + }, + { + "epoch": 0.7841362530413626, + "grad_norm": 1.798100857749952, + "learning_rate": 1.1731570085208056e-06, + "loss": 0.4803, + "step": 8057 + }, + { + "epoch": 0.7842335766423357, + "grad_norm": 1.6452713340833323, + "learning_rate": 1.1721427974612504e-06, + "loss": 0.3273, + "step": 8058 + }, + { + "epoch": 0.784330900243309, + "grad_norm": 1.6953792998342279, + "learning_rate": 1.1711289667782178e-06, + "loss": 0.3433, + "step": 8059 + }, + { + "epoch": 0.7844282238442822, + "grad_norm": 1.5901205585766462, + "learning_rate": 1.1701155165724537e-06, + "loss": 0.5484, + "step": 8060 + }, + { + "epoch": 0.7845255474452555, + "grad_norm": 1.728988393150012, + "learning_rate": 1.1691024469446644e-06, + "loss": 0.441, + "step": 8061 + }, + { + "epoch": 0.7846228710462287, + "grad_norm": 1.5448548197511236, + "learning_rate": 1.1680897579955196e-06, + "loss": 0.3675, + "step": 8062 + }, + { + "epoch": 0.784720194647202, + "grad_norm": 1.7557284738121903, + "learning_rate": 1.1670774498256526e-06, + "loss": 0.5959, + "step": 8063 + }, + { + "epoch": 0.7848175182481751, + "grad_norm": 1.424925246704001, + "learning_rate": 1.1660655225356533e-06, + "loss": 0.5063, + "step": 8064 + }, + { + "epoch": 0.7849148418491484, + "grad_norm": 1.8637938364196571, + "learning_rate": 1.1650539762260799e-06, + "loss": 0.3454, + "step": 8065 + }, + { + "epoch": 0.7850121654501216, + "grad_norm": 2.56697083276897, + "learning_rate": 1.16404281099745e-06, + "loss": 0.3414, + "step": 8066 + }, + { + "epoch": 0.7851094890510949, + "grad_norm": 1.4727330681069226, + "learning_rate": 1.1630320269502454e-06, + "loss": 0.3121, + "step": 8067 + }, + { + "epoch": 0.7852068126520682, + "grad_norm": 1.6992262566060843, + "learning_rate": 1.1620216241849047e-06, + "loss": 0.2941, + "step": 8068 + }, + { + "epoch": 0.7853041362530414, + "grad_norm": 1.42773821578442, + "learning_rate": 1.1610116028018348e-06, + "loss": 0.2779, + "step": 8069 + }, + { + "epoch": 0.7854014598540145, + "grad_norm": 1.4271058500792162, + "learning_rate": 1.1600019629014009e-06, + "loss": 0.3535, + "step": 8070 + }, + { + "epoch": 0.7854987834549878, + "grad_norm": 1.5506578679812335, + "learning_rate": 1.1589927045839327e-06, + "loss": 0.4337, + "step": 8071 + }, + { + "epoch": 0.7855961070559611, + "grad_norm": 1.3132205075405923, + "learning_rate": 1.1579838279497203e-06, + "loss": 0.3089, + "step": 8072 + }, + { + "epoch": 0.7856934306569343, + "grad_norm": 1.924474307524277, + "learning_rate": 1.1569753330990163e-06, + "loss": 0.3805, + "step": 8073 + }, + { + "epoch": 0.7857907542579076, + "grad_norm": 1.6737364629265001, + "learning_rate": 1.155967220132037e-06, + "loss": 0.4784, + "step": 8074 + }, + { + "epoch": 0.7858880778588808, + "grad_norm": 1.4003876030329363, + "learning_rate": 1.1549594891489563e-06, + "loss": 0.3638, + "step": 8075 + }, + { + "epoch": 0.785985401459854, + "grad_norm": 1.4921338136611924, + "learning_rate": 1.1539521402499142e-06, + "loss": 0.5409, + "step": 8076 + }, + { + "epoch": 0.7860827250608272, + "grad_norm": 1.4142941448516362, + "learning_rate": 1.1529451735350128e-06, + "loss": 0.4385, + "step": 8077 + }, + { + "epoch": 0.7861800486618005, + "grad_norm": 1.7445823733692551, + "learning_rate": 1.1519385891043145e-06, + "loss": 0.4873, + "step": 8078 + }, + { + "epoch": 0.7862773722627737, + "grad_norm": 1.72433383075366, + "learning_rate": 1.1509323870578432e-06, + "loss": 0.2599, + "step": 8079 + }, + { + "epoch": 0.786374695863747, + "grad_norm": 1.3783340779909445, + "learning_rate": 1.1499265674955862e-06, + "loss": 0.2781, + "step": 8080 + }, + { + "epoch": 0.7864720194647202, + "grad_norm": 2.1318735144707013, + "learning_rate": 1.1489211305174924e-06, + "loss": 0.4339, + "step": 8081 + }, + { + "epoch": 0.7865693430656935, + "grad_norm": 1.660578846719015, + "learning_rate": 1.1479160762234726e-06, + "loss": 0.3401, + "step": 8082 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 1.5094304076582512, + "learning_rate": 1.1469114047134006e-06, + "loss": 0.3415, + "step": 8083 + }, + { + "epoch": 0.7867639902676399, + "grad_norm": 1.462336151204388, + "learning_rate": 1.1459071160871104e-06, + "loss": 0.2441, + "step": 8084 + }, + { + "epoch": 0.7868613138686131, + "grad_norm": 1.323748614821101, + "learning_rate": 1.1449032104444003e-06, + "loss": 0.2074, + "step": 8085 + }, + { + "epoch": 0.7869586374695864, + "grad_norm": 1.4327784410461755, + "learning_rate": 1.1438996878850266e-06, + "loss": 0.2535, + "step": 8086 + }, + { + "epoch": 0.7870559610705596, + "grad_norm": 1.5938510488751345, + "learning_rate": 1.1428965485087107e-06, + "loss": 0.4825, + "step": 8087 + }, + { + "epoch": 0.7871532846715329, + "grad_norm": 1.7014714884873192, + "learning_rate": 1.1418937924151357e-06, + "loss": 0.5956, + "step": 8088 + }, + { + "epoch": 0.787250608272506, + "grad_norm": 1.531617256809351, + "learning_rate": 1.1408914197039473e-06, + "loss": 0.3294, + "step": 8089 + }, + { + "epoch": 0.7873479318734793, + "grad_norm": 1.4177784016952284, + "learning_rate": 1.1398894304747493e-06, + "loss": 0.3695, + "step": 8090 + }, + { + "epoch": 0.7874452554744525, + "grad_norm": 1.5225916152474699, + "learning_rate": 1.1388878248271096e-06, + "loss": 0.2239, + "step": 8091 + }, + { + "epoch": 0.7875425790754258, + "grad_norm": 1.9071689329410846, + "learning_rate": 1.1378866028605629e-06, + "loss": 0.3596, + "step": 8092 + }, + { + "epoch": 0.787639902676399, + "grad_norm": 1.8815894368791561, + "learning_rate": 1.1368857646745968e-06, + "loss": 0.3202, + "step": 8093 + }, + { + "epoch": 0.7877372262773723, + "grad_norm": 1.4326226794053734, + "learning_rate": 1.1358853103686667e-06, + "loss": 0.3899, + "step": 8094 + }, + { + "epoch": 0.7878345498783454, + "grad_norm": 1.4803351591122307, + "learning_rate": 1.1348852400421883e-06, + "loss": 0.3282, + "step": 8095 + }, + { + "epoch": 0.7879318734793187, + "grad_norm": 1.691389334860569, + "learning_rate": 1.133885553794541e-06, + "loss": 0.5184, + "step": 8096 + }, + { + "epoch": 0.788029197080292, + "grad_norm": 1.3730525183527786, + "learning_rate": 1.132886251725061e-06, + "loss": 0.3739, + "step": 8097 + }, + { + "epoch": 0.7881265206812652, + "grad_norm": 1.4606419228588312, + "learning_rate": 1.131887333933051e-06, + "loss": 0.2568, + "step": 8098 + }, + { + "epoch": 0.7882238442822385, + "grad_norm": 1.4570833662187797, + "learning_rate": 1.1308888005177759e-06, + "loss": 0.3517, + "step": 8099 + }, + { + "epoch": 0.7883211678832117, + "grad_norm": 1.7413803078116872, + "learning_rate": 1.1298906515784568e-06, + "loss": 0.4183, + "step": 8100 + }, + { + "epoch": 0.788418491484185, + "grad_norm": 1.6848865333862553, + "learning_rate": 1.1288928872142829e-06, + "loss": 0.4181, + "step": 8101 + }, + { + "epoch": 0.7885158150851581, + "grad_norm": 1.4667641220804344, + "learning_rate": 1.1278955075244008e-06, + "loss": 0.2386, + "step": 8102 + }, + { + "epoch": 0.7886131386861314, + "grad_norm": 2.031792905587045, + "learning_rate": 1.1268985126079245e-06, + "loss": 0.3987, + "step": 8103 + }, + { + "epoch": 0.7887104622871046, + "grad_norm": 1.6161173352568647, + "learning_rate": 1.1259019025639228e-06, + "loss": 0.484, + "step": 8104 + }, + { + "epoch": 0.7888077858880779, + "grad_norm": 1.8539425003616528, + "learning_rate": 1.1249056774914296e-06, + "loss": 0.5743, + "step": 8105 + }, + { + "epoch": 0.7889051094890511, + "grad_norm": 1.5083430646438427, + "learning_rate": 1.1239098374894414e-06, + "loss": 0.3759, + "step": 8106 + }, + { + "epoch": 0.7890024330900244, + "grad_norm": 1.5687963653642956, + "learning_rate": 1.1229143826569166e-06, + "loss": 0.3535, + "step": 8107 + }, + { + "epoch": 0.7890997566909975, + "grad_norm": 1.560113839617455, + "learning_rate": 1.1219193130927707e-06, + "loss": 0.2882, + "step": 8108 + }, + { + "epoch": 0.7891970802919708, + "grad_norm": 1.3713483872843002, + "learning_rate": 1.120924628895887e-06, + "loss": 0.3652, + "step": 8109 + }, + { + "epoch": 0.789294403892944, + "grad_norm": 1.6277104294738722, + "learning_rate": 1.1199303301651083e-06, + "loss": 0.5275, + "step": 8110 + }, + { + "epoch": 0.7893917274939173, + "grad_norm": 1.6774377705022236, + "learning_rate": 1.1189364169992363e-06, + "loss": 0.349, + "step": 8111 + }, + { + "epoch": 0.7894890510948905, + "grad_norm": 1.8630891137056542, + "learning_rate": 1.1179428894970361e-06, + "loss": 0.5011, + "step": 8112 + }, + { + "epoch": 0.7895863746958638, + "grad_norm": 1.7527898621113185, + "learning_rate": 1.1169497477572388e-06, + "loss": 0.3008, + "step": 8113 + }, + { + "epoch": 0.7896836982968369, + "grad_norm": 1.636414201393504, + "learning_rate": 1.1159569918785334e-06, + "loss": 0.3936, + "step": 8114 + }, + { + "epoch": 0.7897810218978102, + "grad_norm": 1.6025871162718357, + "learning_rate": 1.1149646219595672e-06, + "loss": 0.4747, + "step": 8115 + }, + { + "epoch": 0.7898783454987834, + "grad_norm": 1.6811719145952773, + "learning_rate": 1.1139726380989552e-06, + "loss": 0.2596, + "step": 8116 + }, + { + "epoch": 0.7899756690997567, + "grad_norm": 1.6378876875076136, + "learning_rate": 1.11298104039527e-06, + "loss": 0.2755, + "step": 8117 + }, + { + "epoch": 0.79007299270073, + "grad_norm": 1.6466465482113422, + "learning_rate": 1.1119898289470494e-06, + "loss": 0.4112, + "step": 8118 + }, + { + "epoch": 0.7901703163017032, + "grad_norm": 1.5731286754421239, + "learning_rate": 1.1109990038527878e-06, + "loss": 0.2964, + "step": 8119 + }, + { + "epoch": 0.7902676399026763, + "grad_norm": 1.9565324890145468, + "learning_rate": 1.110008565210946e-06, + "loss": 0.5184, + "step": 8120 + }, + { + "epoch": 0.7903649635036496, + "grad_norm": 1.7842800402384733, + "learning_rate": 1.1090185131199454e-06, + "loss": 0.3868, + "step": 8121 + }, + { + "epoch": 0.7904622871046229, + "grad_norm": 1.6739954825810923, + "learning_rate": 1.1080288476781637e-06, + "loss": 0.3217, + "step": 8122 + }, + { + "epoch": 0.7905596107055961, + "grad_norm": 1.5277138966392696, + "learning_rate": 1.1070395689839492e-06, + "loss": 0.434, + "step": 8123 + }, + { + "epoch": 0.7906569343065694, + "grad_norm": 1.5199479264825984, + "learning_rate": 1.1060506771356055e-06, + "loss": 0.2207, + "step": 8124 + }, + { + "epoch": 0.7907542579075426, + "grad_norm": 1.812888794338927, + "learning_rate": 1.105062172231401e-06, + "loss": 0.488, + "step": 8125 + }, + { + "epoch": 0.7908515815085159, + "grad_norm": 1.8500832874004898, + "learning_rate": 1.1040740543695605e-06, + "loss": 0.4116, + "step": 8126 + }, + { + "epoch": 0.790948905109489, + "grad_norm": 1.6507901582779927, + "learning_rate": 1.1030863236482764e-06, + "loss": 0.4842, + "step": 8127 + }, + { + "epoch": 0.7910462287104623, + "grad_norm": 1.7403623317681298, + "learning_rate": 1.1020989801657005e-06, + "loss": 0.4523, + "step": 8128 + }, + { + "epoch": 0.7911435523114355, + "grad_norm": 1.708810677556814, + "learning_rate": 1.1011120240199435e-06, + "loss": 0.3014, + "step": 8129 + }, + { + "epoch": 0.7912408759124088, + "grad_norm": 1.7319412338823943, + "learning_rate": 1.1001254553090812e-06, + "loss": 0.3733, + "step": 8130 + }, + { + "epoch": 0.791338199513382, + "grad_norm": 1.9280306850223352, + "learning_rate": 1.0991392741311497e-06, + "loss": 0.4706, + "step": 8131 + }, + { + "epoch": 0.7914355231143553, + "grad_norm": 1.8607344586278027, + "learning_rate": 1.098153480584146e-06, + "loss": 0.5408, + "step": 8132 + }, + { + "epoch": 0.7915328467153284, + "grad_norm": 1.6863664422576228, + "learning_rate": 1.0971680747660291e-06, + "loss": 0.5531, + "step": 8133 + }, + { + "epoch": 0.7916301703163017, + "grad_norm": 1.6916998416798412, + "learning_rate": 1.0961830567747196e-06, + "loss": 0.3833, + "step": 8134 + }, + { + "epoch": 0.7917274939172749, + "grad_norm": 2.6665719035318216, + "learning_rate": 1.095198426708099e-06, + "loss": 0.3072, + "step": 8135 + }, + { + "epoch": 0.7918248175182482, + "grad_norm": 1.1673143269422, + "learning_rate": 1.0942141846640125e-06, + "loss": 0.1837, + "step": 8136 + }, + { + "epoch": 0.7919221411192214, + "grad_norm": 1.7036328039912125, + "learning_rate": 1.0932303307402614e-06, + "loss": 0.2058, + "step": 8137 + }, + { + "epoch": 0.7920194647201947, + "grad_norm": 1.657360784055231, + "learning_rate": 1.0922468650346142e-06, + "loss": 0.2746, + "step": 8138 + }, + { + "epoch": 0.7921167883211678, + "grad_norm": 1.2673485405471254, + "learning_rate": 1.091263787644799e-06, + "loss": 0.2563, + "step": 8139 + }, + { + "epoch": 0.7922141119221411, + "grad_norm": 1.6958234869493818, + "learning_rate": 1.090281098668502e-06, + "loss": 0.4854, + "step": 8140 + }, + { + "epoch": 0.7923114355231143, + "grad_norm": 1.6999382512714944, + "learning_rate": 1.089298798203376e-06, + "loss": 0.5878, + "step": 8141 + }, + { + "epoch": 0.7924087591240876, + "grad_norm": 1.6345479403257106, + "learning_rate": 1.0883168863470316e-06, + "loss": 0.3624, + "step": 8142 + }, + { + "epoch": 0.7925060827250608, + "grad_norm": 1.5927050287558797, + "learning_rate": 1.0873353631970423e-06, + "loss": 0.3184, + "step": 8143 + }, + { + "epoch": 0.7926034063260341, + "grad_norm": 1.7371727312739687, + "learning_rate": 1.0863542288509438e-06, + "loss": 0.4657, + "step": 8144 + }, + { + "epoch": 0.7927007299270074, + "grad_norm": 1.6099616157505319, + "learning_rate": 1.08537348340623e-06, + "loss": 0.3688, + "step": 8145 + }, + { + "epoch": 0.7927980535279805, + "grad_norm": 1.3366305689483875, + "learning_rate": 1.0843931269603613e-06, + "loss": 0.2572, + "step": 8146 + }, + { + "epoch": 0.7928953771289537, + "grad_norm": 1.4232935274679048, + "learning_rate": 1.0834131596107534e-06, + "loss": 0.4628, + "step": 8147 + }, + { + "epoch": 0.792992700729927, + "grad_norm": 1.5947086966285289, + "learning_rate": 1.0824335814547866e-06, + "loss": 0.3318, + "step": 8148 + }, + { + "epoch": 0.7930900243309003, + "grad_norm": 1.3561583422087762, + "learning_rate": 1.0814543925898025e-06, + "loss": 0.3613, + "step": 8149 + }, + { + "epoch": 0.7931873479318735, + "grad_norm": 1.8499805114338195, + "learning_rate": 1.0804755931131055e-06, + "loss": 0.4864, + "step": 8150 + }, + { + "epoch": 0.7932846715328468, + "grad_norm": 1.400127902319207, + "learning_rate": 1.0794971831219565e-06, + "loss": 0.2617, + "step": 8151 + }, + { + "epoch": 0.7933819951338199, + "grad_norm": 1.6937680570758955, + "learning_rate": 1.0785191627135822e-06, + "loss": 0.3741, + "step": 8152 + }, + { + "epoch": 0.7934793187347932, + "grad_norm": 1.320409466495225, + "learning_rate": 1.077541531985169e-06, + "loss": 0.2644, + "step": 8153 + }, + { + "epoch": 0.7935766423357664, + "grad_norm": 1.700425694965205, + "learning_rate": 1.0765642910338647e-06, + "loss": 0.3815, + "step": 8154 + }, + { + "epoch": 0.7936739659367397, + "grad_norm": 1.9603989306089127, + "learning_rate": 1.075587439956779e-06, + "loss": 0.2854, + "step": 8155 + }, + { + "epoch": 0.7937712895377129, + "grad_norm": 1.2702919896195526, + "learning_rate": 1.0746109788509806e-06, + "loss": 0.2766, + "step": 8156 + }, + { + "epoch": 0.7938686131386862, + "grad_norm": 1.6889701373372161, + "learning_rate": 1.0736349078135039e-06, + "loss": 0.4129, + "step": 8157 + }, + { + "epoch": 0.7939659367396593, + "grad_norm": 1.7541547909271966, + "learning_rate": 1.0726592269413382e-06, + "loss": 0.2949, + "step": 8158 + }, + { + "epoch": 0.7940632603406326, + "grad_norm": 1.2955569267883664, + "learning_rate": 1.0716839363314391e-06, + "loss": 0.2792, + "step": 8159 + }, + { + "epoch": 0.7941605839416058, + "grad_norm": 1.4919835295752442, + "learning_rate": 1.0707090360807215e-06, + "loss": 0.3594, + "step": 8160 + }, + { + "epoch": 0.7942579075425791, + "grad_norm": 1.4529130842673215, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.4749, + "step": 8161 + }, + { + "epoch": 0.7943552311435523, + "grad_norm": 1.473726971084785, + "learning_rate": 1.0687604070443003e-06, + "loss": 0.4435, + "step": 8162 + }, + { + "epoch": 0.7944525547445256, + "grad_norm": 1.714117708942733, + "learning_rate": 1.0677866784522317e-06, + "loss": 0.4662, + "step": 8163 + }, + { + "epoch": 0.7945498783454988, + "grad_norm": 1.5871431625491488, + "learning_rate": 1.0668133406066177e-06, + "loss": 0.4161, + "step": 8164 + }, + { + "epoch": 0.794647201946472, + "grad_norm": 1.1931474313280932, + "learning_rate": 1.06584039360418e-06, + "loss": 0.2087, + "step": 8165 + }, + { + "epoch": 0.7947445255474452, + "grad_norm": 1.423888405359763, + "learning_rate": 1.0648678375416e-06, + "loss": 0.4545, + "step": 8166 + }, + { + "epoch": 0.7948418491484185, + "grad_norm": 1.591711872850261, + "learning_rate": 1.063895672515522e-06, + "loss": 0.2634, + "step": 8167 + }, + { + "epoch": 0.7949391727493917, + "grad_norm": 1.39323052617182, + "learning_rate": 1.0629238986225515e-06, + "loss": 0.2546, + "step": 8168 + }, + { + "epoch": 0.795036496350365, + "grad_norm": 1.7692399279890014, + "learning_rate": 1.0619525159592514e-06, + "loss": 0.5559, + "step": 8169 + }, + { + "epoch": 0.7951338199513382, + "grad_norm": 1.6614486869444285, + "learning_rate": 1.0609815246221512e-06, + "loss": 0.4697, + "step": 8170 + }, + { + "epoch": 0.7952311435523114, + "grad_norm": 1.5879031623073243, + "learning_rate": 1.0600109247077372e-06, + "loss": 0.3804, + "step": 8171 + }, + { + "epoch": 0.7953284671532846, + "grad_norm": 1.4318366235313287, + "learning_rate": 1.0590407163124612e-06, + "loss": 0.3151, + "step": 8172 + }, + { + "epoch": 0.7954257907542579, + "grad_norm": 1.5169564916900762, + "learning_rate": 1.0580708995327298e-06, + "loss": 0.4243, + "step": 8173 + }, + { + "epoch": 0.7955231143552312, + "grad_norm": 1.6108001028791707, + "learning_rate": 1.0571014744649161e-06, + "loss": 0.509, + "step": 8174 + }, + { + "epoch": 0.7956204379562044, + "grad_norm": 1.6457370858680287, + "learning_rate": 1.0561324412053525e-06, + "loss": 0.3624, + "step": 8175 + }, + { + "epoch": 0.7957177615571777, + "grad_norm": 2.111362227624098, + "learning_rate": 1.0551637998503323e-06, + "loss": 0.3964, + "step": 8176 + }, + { + "epoch": 0.7958150851581508, + "grad_norm": 1.1962588684008413, + "learning_rate": 1.05419555049611e-06, + "loss": 0.1947, + "step": 8177 + }, + { + "epoch": 0.7959124087591241, + "grad_norm": 1.3787680062218877, + "learning_rate": 1.0532276932389013e-06, + "loss": 0.3013, + "step": 8178 + }, + { + "epoch": 0.7960097323600973, + "grad_norm": 1.6055723282195695, + "learning_rate": 1.052260228174884e-06, + "loss": 0.3849, + "step": 8179 + }, + { + "epoch": 0.7961070559610706, + "grad_norm": 1.6128518777795382, + "learning_rate": 1.0512931554001926e-06, + "loss": 0.3578, + "step": 8180 + }, + { + "epoch": 0.7962043795620438, + "grad_norm": 1.5194149806396982, + "learning_rate": 1.0503264750109277e-06, + "loss": 0.4129, + "step": 8181 + }, + { + "epoch": 0.7963017031630171, + "grad_norm": 1.5154284320270044, + "learning_rate": 1.0493601871031495e-06, + "loss": 0.3976, + "step": 8182 + }, + { + "epoch": 0.7963990267639902, + "grad_norm": 1.259451988271287, + "learning_rate": 1.0483942917728768e-06, + "loss": 0.2639, + "step": 8183 + }, + { + "epoch": 0.7964963503649635, + "grad_norm": 1.510202440876656, + "learning_rate": 1.0474287891160923e-06, + "loss": 0.4174, + "step": 8184 + }, + { + "epoch": 0.7965936739659367, + "grad_norm": 1.6930441029409642, + "learning_rate": 1.046463679228738e-06, + "loss": 0.3078, + "step": 8185 + }, + { + "epoch": 0.79669099756691, + "grad_norm": 1.5897093769995547, + "learning_rate": 1.0454989622067186e-06, + "loss": 0.3232, + "step": 8186 + }, + { + "epoch": 0.7967883211678832, + "grad_norm": 1.3892923034136269, + "learning_rate": 1.044534638145897e-06, + "loss": 0.3094, + "step": 8187 + }, + { + "epoch": 0.7968856447688565, + "grad_norm": 1.4715448126597754, + "learning_rate": 1.0435707071421004e-06, + "loss": 0.3631, + "step": 8188 + }, + { + "epoch": 0.7969829683698297, + "grad_norm": 1.585519594651868, + "learning_rate": 1.0426071692911145e-06, + "loss": 0.4309, + "step": 8189 + }, + { + "epoch": 0.7970802919708029, + "grad_norm": 1.5519762791659382, + "learning_rate": 1.0416440246886877e-06, + "loss": 0.302, + "step": 8190 + }, + { + "epoch": 0.7971776155717761, + "grad_norm": 1.376972344446985, + "learning_rate": 1.0406812734305254e-06, + "loss": 0.2456, + "step": 8191 + }, + { + "epoch": 0.7972749391727494, + "grad_norm": 2.067153074593478, + "learning_rate": 1.0397189156122994e-06, + "loss": 0.2285, + "step": 8192 + }, + { + "epoch": 0.7973722627737226, + "grad_norm": 1.8088148076719655, + "learning_rate": 1.0387569513296397e-06, + "loss": 0.2742, + "step": 8193 + }, + { + "epoch": 0.7974695863746959, + "grad_norm": 1.3801686503145756, + "learning_rate": 1.0377953806781354e-06, + "loss": 0.2925, + "step": 8194 + }, + { + "epoch": 0.7975669099756691, + "grad_norm": 1.3646340203032081, + "learning_rate": 1.036834203753339e-06, + "loss": 0.3091, + "step": 8195 + }, + { + "epoch": 0.7976642335766423, + "grad_norm": 1.5013809205458493, + "learning_rate": 1.0358734206507642e-06, + "loss": 0.4133, + "step": 8196 + }, + { + "epoch": 0.7977615571776155, + "grad_norm": 1.5449692517526308, + "learning_rate": 1.0349130314658846e-06, + "loss": 0.3511, + "step": 8197 + }, + { + "epoch": 0.7978588807785888, + "grad_norm": 1.4533160504500484, + "learning_rate": 1.0339530362941335e-06, + "loss": 0.266, + "step": 8198 + }, + { + "epoch": 0.797956204379562, + "grad_norm": 1.3227990566249144, + "learning_rate": 1.0329934352309068e-06, + "loss": 0.3033, + "step": 8199 + }, + { + "epoch": 0.7980535279805353, + "grad_norm": 1.7365953821182578, + "learning_rate": 1.032034228371563e-06, + "loss": 0.2775, + "step": 8200 + }, + { + "epoch": 0.7981508515815086, + "grad_norm": 1.609549355552729, + "learning_rate": 1.0310754158114144e-06, + "loss": 0.4429, + "step": 8201 + }, + { + "epoch": 0.7982481751824817, + "grad_norm": 1.3330713948606716, + "learning_rate": 1.030116997645742e-06, + "loss": 0.4156, + "step": 8202 + }, + { + "epoch": 0.798345498783455, + "grad_norm": 1.5762338921542953, + "learning_rate": 1.0291589739697832e-06, + "loss": 0.4871, + "step": 8203 + }, + { + "epoch": 0.7984428223844282, + "grad_norm": 1.5038738296779455, + "learning_rate": 1.0282013448787393e-06, + "loss": 0.4045, + "step": 8204 + }, + { + "epoch": 0.7985401459854015, + "grad_norm": 1.2830623276572097, + "learning_rate": 1.0272441104677678e-06, + "loss": 0.2843, + "step": 8205 + }, + { + "epoch": 0.7986374695863747, + "grad_norm": 1.9681750472226829, + "learning_rate": 1.0262872708319905e-06, + "loss": 0.2584, + "step": 8206 + }, + { + "epoch": 0.798734793187348, + "grad_norm": 1.4267265634946316, + "learning_rate": 1.0253308260664901e-06, + "loss": 0.4684, + "step": 8207 + }, + { + "epoch": 0.7988321167883212, + "grad_norm": 1.549583606779781, + "learning_rate": 1.024374776266308e-06, + "loss": 0.2316, + "step": 8208 + }, + { + "epoch": 0.7989294403892944, + "grad_norm": 1.5597927290818268, + "learning_rate": 1.0234191215264484e-06, + "loss": 0.247, + "step": 8209 + }, + { + "epoch": 0.7990267639902676, + "grad_norm": 1.6685581282703315, + "learning_rate": 1.0224638619418748e-06, + "loss": 0.4695, + "step": 8210 + }, + { + "epoch": 0.7991240875912409, + "grad_norm": 1.5629133059327291, + "learning_rate": 1.0215089976075132e-06, + "loss": 0.3017, + "step": 8211 + }, + { + "epoch": 0.7992214111922141, + "grad_norm": 1.5154428274256342, + "learning_rate": 1.0205545286182466e-06, + "loss": 0.2563, + "step": 8212 + }, + { + "epoch": 0.7993187347931874, + "grad_norm": 1.4819856386299661, + "learning_rate": 1.0196004550689227e-06, + "loss": 0.3738, + "step": 8213 + }, + { + "epoch": 0.7994160583941606, + "grad_norm": 1.5641646863423193, + "learning_rate": 1.0186467770543478e-06, + "loss": 0.4329, + "step": 8214 + }, + { + "epoch": 0.7995133819951338, + "grad_norm": 1.580694844492842, + "learning_rate": 1.0176934946692913e-06, + "loss": 0.4777, + "step": 8215 + }, + { + "epoch": 0.799610705596107, + "grad_norm": 1.6695155670043478, + "learning_rate": 1.0167406080084784e-06, + "loss": 0.3178, + "step": 8216 + }, + { + "epoch": 0.7997080291970803, + "grad_norm": 1.57338069445776, + "learning_rate": 1.0157881171666e-06, + "loss": 0.5051, + "step": 8217 + }, + { + "epoch": 0.7998053527980535, + "grad_norm": 1.4387825758864465, + "learning_rate": 1.0148360222383047e-06, + "loss": 0.3131, + "step": 8218 + }, + { + "epoch": 0.7999026763990268, + "grad_norm": 1.604229005205104, + "learning_rate": 1.013884323318204e-06, + "loss": 0.2967, + "step": 8219 + }, + { + "epoch": 0.8, + "grad_norm": 1.8001198702211243, + "learning_rate": 1.0129330205008675e-06, + "loss": 0.3773, + "step": 8220 + }, + { + "epoch": 0.8000973236009732, + "grad_norm": 1.6782403433885749, + "learning_rate": 1.0119821138808277e-06, + "loss": 0.3416, + "step": 8221 + }, + { + "epoch": 0.8001946472019464, + "grad_norm": 1.6042441642504093, + "learning_rate": 1.011031603552578e-06, + "loss": 0.3556, + "step": 8222 + }, + { + "epoch": 0.8002919708029197, + "grad_norm": 1.4125334339067153, + "learning_rate": 1.0100814896105682e-06, + "loss": 0.3481, + "step": 8223 + }, + { + "epoch": 0.800389294403893, + "grad_norm": 1.9527923300671006, + "learning_rate": 1.0091317721492134e-06, + "loss": 0.3131, + "step": 8224 + }, + { + "epoch": 0.8004866180048662, + "grad_norm": 1.5597185937452964, + "learning_rate": 1.0081824512628874e-06, + "loss": 0.3796, + "step": 8225 + }, + { + "epoch": 0.8005839416058395, + "grad_norm": 1.5596877157203128, + "learning_rate": 1.0072335270459255e-06, + "loss": 0.3579, + "step": 8226 + }, + { + "epoch": 0.8006812652068126, + "grad_norm": 1.252931685531581, + "learning_rate": 1.0062849995926216e-06, + "loss": 0.2449, + "step": 8227 + }, + { + "epoch": 0.8007785888077859, + "grad_norm": 1.451072473831098, + "learning_rate": 1.0053368689972314e-06, + "loss": 0.2742, + "step": 8228 + }, + { + "epoch": 0.8008759124087591, + "grad_norm": 1.821066115082511, + "learning_rate": 1.004389135353972e-06, + "loss": 0.5976, + "step": 8229 + }, + { + "epoch": 0.8009732360097324, + "grad_norm": 1.6126775096803707, + "learning_rate": 1.0034417987570205e-06, + "loss": 0.3192, + "step": 8230 + }, + { + "epoch": 0.8010705596107056, + "grad_norm": 1.582303247499391, + "learning_rate": 1.0024948593005135e-06, + "loss": 0.4613, + "step": 8231 + }, + { + "epoch": 0.8011678832116789, + "grad_norm": 1.5766380027332203, + "learning_rate": 1.0015483170785495e-06, + "loss": 0.2179, + "step": 8232 + }, + { + "epoch": 0.8012652068126521, + "grad_norm": 2.1157431091968064, + "learning_rate": 1.0006021721851883e-06, + "loss": 0.4579, + "step": 8233 + }, + { + "epoch": 0.8013625304136253, + "grad_norm": 1.7800036972521842, + "learning_rate": 9.996564247144459e-07, + "loss": 0.4678, + "step": 8234 + }, + { + "epoch": 0.8014598540145985, + "grad_norm": 1.4872921838566666, + "learning_rate": 9.987110747603036e-07, + "loss": 0.3514, + "step": 8235 + }, + { + "epoch": 0.8015571776155718, + "grad_norm": 1.5176431649310755, + "learning_rate": 9.977661224167012e-07, + "loss": 0.4443, + "step": 8236 + }, + { + "epoch": 0.801654501216545, + "grad_norm": 1.2798814209759193, + "learning_rate": 9.968215677775406e-07, + "loss": 0.1943, + "step": 8237 + }, + { + "epoch": 0.8017518248175183, + "grad_norm": 1.6074570273058655, + "learning_rate": 9.958774109366798e-07, + "loss": 0.4818, + "step": 8238 + }, + { + "epoch": 0.8018491484184915, + "grad_norm": 1.502536774374111, + "learning_rate": 9.949336519879422e-07, + "loss": 0.3373, + "step": 8239 + }, + { + "epoch": 0.8019464720194647, + "grad_norm": 1.7217055831060901, + "learning_rate": 9.939902910251088e-07, + "loss": 0.3612, + "step": 8240 + }, + { + "epoch": 0.8020437956204379, + "grad_norm": 1.8488430220057979, + "learning_rate": 9.93047328141923e-07, + "loss": 0.5847, + "step": 8241 + }, + { + "epoch": 0.8021411192214112, + "grad_norm": 1.8692800376185765, + "learning_rate": 9.921047634320863e-07, + "loss": 0.2968, + "step": 8242 + }, + { + "epoch": 0.8022384428223844, + "grad_norm": 1.688344602795139, + "learning_rate": 9.911625969892635e-07, + "loss": 0.3639, + "step": 8243 + }, + { + "epoch": 0.8023357664233577, + "grad_norm": 1.6818753919313925, + "learning_rate": 9.90220828907078e-07, + "loss": 0.3191, + "step": 8244 + }, + { + "epoch": 0.8024330900243309, + "grad_norm": 1.69347785086461, + "learning_rate": 9.892794592791127e-07, + "loss": 0.4134, + "step": 8245 + }, + { + "epoch": 0.8025304136253041, + "grad_norm": 1.4764567342349137, + "learning_rate": 9.883384881989121e-07, + "loss": 0.3453, + "step": 8246 + }, + { + "epoch": 0.8026277372262773, + "grad_norm": 1.3165783868641983, + "learning_rate": 9.873979157599838e-07, + "loss": 0.2588, + "step": 8247 + }, + { + "epoch": 0.8027250608272506, + "grad_norm": 1.4324842936135758, + "learning_rate": 9.864577420557892e-07, + "loss": 0.2305, + "step": 8248 + }, + { + "epoch": 0.8028223844282238, + "grad_norm": 1.5835452523542426, + "learning_rate": 9.855179671797554e-07, + "loss": 0.3829, + "step": 8249 + }, + { + "epoch": 0.8029197080291971, + "grad_norm": 1.6555715646901366, + "learning_rate": 9.84578591225268e-07, + "loss": 0.5805, + "step": 8250 + }, + { + "epoch": 0.8030170316301704, + "grad_norm": 1.5645234548164424, + "learning_rate": 9.836396142856764e-07, + "loss": 0.2633, + "step": 8251 + }, + { + "epoch": 0.8031143552311436, + "grad_norm": 1.7656166279011962, + "learning_rate": 9.827010364542838e-07, + "loss": 0.3386, + "step": 8252 + }, + { + "epoch": 0.8032116788321167, + "grad_norm": 1.4839455234214443, + "learning_rate": 9.817628578243582e-07, + "loss": 0.2795, + "step": 8253 + }, + { + "epoch": 0.80330900243309, + "grad_norm": 1.5137620468353297, + "learning_rate": 9.808250784891271e-07, + "loss": 0.4058, + "step": 8254 + }, + { + "epoch": 0.8034063260340633, + "grad_norm": 1.36249237408385, + "learning_rate": 9.798876985417798e-07, + "loss": 0.2194, + "step": 8255 + }, + { + "epoch": 0.8035036496350365, + "grad_norm": 1.4960809214069362, + "learning_rate": 9.78950718075462e-07, + "loss": 0.2844, + "step": 8256 + }, + { + "epoch": 0.8036009732360098, + "grad_norm": 1.6063405289724626, + "learning_rate": 9.780141371832824e-07, + "loss": 0.3979, + "step": 8257 + }, + { + "epoch": 0.803698296836983, + "grad_norm": 1.7217897757539593, + "learning_rate": 9.770779559583116e-07, + "loss": 0.3784, + "step": 8258 + }, + { + "epoch": 0.8037956204379562, + "grad_norm": 1.9621643848518788, + "learning_rate": 9.761421744935756e-07, + "loss": 0.6162, + "step": 8259 + }, + { + "epoch": 0.8038929440389294, + "grad_norm": 1.5151604009136546, + "learning_rate": 9.752067928820635e-07, + "loss": 0.4282, + "step": 8260 + }, + { + "epoch": 0.8039902676399027, + "grad_norm": 1.602360893912926, + "learning_rate": 9.742718112167276e-07, + "loss": 0.2247, + "step": 8261 + }, + { + "epoch": 0.8040875912408759, + "grad_norm": 1.433521903002085, + "learning_rate": 9.733372295904776e-07, + "loss": 0.3294, + "step": 8262 + }, + { + "epoch": 0.8041849148418492, + "grad_norm": 1.608867380028881, + "learning_rate": 9.72403048096181e-07, + "loss": 0.434, + "step": 8263 + }, + { + "epoch": 0.8042822384428224, + "grad_norm": 1.4957781027937402, + "learning_rate": 9.71469266826669e-07, + "loss": 0.351, + "step": 8264 + }, + { + "epoch": 0.8043795620437956, + "grad_norm": 1.7044140898051645, + "learning_rate": 9.705358858747332e-07, + "loss": 0.3292, + "step": 8265 + }, + { + "epoch": 0.8044768856447688, + "grad_norm": 1.5885406327374538, + "learning_rate": 9.696029053331218e-07, + "loss": 0.4515, + "step": 8266 + }, + { + "epoch": 0.8045742092457421, + "grad_norm": 1.8262004013689415, + "learning_rate": 9.686703252945472e-07, + "loss": 0.335, + "step": 8267 + }, + { + "epoch": 0.8046715328467153, + "grad_norm": 1.6558453047456732, + "learning_rate": 9.677381458516798e-07, + "loss": 0.3089, + "step": 8268 + }, + { + "epoch": 0.8047688564476886, + "grad_norm": 1.3953393470086575, + "learning_rate": 9.66806367097153e-07, + "loss": 0.3779, + "step": 8269 + }, + { + "epoch": 0.8048661800486618, + "grad_norm": 1.8041688908775428, + "learning_rate": 9.658749891235537e-07, + "loss": 0.3807, + "step": 8270 + }, + { + "epoch": 0.804963503649635, + "grad_norm": 1.5677388675119859, + "learning_rate": 9.649440120234377e-07, + "loss": 0.3043, + "step": 8271 + }, + { + "epoch": 0.8050608272506082, + "grad_norm": 1.5539803741865383, + "learning_rate": 9.640134358893155e-07, + "loss": 0.1994, + "step": 8272 + }, + { + "epoch": 0.8051581508515815, + "grad_norm": 1.3243975223284201, + "learning_rate": 9.630832608136598e-07, + "loss": 0.3162, + "step": 8273 + }, + { + "epoch": 0.8052554744525547, + "grad_norm": 1.4482095911495203, + "learning_rate": 9.62153486888901e-07, + "loss": 0.4924, + "step": 8274 + }, + { + "epoch": 0.805352798053528, + "grad_norm": 1.888149942828695, + "learning_rate": 9.612241142074314e-07, + "loss": 0.5125, + "step": 8275 + }, + { + "epoch": 0.8054501216545012, + "grad_norm": 1.5591936606118286, + "learning_rate": 9.602951428616058e-07, + "loss": 0.3339, + "step": 8276 + }, + { + "epoch": 0.8055474452554745, + "grad_norm": 1.4771858796687205, + "learning_rate": 9.593665729437336e-07, + "loss": 0.396, + "step": 8277 + }, + { + "epoch": 0.8056447688564476, + "grad_norm": 1.537266646874025, + "learning_rate": 9.584384045460883e-07, + "loss": 0.4773, + "step": 8278 + }, + { + "epoch": 0.8057420924574209, + "grad_norm": 1.4768676442509172, + "learning_rate": 9.575106377609027e-07, + "loss": 0.4213, + "step": 8279 + }, + { + "epoch": 0.8058394160583942, + "grad_norm": 1.4428429202251531, + "learning_rate": 9.565832726803713e-07, + "loss": 0.3364, + "step": 8280 + }, + { + "epoch": 0.8059367396593674, + "grad_norm": 1.4531734827628975, + "learning_rate": 9.556563093966431e-07, + "loss": 0.2643, + "step": 8281 + }, + { + "epoch": 0.8060340632603407, + "grad_norm": 1.5705418964810505, + "learning_rate": 9.54729748001834e-07, + "loss": 0.4397, + "step": 8282 + }, + { + "epoch": 0.8061313868613139, + "grad_norm": 1.7877918378639155, + "learning_rate": 9.53803588588018e-07, + "loss": 0.3813, + "step": 8283 + }, + { + "epoch": 0.8062287104622871, + "grad_norm": 1.2936596825075395, + "learning_rate": 9.528778312472254e-07, + "loss": 0.2387, + "step": 8284 + }, + { + "epoch": 0.8063260340632603, + "grad_norm": 1.409255651592936, + "learning_rate": 9.519524760714494e-07, + "loss": 0.3517, + "step": 8285 + }, + { + "epoch": 0.8064233576642336, + "grad_norm": 1.387983148670416, + "learning_rate": 9.510275231526444e-07, + "loss": 0.2809, + "step": 8286 + }, + { + "epoch": 0.8065206812652068, + "grad_norm": 1.0833143208131393, + "learning_rate": 9.501029725827244e-07, + "loss": 0.1931, + "step": 8287 + }, + { + "epoch": 0.8066180048661801, + "grad_norm": 1.5679635466454598, + "learning_rate": 9.491788244535599e-07, + "loss": 0.4704, + "step": 8288 + }, + { + "epoch": 0.8067153284671533, + "grad_norm": 1.4063430331503632, + "learning_rate": 9.482550788569856e-07, + "loss": 0.2978, + "step": 8289 + }, + { + "epoch": 0.8068126520681265, + "grad_norm": 1.383665134477483, + "learning_rate": 9.473317358847945e-07, + "loss": 0.2737, + "step": 8290 + }, + { + "epoch": 0.8069099756690997, + "grad_norm": 1.4923275387546178, + "learning_rate": 9.464087956287398e-07, + "loss": 0.2837, + "step": 8291 + }, + { + "epoch": 0.807007299270073, + "grad_norm": 1.796812902289763, + "learning_rate": 9.454862581805346e-07, + "loss": 0.4449, + "step": 8292 + }, + { + "epoch": 0.8071046228710462, + "grad_norm": 1.6135556631874668, + "learning_rate": 9.44564123631852e-07, + "loss": 0.403, + "step": 8293 + }, + { + "epoch": 0.8072019464720195, + "grad_norm": 1.6375504522340862, + "learning_rate": 9.436423920743265e-07, + "loss": 0.4262, + "step": 8294 + }, + { + "epoch": 0.8072992700729927, + "grad_norm": 1.1911125446444595, + "learning_rate": 9.427210635995482e-07, + "loss": 0.2432, + "step": 8295 + }, + { + "epoch": 0.807396593673966, + "grad_norm": 1.48494211707029, + "learning_rate": 9.418001382990716e-07, + "loss": 0.235, + "step": 8296 + }, + { + "epoch": 0.8074939172749391, + "grad_norm": 1.629008191621439, + "learning_rate": 9.408796162644102e-07, + "loss": 0.382, + "step": 8297 + }, + { + "epoch": 0.8075912408759124, + "grad_norm": 1.477019304933211, + "learning_rate": 9.399594975870369e-07, + "loss": 0.2833, + "step": 8298 + }, + { + "epoch": 0.8076885644768856, + "grad_norm": 1.746488590529748, + "learning_rate": 9.390397823583824e-07, + "loss": 0.5771, + "step": 8299 + }, + { + "epoch": 0.8077858880778589, + "grad_norm": 1.6890774171295788, + "learning_rate": 9.381204706698416e-07, + "loss": 0.3533, + "step": 8300 + }, + { + "epoch": 0.8078832116788321, + "grad_norm": 1.6920746854617321, + "learning_rate": 9.372015626127656e-07, + "loss": 0.5509, + "step": 8301 + }, + { + "epoch": 0.8079805352798054, + "grad_norm": 1.5781941918038658, + "learning_rate": 9.362830582784677e-07, + "loss": 0.4232, + "step": 8302 + }, + { + "epoch": 0.8080778588807785, + "grad_norm": 1.5243657340893848, + "learning_rate": 9.353649577582202e-07, + "loss": 0.4098, + "step": 8303 + }, + { + "epoch": 0.8081751824817518, + "grad_norm": 1.641081247014223, + "learning_rate": 9.344472611432548e-07, + "loss": 0.4315, + "step": 8304 + }, + { + "epoch": 0.808272506082725, + "grad_norm": 1.7209194017119804, + "learning_rate": 9.335299685247656e-07, + "loss": 0.5302, + "step": 8305 + }, + { + "epoch": 0.8083698296836983, + "grad_norm": 1.9962277600716223, + "learning_rate": 9.326130799939015e-07, + "loss": 0.475, + "step": 8306 + }, + { + "epoch": 0.8084671532846716, + "grad_norm": 1.3609382147628362, + "learning_rate": 9.316965956417756e-07, + "loss": 0.2634, + "step": 8307 + }, + { + "epoch": 0.8085644768856448, + "grad_norm": 1.5461822654055422, + "learning_rate": 9.307805155594595e-07, + "loss": 0.3117, + "step": 8308 + }, + { + "epoch": 0.808661800486618, + "grad_norm": 1.7206585084844186, + "learning_rate": 9.298648398379861e-07, + "loss": 0.2867, + "step": 8309 + }, + { + "epoch": 0.8087591240875912, + "grad_norm": 1.5974208966222947, + "learning_rate": 9.289495685683442e-07, + "loss": 0.351, + "step": 8310 + }, + { + "epoch": 0.8088564476885645, + "grad_norm": 1.6170189073928123, + "learning_rate": 9.280347018414859e-07, + "loss": 0.4442, + "step": 8311 + }, + { + "epoch": 0.8089537712895377, + "grad_norm": 1.459811097688063, + "learning_rate": 9.271202397483214e-07, + "loss": 0.387, + "step": 8312 + }, + { + "epoch": 0.809051094890511, + "grad_norm": 1.5482604632905668, + "learning_rate": 9.262061823797231e-07, + "loss": 0.396, + "step": 8313 + }, + { + "epoch": 0.8091484184914842, + "grad_norm": 1.4759384531974744, + "learning_rate": 9.252925298265198e-07, + "loss": 0.343, + "step": 8314 + }, + { + "epoch": 0.8092457420924575, + "grad_norm": 1.5333593981365752, + "learning_rate": 9.243792821795023e-07, + "loss": 0.5209, + "step": 8315 + }, + { + "epoch": 0.8093430656934306, + "grad_norm": 1.891238404696684, + "learning_rate": 9.234664395294218e-07, + "loss": 0.2001, + "step": 8316 + }, + { + "epoch": 0.8094403892944039, + "grad_norm": 1.5720106972580374, + "learning_rate": 9.22554001966986e-07, + "loss": 0.4175, + "step": 8317 + }, + { + "epoch": 0.8095377128953771, + "grad_norm": 1.7147237609733332, + "learning_rate": 9.216419695828648e-07, + "loss": 0.4365, + "step": 8318 + }, + { + "epoch": 0.8096350364963504, + "grad_norm": 1.4043443271222635, + "learning_rate": 9.207303424676894e-07, + "loss": 0.3127, + "step": 8319 + }, + { + "epoch": 0.8097323600973236, + "grad_norm": 1.657912373747744, + "learning_rate": 9.19819120712046e-07, + "loss": 0.5488, + "step": 8320 + }, + { + "epoch": 0.8098296836982969, + "grad_norm": 1.6114382064670425, + "learning_rate": 9.18908304406484e-07, + "loss": 0.3294, + "step": 8321 + }, + { + "epoch": 0.80992700729927, + "grad_norm": 1.5529729695814052, + "learning_rate": 9.179978936415129e-07, + "loss": 0.2851, + "step": 8322 + }, + { + "epoch": 0.8100243309002433, + "grad_norm": 1.648270946603919, + "learning_rate": 9.170878885075995e-07, + "loss": 0.5698, + "step": 8323 + }, + { + "epoch": 0.8101216545012165, + "grad_norm": 1.8091124085464096, + "learning_rate": 9.161782890951725e-07, + "loss": 0.3533, + "step": 8324 + }, + { + "epoch": 0.8102189781021898, + "grad_norm": 1.89230662820606, + "learning_rate": 9.152690954946186e-07, + "loss": 0.4133, + "step": 8325 + }, + { + "epoch": 0.810316301703163, + "grad_norm": 1.4759758485249943, + "learning_rate": 9.143603077962859e-07, + "loss": 0.281, + "step": 8326 + }, + { + "epoch": 0.8104136253041363, + "grad_norm": 1.5177581562494784, + "learning_rate": 9.134519260904818e-07, + "loss": 0.4806, + "step": 8327 + }, + { + "epoch": 0.8105109489051094, + "grad_norm": 1.7270519546707563, + "learning_rate": 9.125439504674699e-07, + "loss": 0.3543, + "step": 8328 + }, + { + "epoch": 0.8106082725060827, + "grad_norm": 1.548495282154576, + "learning_rate": 9.116363810174783e-07, + "loss": 0.3902, + "step": 8329 + }, + { + "epoch": 0.810705596107056, + "grad_norm": 2.0607250552404848, + "learning_rate": 9.107292178306942e-07, + "loss": 0.4069, + "step": 8330 + }, + { + "epoch": 0.8108029197080292, + "grad_norm": 1.5590622276723398, + "learning_rate": 9.098224609972594e-07, + "loss": 0.5971, + "step": 8331 + }, + { + "epoch": 0.8109002433090025, + "grad_norm": 1.746144816699414, + "learning_rate": 9.089161106072803e-07, + "loss": 0.5274, + "step": 8332 + }, + { + "epoch": 0.8109975669099757, + "grad_norm": 1.761208200168515, + "learning_rate": 9.080101667508223e-07, + "loss": 0.3213, + "step": 8333 + }, + { + "epoch": 0.8110948905109489, + "grad_norm": 1.381812299654273, + "learning_rate": 9.071046295179092e-07, + "loss": 0.23, + "step": 8334 + }, + { + "epoch": 0.8111922141119221, + "grad_norm": 1.410489790507963, + "learning_rate": 9.061994989985251e-07, + "loss": 0.359, + "step": 8335 + }, + { + "epoch": 0.8112895377128954, + "grad_norm": 2.2725248307318284, + "learning_rate": 9.052947752826125e-07, + "loss": 0.2891, + "step": 8336 + }, + { + "epoch": 0.8113868613138686, + "grad_norm": 1.70834397878664, + "learning_rate": 9.04390458460076e-07, + "loss": 0.5257, + "step": 8337 + }, + { + "epoch": 0.8114841849148419, + "grad_norm": 1.2522604530889752, + "learning_rate": 9.034865486207761e-07, + "loss": 0.2466, + "step": 8338 + }, + { + "epoch": 0.8115815085158151, + "grad_norm": 1.467849374153229, + "learning_rate": 9.025830458545359e-07, + "loss": 0.3352, + "step": 8339 + }, + { + "epoch": 0.8116788321167884, + "grad_norm": 1.4656253611354748, + "learning_rate": 9.016799502511364e-07, + "loss": 0.2479, + "step": 8340 + }, + { + "epoch": 0.8117761557177615, + "grad_norm": 1.6658091087291045, + "learning_rate": 9.007772619003213e-07, + "loss": 0.4877, + "step": 8341 + }, + { + "epoch": 0.8118734793187348, + "grad_norm": 1.6608863571489547, + "learning_rate": 8.998749808917872e-07, + "loss": 0.3742, + "step": 8342 + }, + { + "epoch": 0.811970802919708, + "grad_norm": 1.593480629689817, + "learning_rate": 8.989731073151969e-07, + "loss": 0.4384, + "step": 8343 + }, + { + "epoch": 0.8120681265206813, + "grad_norm": 1.3903007986648528, + "learning_rate": 8.980716412601692e-07, + "loss": 0.2575, + "step": 8344 + }, + { + "epoch": 0.8121654501216545, + "grad_norm": 1.774313957989085, + "learning_rate": 8.971705828162841e-07, + "loss": 0.4356, + "step": 8345 + }, + { + "epoch": 0.8122627737226278, + "grad_norm": 1.523478033064548, + "learning_rate": 8.9626993207308e-07, + "loss": 0.283, + "step": 8346 + }, + { + "epoch": 0.8123600973236009, + "grad_norm": 1.8519873795542776, + "learning_rate": 8.953696891200553e-07, + "loss": 0.3915, + "step": 8347 + }, + { + "epoch": 0.8124574209245742, + "grad_norm": 1.4776877500523116, + "learning_rate": 8.944698540466684e-07, + "loss": 0.3718, + "step": 8348 + }, + { + "epoch": 0.8125547445255474, + "grad_norm": 1.5598554738660808, + "learning_rate": 8.93570426942334e-07, + "loss": 0.2678, + "step": 8349 + }, + { + "epoch": 0.8126520681265207, + "grad_norm": 1.548039085027831, + "learning_rate": 8.92671407896431e-07, + "loss": 0.2148, + "step": 8350 + }, + { + "epoch": 0.8127493917274939, + "grad_norm": 1.7733260468402587, + "learning_rate": 8.917727969982943e-07, + "loss": 0.2499, + "step": 8351 + }, + { + "epoch": 0.8128467153284672, + "grad_norm": 1.7254561940559956, + "learning_rate": 8.90874594337221e-07, + "loss": 0.436, + "step": 8352 + }, + { + "epoch": 0.8129440389294403, + "grad_norm": 1.5626788227749853, + "learning_rate": 8.899768000024639e-07, + "loss": 0.412, + "step": 8353 + }, + { + "epoch": 0.8130413625304136, + "grad_norm": 1.517502320113443, + "learning_rate": 8.890794140832387e-07, + "loss": 0.4174, + "step": 8354 + }, + { + "epoch": 0.8131386861313868, + "grad_norm": 1.4218407930723913, + "learning_rate": 8.881824366687186e-07, + "loss": 0.3146, + "step": 8355 + }, + { + "epoch": 0.8132360097323601, + "grad_norm": 1.601611476242381, + "learning_rate": 8.872858678480373e-07, + "loss": 0.4836, + "step": 8356 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 2.6559124900465294, + "learning_rate": 8.863897077102867e-07, + "loss": 0.4663, + "step": 8357 + }, + { + "epoch": 0.8134306569343066, + "grad_norm": 1.53292047118392, + "learning_rate": 8.854939563445198e-07, + "loss": 0.4454, + "step": 8358 + }, + { + "epoch": 0.8135279805352799, + "grad_norm": 1.3039408848884098, + "learning_rate": 8.845986138397483e-07, + "loss": 0.3033, + "step": 8359 + }, + { + "epoch": 0.813625304136253, + "grad_norm": 1.6675816249743665, + "learning_rate": 8.837036802849408e-07, + "loss": 0.4876, + "step": 8360 + }, + { + "epoch": 0.8137226277372263, + "grad_norm": 1.429087862264777, + "learning_rate": 8.828091557690288e-07, + "loss": 0.3287, + "step": 8361 + }, + { + "epoch": 0.8138199513381995, + "grad_norm": 1.3804499384705304, + "learning_rate": 8.819150403809018e-07, + "loss": 0.3389, + "step": 8362 + }, + { + "epoch": 0.8139172749391728, + "grad_norm": 1.6271102847685028, + "learning_rate": 8.810213342094098e-07, + "loss": 0.5728, + "step": 8363 + }, + { + "epoch": 0.814014598540146, + "grad_norm": 1.6375864137168874, + "learning_rate": 8.801280373433579e-07, + "loss": 0.322, + "step": 8364 + }, + { + "epoch": 0.8141119221411193, + "grad_norm": 1.6745087743092442, + "learning_rate": 8.792351498715152e-07, + "loss": 0.4994, + "step": 8365 + }, + { + "epoch": 0.8142092457420924, + "grad_norm": 1.5671352297888765, + "learning_rate": 8.783426718826088e-07, + "loss": 0.5024, + "step": 8366 + }, + { + "epoch": 0.8143065693430657, + "grad_norm": 1.5611102981308842, + "learning_rate": 8.77450603465324e-07, + "loss": 0.3536, + "step": 8367 + }, + { + "epoch": 0.8144038929440389, + "grad_norm": 2.7289885737039086, + "learning_rate": 8.765589447083073e-07, + "loss": 0.2806, + "step": 8368 + }, + { + "epoch": 0.8145012165450122, + "grad_norm": 1.5880156933467862, + "learning_rate": 8.756676957001619e-07, + "loss": 0.401, + "step": 8369 + }, + { + "epoch": 0.8145985401459854, + "grad_norm": 1.5968811094760667, + "learning_rate": 8.747768565294545e-07, + "loss": 0.2562, + "step": 8370 + }, + { + "epoch": 0.8146958637469587, + "grad_norm": 1.852788849009096, + "learning_rate": 8.738864272847053e-07, + "loss": 0.3074, + "step": 8371 + }, + { + "epoch": 0.8147931873479318, + "grad_norm": 1.6018889303863078, + "learning_rate": 8.729964080543973e-07, + "loss": 0.3314, + "step": 8372 + }, + { + "epoch": 0.8148905109489051, + "grad_norm": 1.3428650536927376, + "learning_rate": 8.721067989269732e-07, + "loss": 0.1677, + "step": 8373 + }, + { + "epoch": 0.8149878345498783, + "grad_norm": 1.6290238518651834, + "learning_rate": 8.712175999908351e-07, + "loss": 0.3014, + "step": 8374 + }, + { + "epoch": 0.8150851581508516, + "grad_norm": 2.0815941099416335, + "learning_rate": 8.703288113343406e-07, + "loss": 0.6475, + "step": 8375 + }, + { + "epoch": 0.8151824817518248, + "grad_norm": 2.085359757669811, + "learning_rate": 8.694404330458101e-07, + "loss": 0.5727, + "step": 8376 + }, + { + "epoch": 0.8152798053527981, + "grad_norm": 1.8078052622815899, + "learning_rate": 8.68552465213523e-07, + "loss": 0.4333, + "step": 8377 + }, + { + "epoch": 0.8153771289537712, + "grad_norm": 1.9008911351913662, + "learning_rate": 8.676649079257165e-07, + "loss": 0.3752, + "step": 8378 + }, + { + "epoch": 0.8154744525547445, + "grad_norm": 1.6132573825650118, + "learning_rate": 8.66777761270588e-07, + "loss": 0.5297, + "step": 8379 + }, + { + "epoch": 0.8155717761557177, + "grad_norm": 1.680869108256555, + "learning_rate": 8.65891025336294e-07, + "loss": 0.3869, + "step": 8380 + }, + { + "epoch": 0.815669099756691, + "grad_norm": 2.168698079919592, + "learning_rate": 8.650047002109513e-07, + "loss": 0.2389, + "step": 8381 + }, + { + "epoch": 0.8157664233576642, + "grad_norm": 1.8664539125088333, + "learning_rate": 8.641187859826311e-07, + "loss": 0.495, + "step": 8382 + }, + { + "epoch": 0.8158637469586375, + "grad_norm": 1.6839525588323292, + "learning_rate": 8.632332827393703e-07, + "loss": 0.4894, + "step": 8383 + }, + { + "epoch": 0.8159610705596108, + "grad_norm": 2.1879143458726826, + "learning_rate": 8.623481905691611e-07, + "loss": 0.2496, + "step": 8384 + }, + { + "epoch": 0.8160583941605839, + "grad_norm": 1.5444132164024813, + "learning_rate": 8.61463509559955e-07, + "loss": 0.2499, + "step": 8385 + }, + { + "epoch": 0.8161557177615572, + "grad_norm": 1.7819632725040162, + "learning_rate": 8.605792397996632e-07, + "loss": 0.278, + "step": 8386 + }, + { + "epoch": 0.8162530413625304, + "grad_norm": 1.7937967885790271, + "learning_rate": 8.596953813761566e-07, + "loss": 0.3465, + "step": 8387 + }, + { + "epoch": 0.8163503649635037, + "grad_norm": 1.6107516280501581, + "learning_rate": 8.58811934377265e-07, + "loss": 0.3904, + "step": 8388 + }, + { + "epoch": 0.8164476885644769, + "grad_norm": 1.4380608169576985, + "learning_rate": 8.579288988907769e-07, + "loss": 0.3512, + "step": 8389 + }, + { + "epoch": 0.8165450121654502, + "grad_norm": 1.691711821055243, + "learning_rate": 8.5704627500444e-07, + "loss": 0.3218, + "step": 8390 + }, + { + "epoch": 0.8166423357664233, + "grad_norm": 1.5855355340590112, + "learning_rate": 8.561640628059609e-07, + "loss": 0.2961, + "step": 8391 + }, + { + "epoch": 0.8167396593673966, + "grad_norm": 1.4913409163330993, + "learning_rate": 8.552822623830076e-07, + "loss": 0.1836, + "step": 8392 + }, + { + "epoch": 0.8168369829683698, + "grad_norm": 1.6474102851393784, + "learning_rate": 8.544008738232018e-07, + "loss": 0.4513, + "step": 8393 + }, + { + "epoch": 0.8169343065693431, + "grad_norm": 1.3310786893699869, + "learning_rate": 8.535198972141295e-07, + "loss": 0.2034, + "step": 8394 + }, + { + "epoch": 0.8170316301703163, + "grad_norm": 1.5812020094754773, + "learning_rate": 8.526393326433352e-07, + "loss": 0.4228, + "step": 8395 + }, + { + "epoch": 0.8171289537712896, + "grad_norm": 1.4401216235720966, + "learning_rate": 8.517591801983177e-07, + "loss": 0.45, + "step": 8396 + }, + { + "epoch": 0.8172262773722627, + "grad_norm": 1.5593055113329588, + "learning_rate": 8.508794399665404e-07, + "loss": 0.3937, + "step": 8397 + }, + { + "epoch": 0.817323600973236, + "grad_norm": 1.5310632241448212, + "learning_rate": 8.500001120354234e-07, + "loss": 0.4129, + "step": 8398 + }, + { + "epoch": 0.8174209245742092, + "grad_norm": 1.6438519667412148, + "learning_rate": 8.491211964923462e-07, + "loss": 0.281, + "step": 8399 + }, + { + "epoch": 0.8175182481751825, + "grad_norm": 1.5817666668460735, + "learning_rate": 8.482426934246468e-07, + "loss": 0.494, + "step": 8400 + }, + { + "epoch": 0.8176155717761557, + "grad_norm": 1.5199672201035155, + "learning_rate": 8.473646029196231e-07, + "loss": 0.4793, + "step": 8401 + }, + { + "epoch": 0.817712895377129, + "grad_norm": 1.504477493984947, + "learning_rate": 8.464869250645324e-07, + "loss": 0.3781, + "step": 8402 + }, + { + "epoch": 0.8178102189781022, + "grad_norm": 1.8552794671133859, + "learning_rate": 8.456096599465874e-07, + "loss": 0.5114, + "step": 8403 + }, + { + "epoch": 0.8179075425790754, + "grad_norm": 1.5041473591728602, + "learning_rate": 8.447328076529643e-07, + "loss": 0.2815, + "step": 8404 + }, + { + "epoch": 0.8180048661800486, + "grad_norm": 1.9304936381533826, + "learning_rate": 8.438563682707962e-07, + "loss": 0.4259, + "step": 8405 + }, + { + "epoch": 0.8181021897810219, + "grad_norm": 2.383616527547317, + "learning_rate": 8.429803418871762e-07, + "loss": 0.4083, + "step": 8406 + }, + { + "epoch": 0.8181995133819951, + "grad_norm": 1.571738738871395, + "learning_rate": 8.421047285891537e-07, + "loss": 0.4581, + "step": 8407 + }, + { + "epoch": 0.8182968369829684, + "grad_norm": 1.37179582232651, + "learning_rate": 8.412295284637401e-07, + "loss": 0.3711, + "step": 8408 + }, + { + "epoch": 0.8183941605839417, + "grad_norm": 1.5585814358408825, + "learning_rate": 8.40354741597903e-07, + "loss": 0.3632, + "step": 8409 + }, + { + "epoch": 0.8184914841849148, + "grad_norm": 1.5812784546715661, + "learning_rate": 8.394803680785746e-07, + "loss": 0.4247, + "step": 8410 + }, + { + "epoch": 0.818588807785888, + "grad_norm": 1.5512756789400532, + "learning_rate": 8.386064079926375e-07, + "loss": 0.4388, + "step": 8411 + }, + { + "epoch": 0.8186861313868613, + "grad_norm": 1.8147162994942616, + "learning_rate": 8.377328614269403e-07, + "loss": 0.5282, + "step": 8412 + }, + { + "epoch": 0.8187834549878346, + "grad_norm": 1.515612100762602, + "learning_rate": 8.368597284682878e-07, + "loss": 0.3242, + "step": 8413 + }, + { + "epoch": 0.8188807785888078, + "grad_norm": 1.2374079660428365, + "learning_rate": 8.359870092034417e-07, + "loss": 0.2636, + "step": 8414 + }, + { + "epoch": 0.8189781021897811, + "grad_norm": 1.7696887487251824, + "learning_rate": 8.351147037191259e-07, + "loss": 0.3734, + "step": 8415 + }, + { + "epoch": 0.8190754257907542, + "grad_norm": 1.3238483462304655, + "learning_rate": 8.342428121020218e-07, + "loss": 0.3502, + "step": 8416 + }, + { + "epoch": 0.8191727493917275, + "grad_norm": 1.3434659080935833, + "learning_rate": 8.333713344387717e-07, + "loss": 0.2898, + "step": 8417 + }, + { + "epoch": 0.8192700729927007, + "grad_norm": 1.6620200073745592, + "learning_rate": 8.325002708159713e-07, + "loss": 0.4132, + "step": 8418 + }, + { + "epoch": 0.819367396593674, + "grad_norm": 1.4510220312947473, + "learning_rate": 8.316296213201796e-07, + "loss": 0.3123, + "step": 8419 + }, + { + "epoch": 0.8194647201946472, + "grad_norm": 1.6361482850611733, + "learning_rate": 8.307593860379165e-07, + "loss": 0.3371, + "step": 8420 + }, + { + "epoch": 0.8195620437956205, + "grad_norm": 1.4491656106013084, + "learning_rate": 8.298895650556554e-07, + "loss": 0.2731, + "step": 8421 + }, + { + "epoch": 0.8196593673965936, + "grad_norm": 1.6678389559057896, + "learning_rate": 8.290201584598307e-07, + "loss": 0.4127, + "step": 8422 + }, + { + "epoch": 0.8197566909975669, + "grad_norm": 1.4837207356164495, + "learning_rate": 8.281511663368369e-07, + "loss": 0.2748, + "step": 8423 + }, + { + "epoch": 0.8198540145985401, + "grad_norm": 1.717213307847111, + "learning_rate": 8.272825887730268e-07, + "loss": 0.5487, + "step": 8424 + }, + { + "epoch": 0.8199513381995134, + "grad_norm": 2.8681722088218495, + "learning_rate": 8.264144258547097e-07, + "loss": 0.3903, + "step": 8425 + }, + { + "epoch": 0.8200486618004866, + "grad_norm": 1.7309601303524795, + "learning_rate": 8.255466776681564e-07, + "loss": 0.291, + "step": 8426 + }, + { + "epoch": 0.8201459854014599, + "grad_norm": 1.653073413527135, + "learning_rate": 8.246793442995954e-07, + "loss": 0.5024, + "step": 8427 + }, + { + "epoch": 0.8202433090024331, + "grad_norm": 1.5113909856442893, + "learning_rate": 8.238124258352159e-07, + "loss": 0.2964, + "step": 8428 + }, + { + "epoch": 0.8203406326034063, + "grad_norm": 1.3621232763116542, + "learning_rate": 8.229459223611596e-07, + "loss": 0.3668, + "step": 8429 + }, + { + "epoch": 0.8204379562043795, + "grad_norm": 2.0185129930003862, + "learning_rate": 8.220798339635361e-07, + "loss": 0.3674, + "step": 8430 + }, + { + "epoch": 0.8205352798053528, + "grad_norm": 1.649731674918047, + "learning_rate": 8.212141607284085e-07, + "loss": 0.2439, + "step": 8431 + }, + { + "epoch": 0.820632603406326, + "grad_norm": 1.7594882013955706, + "learning_rate": 8.203489027417966e-07, + "loss": 0.2645, + "step": 8432 + }, + { + "epoch": 0.8207299270072993, + "grad_norm": 1.773570522075255, + "learning_rate": 8.194840600896836e-07, + "loss": 0.4707, + "step": 8433 + }, + { + "epoch": 0.8208272506082726, + "grad_norm": 1.6728599712919863, + "learning_rate": 8.186196328580093e-07, + "loss": 0.2906, + "step": 8434 + }, + { + "epoch": 0.8209245742092457, + "grad_norm": 1.6302212683151303, + "learning_rate": 8.177556211326732e-07, + "loss": 0.348, + "step": 8435 + }, + { + "epoch": 0.821021897810219, + "grad_norm": 1.7108283857276207, + "learning_rate": 8.168920249995305e-07, + "loss": 0.4203, + "step": 8436 + }, + { + "epoch": 0.8211192214111922, + "grad_norm": 1.406321036346, + "learning_rate": 8.160288445443982e-07, + "loss": 0.3131, + "step": 8437 + }, + { + "epoch": 0.8212165450121655, + "grad_norm": 1.5032263866341844, + "learning_rate": 8.151660798530525e-07, + "loss": 0.3947, + "step": 8438 + }, + { + "epoch": 0.8213138686131387, + "grad_norm": 1.7818031790558218, + "learning_rate": 8.143037310112234e-07, + "loss": 0.3774, + "step": 8439 + }, + { + "epoch": 0.821411192214112, + "grad_norm": 1.784985156384894, + "learning_rate": 8.134417981046067e-07, + "loss": 0.3621, + "step": 8440 + }, + { + "epoch": 0.8215085158150851, + "grad_norm": 1.2925157934880391, + "learning_rate": 8.125802812188516e-07, + "loss": 0.1804, + "step": 8441 + }, + { + "epoch": 0.8216058394160584, + "grad_norm": 1.5428146864837118, + "learning_rate": 8.117191804395685e-07, + "loss": 0.4019, + "step": 8442 + }, + { + "epoch": 0.8217031630170316, + "grad_norm": 1.5666618614258359, + "learning_rate": 8.108584958523241e-07, + "loss": 0.4222, + "step": 8443 + }, + { + "epoch": 0.8218004866180049, + "grad_norm": 1.3380600425209073, + "learning_rate": 8.099982275426454e-07, + "loss": 0.3325, + "step": 8444 + }, + { + "epoch": 0.8218978102189781, + "grad_norm": 1.9129346366120812, + "learning_rate": 8.091383755960186e-07, + "loss": 0.5231, + "step": 8445 + }, + { + "epoch": 0.8219951338199514, + "grad_norm": 1.6612325612595942, + "learning_rate": 8.082789400978885e-07, + "loss": 0.3439, + "step": 8446 + }, + { + "epoch": 0.8220924574209246, + "grad_norm": 1.758265119136404, + "learning_rate": 8.074199211336553e-07, + "loss": 0.6109, + "step": 8447 + }, + { + "epoch": 0.8221897810218978, + "grad_norm": 1.5906337115755578, + "learning_rate": 8.065613187886817e-07, + "loss": 0.4181, + "step": 8448 + }, + { + "epoch": 0.822287104622871, + "grad_norm": 1.5296448838996763, + "learning_rate": 8.057031331482878e-07, + "loss": 0.3719, + "step": 8449 + }, + { + "epoch": 0.8223844282238443, + "grad_norm": 1.463294753925276, + "learning_rate": 8.0484536429775e-07, + "loss": 0.2459, + "step": 8450 + }, + { + "epoch": 0.8224817518248175, + "grad_norm": 1.528261187204968, + "learning_rate": 8.039880123223077e-07, + "loss": 0.4563, + "step": 8451 + }, + { + "epoch": 0.8225790754257908, + "grad_norm": 1.6104986471782168, + "learning_rate": 8.03131077307156e-07, + "loss": 0.2692, + "step": 8452 + }, + { + "epoch": 0.822676399026764, + "grad_norm": 1.4564238808748196, + "learning_rate": 8.022745593374492e-07, + "loss": 0.346, + "step": 8453 + }, + { + "epoch": 0.8227737226277372, + "grad_norm": 1.6764344407348686, + "learning_rate": 8.014184584982992e-07, + "loss": 0.5887, + "step": 8454 + }, + { + "epoch": 0.8228710462287104, + "grad_norm": 1.2445058241723603, + "learning_rate": 8.005627748747774e-07, + "loss": 0.1516, + "step": 8455 + }, + { + "epoch": 0.8229683698296837, + "grad_norm": 1.521920412235942, + "learning_rate": 7.997075085519146e-07, + "loss": 0.4181, + "step": 8456 + }, + { + "epoch": 0.8230656934306569, + "grad_norm": 1.569907824246264, + "learning_rate": 7.988526596146978e-07, + "loss": 0.2605, + "step": 8457 + }, + { + "epoch": 0.8231630170316302, + "grad_norm": 1.579251859609775, + "learning_rate": 7.979982281480741e-07, + "loss": 0.3578, + "step": 8458 + }, + { + "epoch": 0.8232603406326034, + "grad_norm": 1.5453003854922795, + "learning_rate": 7.971442142369496e-07, + "loss": 0.4467, + "step": 8459 + }, + { + "epoch": 0.8233576642335766, + "grad_norm": 1.8161532974490906, + "learning_rate": 7.962906179661872e-07, + "loss": 0.3744, + "step": 8460 + }, + { + "epoch": 0.8234549878345498, + "grad_norm": 1.7800499068989681, + "learning_rate": 7.954374394206104e-07, + "loss": 0.4661, + "step": 8461 + }, + { + "epoch": 0.8235523114355231, + "grad_norm": 1.7058900558266819, + "learning_rate": 7.945846786849992e-07, + "loss": 0.5414, + "step": 8462 + }, + { + "epoch": 0.8236496350364964, + "grad_norm": 1.5557653630016244, + "learning_rate": 7.937323358440935e-07, + "loss": 0.1821, + "step": 8463 + }, + { + "epoch": 0.8237469586374696, + "grad_norm": 1.6992141197404587, + "learning_rate": 7.928804109825921e-07, + "loss": 0.3581, + "step": 8464 + }, + { + "epoch": 0.8238442822384429, + "grad_norm": 1.5576017277347993, + "learning_rate": 7.920289041851493e-07, + "loss": 0.3617, + "step": 8465 + }, + { + "epoch": 0.8239416058394161, + "grad_norm": 3.071183020575951, + "learning_rate": 7.911778155363803e-07, + "loss": 0.3682, + "step": 8466 + }, + { + "epoch": 0.8240389294403893, + "grad_norm": 1.8622228147356057, + "learning_rate": 7.903271451208599e-07, + "loss": 0.5656, + "step": 8467 + }, + { + "epoch": 0.8241362530413625, + "grad_norm": 1.5430061432375595, + "learning_rate": 7.894768930231178e-07, + "loss": 0.2205, + "step": 8468 + }, + { + "epoch": 0.8242335766423358, + "grad_norm": 1.7113566124797384, + "learning_rate": 7.886270593276446e-07, + "loss": 0.297, + "step": 8469 + }, + { + "epoch": 0.824330900243309, + "grad_norm": 1.3597550135031038, + "learning_rate": 7.877776441188889e-07, + "loss": 0.3124, + "step": 8470 + }, + { + "epoch": 0.8244282238442823, + "grad_norm": 1.6936244031221055, + "learning_rate": 7.869286474812582e-07, + "loss": 0.2847, + "step": 8471 + }, + { + "epoch": 0.8245255474452555, + "grad_norm": 1.675070160492396, + "learning_rate": 7.860800694991166e-07, + "loss": 0.4261, + "step": 8472 + }, + { + "epoch": 0.8246228710462287, + "grad_norm": 1.663854666612525, + "learning_rate": 7.85231910256789e-07, + "loss": 0.2811, + "step": 8473 + }, + { + "epoch": 0.8247201946472019, + "grad_norm": 1.5705588987249137, + "learning_rate": 7.843841698385585e-07, + "loss": 0.3556, + "step": 8474 + }, + { + "epoch": 0.8248175182481752, + "grad_norm": 1.6478376698357642, + "learning_rate": 7.835368483286626e-07, + "loss": 0.3289, + "step": 8475 + }, + { + "epoch": 0.8249148418491484, + "grad_norm": 1.6081901559035754, + "learning_rate": 7.826899458113019e-07, + "loss": 0.4915, + "step": 8476 + }, + { + "epoch": 0.8250121654501217, + "grad_norm": 1.5992094292157084, + "learning_rate": 7.818434623706333e-07, + "loss": 0.5246, + "step": 8477 + }, + { + "epoch": 0.8251094890510949, + "grad_norm": 1.5338165416755212, + "learning_rate": 7.809973980907742e-07, + "loss": 0.3045, + "step": 8478 + }, + { + "epoch": 0.8252068126520681, + "grad_norm": 1.7046004737297455, + "learning_rate": 7.801517530557951e-07, + "loss": 0.3665, + "step": 8479 + }, + { + "epoch": 0.8253041362530413, + "grad_norm": 1.7576330474806017, + "learning_rate": 7.793065273497302e-07, + "loss": 0.4076, + "step": 8480 + }, + { + "epoch": 0.8254014598540146, + "grad_norm": 2.09172996891279, + "learning_rate": 7.784617210565698e-07, + "loss": 0.4347, + "step": 8481 + }, + { + "epoch": 0.8254987834549878, + "grad_norm": 1.4082153997005342, + "learning_rate": 7.776173342602633e-07, + "loss": 0.2659, + "step": 8482 + }, + { + "epoch": 0.8255961070559611, + "grad_norm": 1.761134623856262, + "learning_rate": 7.767733670447175e-07, + "loss": 0.2619, + "step": 8483 + }, + { + "epoch": 0.8256934306569343, + "grad_norm": 1.8439889526698254, + "learning_rate": 7.759298194937976e-07, + "loss": 0.3088, + "step": 8484 + }, + { + "epoch": 0.8257907542579075, + "grad_norm": 1.6609470772668964, + "learning_rate": 7.750866916913291e-07, + "loss": 0.3519, + "step": 8485 + }, + { + "epoch": 0.8258880778588807, + "grad_norm": 2.1407092001289203, + "learning_rate": 7.742439837210924e-07, + "loss": 0.3493, + "step": 8486 + }, + { + "epoch": 0.825985401459854, + "grad_norm": 1.8911901842984078, + "learning_rate": 7.734016956668277e-07, + "loss": 0.2925, + "step": 8487 + }, + { + "epoch": 0.8260827250608272, + "grad_norm": 1.4743735421671984, + "learning_rate": 7.725598276122348e-07, + "loss": 0.1724, + "step": 8488 + }, + { + "epoch": 0.8261800486618005, + "grad_norm": 2.0066698437197723, + "learning_rate": 7.71718379640971e-07, + "loss": 0.3221, + "step": 8489 + }, + { + "epoch": 0.8262773722627738, + "grad_norm": 1.8293115400613988, + "learning_rate": 7.708773518366497e-07, + "loss": 0.4529, + "step": 8490 + }, + { + "epoch": 0.826374695863747, + "grad_norm": 1.582478777847092, + "learning_rate": 7.700367442828455e-07, + "loss": 0.2012, + "step": 8491 + }, + { + "epoch": 0.8264720194647202, + "grad_norm": 2.0102846830327135, + "learning_rate": 7.6919655706309e-07, + "loss": 0.3569, + "step": 8492 + }, + { + "epoch": 0.8265693430656934, + "grad_norm": 1.6704808777459978, + "learning_rate": 7.68356790260873e-07, + "loss": 0.412, + "step": 8493 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 1.3632836109567217, + "learning_rate": 7.675174439596422e-07, + "loss": 0.2031, + "step": 8494 + }, + { + "epoch": 0.8267639902676399, + "grad_norm": 1.7150625727946704, + "learning_rate": 7.666785182428055e-07, + "loss": 0.2449, + "step": 8495 + }, + { + "epoch": 0.8268613138686132, + "grad_norm": 1.9111430421768887, + "learning_rate": 7.658400131937266e-07, + "loss": 0.3597, + "step": 8496 + }, + { + "epoch": 0.8269586374695864, + "grad_norm": 1.651549380565205, + "learning_rate": 7.650019288957278e-07, + "loss": 0.5377, + "step": 8497 + }, + { + "epoch": 0.8270559610705596, + "grad_norm": 1.9261399993084707, + "learning_rate": 7.641642654320896e-07, + "loss": 0.3397, + "step": 8498 + }, + { + "epoch": 0.8271532846715328, + "grad_norm": 1.5817674958569945, + "learning_rate": 7.633270228860523e-07, + "loss": 0.1748, + "step": 8499 + }, + { + "epoch": 0.8272506082725061, + "grad_norm": 1.5979824339434694, + "learning_rate": 7.624902013408142e-07, + "loss": 0.1875, + "step": 8500 + }, + { + "epoch": 0.8273479318734793, + "grad_norm": 1.4587789490482372, + "learning_rate": 7.616538008795283e-07, + "loss": 0.2841, + "step": 8501 + }, + { + "epoch": 0.8274452554744526, + "grad_norm": 1.6610032736960159, + "learning_rate": 7.608178215853085e-07, + "loss": 0.4225, + "step": 8502 + }, + { + "epoch": 0.8275425790754258, + "grad_norm": 1.612415885209967, + "learning_rate": 7.59982263541228e-07, + "loss": 0.3336, + "step": 8503 + }, + { + "epoch": 0.827639902676399, + "grad_norm": 1.3200247185733505, + "learning_rate": 7.591471268303158e-07, + "loss": 0.3065, + "step": 8504 + }, + { + "epoch": 0.8277372262773722, + "grad_norm": 1.4465785074986195, + "learning_rate": 7.583124115355605e-07, + "loss": 0.1995, + "step": 8505 + }, + { + "epoch": 0.8278345498783455, + "grad_norm": 1.4905802274173892, + "learning_rate": 7.574781177399076e-07, + "loss": 0.277, + "step": 8506 + }, + { + "epoch": 0.8279318734793187, + "grad_norm": 1.545399356942915, + "learning_rate": 7.566442455262634e-07, + "loss": 0.3877, + "step": 8507 + }, + { + "epoch": 0.828029197080292, + "grad_norm": 1.621358679758913, + "learning_rate": 7.558107949774874e-07, + "loss": 0.5023, + "step": 8508 + }, + { + "epoch": 0.8281265206812652, + "grad_norm": 1.4670204368109103, + "learning_rate": 7.549777661764018e-07, + "loss": 0.3516, + "step": 8509 + }, + { + "epoch": 0.8282238442822385, + "grad_norm": 1.564514616139971, + "learning_rate": 7.541451592057841e-07, + "loss": 0.4125, + "step": 8510 + }, + { + "epoch": 0.8283211678832116, + "grad_norm": 1.6055541375464513, + "learning_rate": 7.53312974148373e-07, + "loss": 0.3947, + "step": 8511 + }, + { + "epoch": 0.8284184914841849, + "grad_norm": 1.7720002387457832, + "learning_rate": 7.524812110868607e-07, + "loss": 0.4002, + "step": 8512 + }, + { + "epoch": 0.8285158150851581, + "grad_norm": 1.8245112901350324, + "learning_rate": 7.516498701039016e-07, + "loss": 0.3383, + "step": 8513 + }, + { + "epoch": 0.8286131386861314, + "grad_norm": 1.6476422601850218, + "learning_rate": 7.508189512821057e-07, + "loss": 0.4616, + "step": 8514 + }, + { + "epoch": 0.8287104622871047, + "grad_norm": 1.9040028819535186, + "learning_rate": 7.499884547040426e-07, + "loss": 0.5753, + "step": 8515 + }, + { + "epoch": 0.8288077858880779, + "grad_norm": 1.4474534132976413, + "learning_rate": 7.49158380452239e-07, + "loss": 0.2413, + "step": 8516 + }, + { + "epoch": 0.828905109489051, + "grad_norm": 1.7732310574061771, + "learning_rate": 7.483287286091794e-07, + "loss": 0.5047, + "step": 8517 + }, + { + "epoch": 0.8290024330900243, + "grad_norm": 1.6198648597723118, + "learning_rate": 7.474994992573093e-07, + "loss": 0.4773, + "step": 8518 + }, + { + "epoch": 0.8290997566909976, + "grad_norm": 1.5960483807852985, + "learning_rate": 7.466706924790257e-07, + "loss": 0.2929, + "step": 8519 + }, + { + "epoch": 0.8291970802919708, + "grad_norm": 1.911649841638084, + "learning_rate": 7.458423083566907e-07, + "loss": 0.6987, + "step": 8520 + }, + { + "epoch": 0.8292944038929441, + "grad_norm": 4.198090555203923, + "learning_rate": 7.450143469726206e-07, + "loss": 0.5347, + "step": 8521 + }, + { + "epoch": 0.8293917274939173, + "grad_norm": 1.5890119846533022, + "learning_rate": 7.441868084090897e-07, + "loss": 0.2883, + "step": 8522 + }, + { + "epoch": 0.8294890510948905, + "grad_norm": 1.3406992147039773, + "learning_rate": 7.433596927483311e-07, + "loss": 0.188, + "step": 8523 + }, + { + "epoch": 0.8295863746958637, + "grad_norm": 1.7976489846531598, + "learning_rate": 7.425330000725361e-07, + "loss": 0.4347, + "step": 8524 + }, + { + "epoch": 0.829683698296837, + "grad_norm": 1.6595421945713655, + "learning_rate": 7.417067304638537e-07, + "loss": 0.2239, + "step": 8525 + }, + { + "epoch": 0.8297810218978102, + "grad_norm": 1.2218343391384832, + "learning_rate": 7.408808840043913e-07, + "loss": 0.2358, + "step": 8526 + }, + { + "epoch": 0.8298783454987835, + "grad_norm": 1.6711977005666543, + "learning_rate": 7.400554607762129e-07, + "loss": 0.4763, + "step": 8527 + }, + { + "epoch": 0.8299756690997567, + "grad_norm": 1.6147880260374157, + "learning_rate": 7.392304608613416e-07, + "loss": 0.3145, + "step": 8528 + }, + { + "epoch": 0.8300729927007299, + "grad_norm": 1.6613868371135543, + "learning_rate": 7.384058843417596e-07, + "loss": 0.3691, + "step": 8529 + }, + { + "epoch": 0.8301703163017031, + "grad_norm": 1.7534075667376292, + "learning_rate": 7.375817312994032e-07, + "loss": 0.3058, + "step": 8530 + }, + { + "epoch": 0.8302676399026764, + "grad_norm": 1.4145458090987433, + "learning_rate": 7.367580018161702e-07, + "loss": 0.3041, + "step": 8531 + }, + { + "epoch": 0.8303649635036496, + "grad_norm": 1.6052730840173675, + "learning_rate": 7.359346959739161e-07, + "loss": 0.5314, + "step": 8532 + }, + { + "epoch": 0.8304622871046229, + "grad_norm": 1.767528554316132, + "learning_rate": 7.351118138544511e-07, + "loss": 0.5447, + "step": 8533 + }, + { + "epoch": 0.8305596107055961, + "grad_norm": 1.5005044089039314, + "learning_rate": 7.342893555395464e-07, + "loss": 0.2781, + "step": 8534 + }, + { + "epoch": 0.8306569343065694, + "grad_norm": 1.933306185213851, + "learning_rate": 7.3346732111093e-07, + "loss": 0.4388, + "step": 8535 + }, + { + "epoch": 0.8307542579075425, + "grad_norm": 1.932468279421543, + "learning_rate": 7.326457106502888e-07, + "loss": 0.4213, + "step": 8536 + }, + { + "epoch": 0.8308515815085158, + "grad_norm": 2.098788220601136, + "learning_rate": 7.318245242392658e-07, + "loss": 0.4954, + "step": 8537 + }, + { + "epoch": 0.830948905109489, + "grad_norm": 1.860393469631337, + "learning_rate": 7.310037619594634e-07, + "loss": 0.4244, + "step": 8538 + }, + { + "epoch": 0.8310462287104623, + "grad_norm": 1.524397953200036, + "learning_rate": 7.301834238924427e-07, + "loss": 0.2804, + "step": 8539 + }, + { + "epoch": 0.8311435523114356, + "grad_norm": 1.4612297540963883, + "learning_rate": 7.293635101197183e-07, + "loss": 0.3336, + "step": 8540 + }, + { + "epoch": 0.8312408759124088, + "grad_norm": 1.754447326800058, + "learning_rate": 7.285440207227662e-07, + "loss": 0.3117, + "step": 8541 + }, + { + "epoch": 0.831338199513382, + "grad_norm": 1.594443226561633, + "learning_rate": 7.277249557830207e-07, + "loss": 0.5068, + "step": 8542 + }, + { + "epoch": 0.8314355231143552, + "grad_norm": 1.5019575696862064, + "learning_rate": 7.269063153818739e-07, + "loss": 0.3333, + "step": 8543 + }, + { + "epoch": 0.8315328467153285, + "grad_norm": 2.6813815353603423, + "learning_rate": 7.260880996006713e-07, + "loss": 0.3116, + "step": 8544 + }, + { + "epoch": 0.8316301703163017, + "grad_norm": 1.5663155317792048, + "learning_rate": 7.252703085207214e-07, + "loss": 0.3376, + "step": 8545 + }, + { + "epoch": 0.831727493917275, + "grad_norm": 1.3523063597201452, + "learning_rate": 7.244529422232882e-07, + "loss": 0.2078, + "step": 8546 + }, + { + "epoch": 0.8318248175182482, + "grad_norm": 1.6322378945353828, + "learning_rate": 7.236360007895937e-07, + "loss": 0.3601, + "step": 8547 + }, + { + "epoch": 0.8319221411192214, + "grad_norm": 1.5906467518696281, + "learning_rate": 7.22819484300819e-07, + "loss": 0.4066, + "step": 8548 + }, + { + "epoch": 0.8320194647201946, + "grad_norm": 1.4847694274484329, + "learning_rate": 7.220033928381009e-07, + "loss": 0.3151, + "step": 8549 + }, + { + "epoch": 0.8321167883211679, + "grad_norm": 1.560746467822195, + "learning_rate": 7.211877264825362e-07, + "loss": 0.4925, + "step": 8550 + }, + { + "epoch": 0.8322141119221411, + "grad_norm": 1.7371661434929782, + "learning_rate": 7.203724853151761e-07, + "loss": 0.3547, + "step": 8551 + }, + { + "epoch": 0.8323114355231144, + "grad_norm": 1.7898615089844225, + "learning_rate": 7.195576694170319e-07, + "loss": 0.2791, + "step": 8552 + }, + { + "epoch": 0.8324087591240876, + "grad_norm": 1.6287340397905474, + "learning_rate": 7.18743278869074e-07, + "loss": 0.434, + "step": 8553 + }, + { + "epoch": 0.8325060827250609, + "grad_norm": 1.6472164169317869, + "learning_rate": 7.179293137522286e-07, + "loss": 0.3623, + "step": 8554 + }, + { + "epoch": 0.832603406326034, + "grad_norm": 1.7317662595148324, + "learning_rate": 7.171157741473784e-07, + "loss": 0.5001, + "step": 8555 + }, + { + "epoch": 0.8327007299270073, + "grad_norm": 1.5191282703920355, + "learning_rate": 7.163026601353656e-07, + "loss": 0.3465, + "step": 8556 + }, + { + "epoch": 0.8327980535279805, + "grad_norm": 1.4559881245003041, + "learning_rate": 7.15489971796991e-07, + "loss": 0.2078, + "step": 8557 + }, + { + "epoch": 0.8328953771289538, + "grad_norm": 1.7207641590737977, + "learning_rate": 7.146777092130114e-07, + "loss": 0.4111, + "step": 8558 + }, + { + "epoch": 0.832992700729927, + "grad_norm": 1.5564791836178358, + "learning_rate": 7.138658724641417e-07, + "loss": 0.4267, + "step": 8559 + }, + { + "epoch": 0.8330900243309003, + "grad_norm": 1.674301525638343, + "learning_rate": 7.130544616310548e-07, + "loss": 0.5902, + "step": 8560 + }, + { + "epoch": 0.8331873479318734, + "grad_norm": 1.5137309401278491, + "learning_rate": 7.122434767943815e-07, + "loss": 0.3154, + "step": 8561 + }, + { + "epoch": 0.8332846715328467, + "grad_norm": 1.4936529024240945, + "learning_rate": 7.11432918034709e-07, + "loss": 0.3015, + "step": 8562 + }, + { + "epoch": 0.8333819951338199, + "grad_norm": 1.8057274402411736, + "learning_rate": 7.106227854325831e-07, + "loss": 0.4689, + "step": 8563 + }, + { + "epoch": 0.8334793187347932, + "grad_norm": 1.307474551637012, + "learning_rate": 7.098130790685076e-07, + "loss": 0.167, + "step": 8564 + }, + { + "epoch": 0.8335766423357664, + "grad_norm": 1.814497537551853, + "learning_rate": 7.090037990229442e-07, + "loss": 0.4195, + "step": 8565 + }, + { + "epoch": 0.8336739659367397, + "grad_norm": 1.7873667714013382, + "learning_rate": 7.081949453763099e-07, + "loss": 0.3294, + "step": 8566 + }, + { + "epoch": 0.8337712895377128, + "grad_norm": 2.052419941464072, + "learning_rate": 7.073865182089818e-07, + "loss": 0.5371, + "step": 8567 + }, + { + "epoch": 0.8338686131386861, + "grad_norm": 1.6722127983162858, + "learning_rate": 7.06578517601294e-07, + "loss": 0.3605, + "step": 8568 + }, + { + "epoch": 0.8339659367396594, + "grad_norm": 1.7074798064482961, + "learning_rate": 7.057709436335375e-07, + "loss": 0.3714, + "step": 8569 + }, + { + "epoch": 0.8340632603406326, + "grad_norm": 1.7199654876218387, + "learning_rate": 7.049637963859618e-07, + "loss": 0.4919, + "step": 8570 + }, + { + "epoch": 0.8341605839416059, + "grad_norm": 1.5838731380966813, + "learning_rate": 7.041570759387739e-07, + "loss": 0.3795, + "step": 8571 + }, + { + "epoch": 0.8342579075425791, + "grad_norm": 1.8021606987312517, + "learning_rate": 7.033507823721386e-07, + "loss": 0.4073, + "step": 8572 + }, + { + "epoch": 0.8343552311435523, + "grad_norm": 1.4219601783810147, + "learning_rate": 7.025449157661763e-07, + "loss": 0.3242, + "step": 8573 + }, + { + "epoch": 0.8344525547445255, + "grad_norm": 1.4983434271693445, + "learning_rate": 7.017394762009667e-07, + "loss": 0.4312, + "step": 8574 + }, + { + "epoch": 0.8345498783454988, + "grad_norm": 1.460210364223263, + "learning_rate": 7.009344637565484e-07, + "loss": 0.404, + "step": 8575 + }, + { + "epoch": 0.834647201946472, + "grad_norm": 1.388199523340607, + "learning_rate": 7.001298785129141e-07, + "loss": 0.3111, + "step": 8576 + }, + { + "epoch": 0.8347445255474453, + "grad_norm": 1.7969153358244494, + "learning_rate": 6.993257205500164e-07, + "loss": 0.3401, + "step": 8577 + }, + { + "epoch": 0.8348418491484185, + "grad_norm": 1.681639490780686, + "learning_rate": 6.985219899477641e-07, + "loss": 0.3092, + "step": 8578 + }, + { + "epoch": 0.8349391727493918, + "grad_norm": 1.6054515976929327, + "learning_rate": 6.977186867860281e-07, + "loss": 0.4981, + "step": 8579 + }, + { + "epoch": 0.8350364963503649, + "grad_norm": 1.4390050224232458, + "learning_rate": 6.969158111446289e-07, + "loss": 0.205, + "step": 8580 + }, + { + "epoch": 0.8351338199513382, + "grad_norm": 1.6766275611902814, + "learning_rate": 6.961133631033512e-07, + "loss": 0.3687, + "step": 8581 + }, + { + "epoch": 0.8352311435523114, + "grad_norm": 1.593305189104919, + "learning_rate": 6.953113427419333e-07, + "loss": 0.4552, + "step": 8582 + }, + { + "epoch": 0.8353284671532847, + "grad_norm": 3.171598375406762, + "learning_rate": 6.94509750140075e-07, + "loss": 0.1982, + "step": 8583 + }, + { + "epoch": 0.8354257907542579, + "grad_norm": 1.6973712567819714, + "learning_rate": 6.937085853774272e-07, + "loss": 0.3397, + "step": 8584 + }, + { + "epoch": 0.8355231143552312, + "grad_norm": 1.631879183859924, + "learning_rate": 6.92907848533605e-07, + "loss": 0.37, + "step": 8585 + }, + { + "epoch": 0.8356204379562043, + "grad_norm": 1.44325109341486, + "learning_rate": 6.921075396881777e-07, + "loss": 0.2179, + "step": 8586 + }, + { + "epoch": 0.8357177615571776, + "grad_norm": 1.926391690518115, + "learning_rate": 6.913076589206708e-07, + "loss": 0.2858, + "step": 8587 + }, + { + "epoch": 0.8358150851581508, + "grad_norm": 2.044274111864575, + "learning_rate": 6.905082063105695e-07, + "loss": 0.3106, + "step": 8588 + }, + { + "epoch": 0.8359124087591241, + "grad_norm": 1.5650767632326508, + "learning_rate": 6.897091819373175e-07, + "loss": 0.2645, + "step": 8589 + }, + { + "epoch": 0.8360097323600973, + "grad_norm": 2.090428008647602, + "learning_rate": 6.889105858803147e-07, + "loss": 0.3794, + "step": 8590 + }, + { + "epoch": 0.8361070559610706, + "grad_norm": 1.4618622815906854, + "learning_rate": 6.881124182189153e-07, + "loss": 0.2512, + "step": 8591 + }, + { + "epoch": 0.8362043795620437, + "grad_norm": 1.7695858170135843, + "learning_rate": 6.873146790324359e-07, + "loss": 0.3142, + "step": 8592 + }, + { + "epoch": 0.836301703163017, + "grad_norm": 1.7452572177748094, + "learning_rate": 6.86517368400148e-07, + "loss": 0.4342, + "step": 8593 + }, + { + "epoch": 0.8363990267639902, + "grad_norm": 1.7145656629588895, + "learning_rate": 6.857204864012796e-07, + "loss": 0.3788, + "step": 8594 + }, + { + "epoch": 0.8364963503649635, + "grad_norm": 1.3255674982256826, + "learning_rate": 6.849240331150186e-07, + "loss": 0.2187, + "step": 8595 + }, + { + "epoch": 0.8365936739659368, + "grad_norm": 1.634572080323381, + "learning_rate": 6.841280086205082e-07, + "loss": 0.3027, + "step": 8596 + }, + { + "epoch": 0.83669099756691, + "grad_norm": 1.4176279714634104, + "learning_rate": 6.833324129968516e-07, + "loss": 0.3121, + "step": 8597 + }, + { + "epoch": 0.8367883211678833, + "grad_norm": 1.3780331535614276, + "learning_rate": 6.825372463231039e-07, + "loss": 0.1995, + "step": 8598 + }, + { + "epoch": 0.8368856447688564, + "grad_norm": 1.6971802157739329, + "learning_rate": 6.817425086782853e-07, + "loss": 0.2675, + "step": 8599 + }, + { + "epoch": 0.8369829683698297, + "grad_norm": 1.5928509738370114, + "learning_rate": 6.809482001413675e-07, + "loss": 0.1929, + "step": 8600 + }, + { + "epoch": 0.8370802919708029, + "grad_norm": 2.207058364566572, + "learning_rate": 6.801543207912831e-07, + "loss": 0.3983, + "step": 8601 + }, + { + "epoch": 0.8371776155717762, + "grad_norm": 1.3219538311189645, + "learning_rate": 6.793608707069177e-07, + "loss": 0.2636, + "step": 8602 + }, + { + "epoch": 0.8372749391727494, + "grad_norm": 1.6056817648992607, + "learning_rate": 6.785678499671183e-07, + "loss": 0.3341, + "step": 8603 + }, + { + "epoch": 0.8373722627737227, + "grad_norm": 1.6680628729892766, + "learning_rate": 6.777752586506892e-07, + "loss": 0.5588, + "step": 8604 + }, + { + "epoch": 0.8374695863746958, + "grad_norm": 1.4215204823964227, + "learning_rate": 6.769830968363883e-07, + "loss": 0.3646, + "step": 8605 + }, + { + "epoch": 0.8375669099756691, + "grad_norm": 1.995817699118949, + "learning_rate": 6.761913646029339e-07, + "loss": 0.4712, + "step": 8606 + }, + { + "epoch": 0.8376642335766423, + "grad_norm": 1.381363190030633, + "learning_rate": 6.754000620290008e-07, + "loss": 0.2946, + "step": 8607 + }, + { + "epoch": 0.8377615571776156, + "grad_norm": 1.6879051569829766, + "learning_rate": 6.746091891932221e-07, + "loss": 0.2054, + "step": 8608 + }, + { + "epoch": 0.8378588807785888, + "grad_norm": 1.6050551129917523, + "learning_rate": 6.738187461741869e-07, + "loss": 0.3422, + "step": 8609 + }, + { + "epoch": 0.8379562043795621, + "grad_norm": 1.5566174973285833, + "learning_rate": 6.730287330504421e-07, + "loss": 0.3441, + "step": 8610 + }, + { + "epoch": 0.8380535279805352, + "grad_norm": 1.5313977540022496, + "learning_rate": 6.722391499004926e-07, + "loss": 0.4282, + "step": 8611 + }, + { + "epoch": 0.8381508515815085, + "grad_norm": 1.6900611797522427, + "learning_rate": 6.714499968027976e-07, + "loss": 0.3161, + "step": 8612 + }, + { + "epoch": 0.8382481751824817, + "grad_norm": 1.655212077125971, + "learning_rate": 6.706612738357771e-07, + "loss": 0.2535, + "step": 8613 + }, + { + "epoch": 0.838345498783455, + "grad_norm": 2.1010379632990634, + "learning_rate": 6.698729810778065e-07, + "loss": 0.4864, + "step": 8614 + }, + { + "epoch": 0.8384428223844282, + "grad_norm": 1.676358494785513, + "learning_rate": 6.690851186072212e-07, + "loss": 0.2741, + "step": 8615 + }, + { + "epoch": 0.8385401459854015, + "grad_norm": 1.659686643714806, + "learning_rate": 6.682976865023078e-07, + "loss": 0.5273, + "step": 8616 + }, + { + "epoch": 0.8386374695863748, + "grad_norm": 1.4850606037707512, + "learning_rate": 6.67510684841316e-07, + "loss": 0.351, + "step": 8617 + }, + { + "epoch": 0.8387347931873479, + "grad_norm": 1.4498216552234817, + "learning_rate": 6.667241137024505e-07, + "loss": 0.3457, + "step": 8618 + }, + { + "epoch": 0.8388321167883211, + "grad_norm": 1.6532611870149962, + "learning_rate": 6.659379731638732e-07, + "loss": 0.4286, + "step": 8619 + }, + { + "epoch": 0.8389294403892944, + "grad_norm": 1.35646108751345, + "learning_rate": 6.651522633037033e-07, + "loss": 0.2108, + "step": 8620 + }, + { + "epoch": 0.8390267639902677, + "grad_norm": 1.704881514742031, + "learning_rate": 6.643669842000172e-07, + "loss": 0.4756, + "step": 8621 + }, + { + "epoch": 0.8391240875912409, + "grad_norm": 1.8627227649983678, + "learning_rate": 6.635821359308503e-07, + "loss": 0.4357, + "step": 8622 + }, + { + "epoch": 0.8392214111922142, + "grad_norm": 2.14242667234055, + "learning_rate": 6.627977185741907e-07, + "loss": 0.365, + "step": 8623 + }, + { + "epoch": 0.8393187347931873, + "grad_norm": 1.8594593061039326, + "learning_rate": 6.620137322079878e-07, + "loss": 0.3943, + "step": 8624 + }, + { + "epoch": 0.8394160583941606, + "grad_norm": 1.7484495242662454, + "learning_rate": 6.612301769101464e-07, + "loss": 0.6012, + "step": 8625 + }, + { + "epoch": 0.8395133819951338, + "grad_norm": 1.3783435051453137, + "learning_rate": 6.604470527585305e-07, + "loss": 0.2383, + "step": 8626 + }, + { + "epoch": 0.8396107055961071, + "grad_norm": 1.5899346253012838, + "learning_rate": 6.596643598309571e-07, + "loss": 0.3467, + "step": 8627 + }, + { + "epoch": 0.8397080291970803, + "grad_norm": 1.7178503282784012, + "learning_rate": 6.588820982052041e-07, + "loss": 0.3709, + "step": 8628 + }, + { + "epoch": 0.8398053527980536, + "grad_norm": 1.7115790849445944, + "learning_rate": 6.581002679590048e-07, + "loss": 0.3853, + "step": 8629 + }, + { + "epoch": 0.8399026763990267, + "grad_norm": 1.6871646265273004, + "learning_rate": 6.573188691700511e-07, + "loss": 0.452, + "step": 8630 + }, + { + "epoch": 0.84, + "grad_norm": 1.4296357192371167, + "learning_rate": 6.565379019159907e-07, + "loss": 0.3619, + "step": 8631 + }, + { + "epoch": 0.8400973236009732, + "grad_norm": 1.3209544771076784, + "learning_rate": 6.557573662744282e-07, + "loss": 0.2646, + "step": 8632 + }, + { + "epoch": 0.8401946472019465, + "grad_norm": 1.6927973640699001, + "learning_rate": 6.549772623229278e-07, + "loss": 0.2955, + "step": 8633 + }, + { + "epoch": 0.8402919708029197, + "grad_norm": 1.8072285783146922, + "learning_rate": 6.541975901390063e-07, + "loss": 0.3156, + "step": 8634 + }, + { + "epoch": 0.840389294403893, + "grad_norm": 1.5032776943379242, + "learning_rate": 6.534183498001418e-07, + "loss": 0.4644, + "step": 8635 + }, + { + "epoch": 0.8404866180048661, + "grad_norm": 1.8775744248431792, + "learning_rate": 6.526395413837672e-07, + "loss": 0.4384, + "step": 8636 + }, + { + "epoch": 0.8405839416058394, + "grad_norm": 1.4166718744668418, + "learning_rate": 6.518611649672746e-07, + "loss": 0.335, + "step": 8637 + }, + { + "epoch": 0.8406812652068126, + "grad_norm": 1.5633450321147493, + "learning_rate": 6.510832206280093e-07, + "loss": 0.5016, + "step": 8638 + }, + { + "epoch": 0.8407785888077859, + "grad_norm": 1.6935655584944385, + "learning_rate": 6.503057084432773e-07, + "loss": 0.3664, + "step": 8639 + }, + { + "epoch": 0.8408759124087591, + "grad_norm": 1.6116366744119, + "learning_rate": 6.495286284903413e-07, + "loss": 0.3694, + "step": 8640 + }, + { + "epoch": 0.8409732360097324, + "grad_norm": 1.4764915328152817, + "learning_rate": 6.487519808464188e-07, + "loss": 0.213, + "step": 8641 + }, + { + "epoch": 0.8410705596107056, + "grad_norm": 1.5767092255593964, + "learning_rate": 6.47975765588687e-07, + "loss": 0.3403, + "step": 8642 + }, + { + "epoch": 0.8411678832116788, + "grad_norm": 1.3786885465205274, + "learning_rate": 6.471999827942777e-07, + "loss": 0.2776, + "step": 8643 + }, + { + "epoch": 0.841265206812652, + "grad_norm": 1.7941899680493631, + "learning_rate": 6.464246325402834e-07, + "loss": 0.331, + "step": 8644 + }, + { + "epoch": 0.8413625304136253, + "grad_norm": 1.4748190623368278, + "learning_rate": 6.456497149037478e-07, + "loss": 0.3056, + "step": 8645 + }, + { + "epoch": 0.8414598540145986, + "grad_norm": 1.7475327401648297, + "learning_rate": 6.448752299616762e-07, + "loss": 0.3612, + "step": 8646 + }, + { + "epoch": 0.8415571776155718, + "grad_norm": 1.7671061691875058, + "learning_rate": 6.4410117779103e-07, + "loss": 0.3985, + "step": 8647 + }, + { + "epoch": 0.8416545012165451, + "grad_norm": 1.9460907264174498, + "learning_rate": 6.433275584687287e-07, + "loss": 0.5034, + "step": 8648 + }, + { + "epoch": 0.8417518248175182, + "grad_norm": 1.4558811915886565, + "learning_rate": 6.425543720716442e-07, + "loss": 0.3574, + "step": 8649 + }, + { + "epoch": 0.8418491484184915, + "grad_norm": 1.8523496089538563, + "learning_rate": 6.417816186766102e-07, + "loss": 0.3854, + "step": 8650 + }, + { + "epoch": 0.8419464720194647, + "grad_norm": 1.8000537784807913, + "learning_rate": 6.410092983604149e-07, + "loss": 0.5357, + "step": 8651 + }, + { + "epoch": 0.842043795620438, + "grad_norm": 1.352374499944892, + "learning_rate": 6.402374111998055e-07, + "loss": 0.2959, + "step": 8652 + }, + { + "epoch": 0.8421411192214112, + "grad_norm": 1.6284323169813906, + "learning_rate": 6.394659572714845e-07, + "loss": 0.317, + "step": 8653 + }, + { + "epoch": 0.8422384428223845, + "grad_norm": 2.4055583009222086, + "learning_rate": 6.386949366521112e-07, + "loss": 0.3781, + "step": 8654 + }, + { + "epoch": 0.8423357664233576, + "grad_norm": 1.6893934294457125, + "learning_rate": 6.379243494183035e-07, + "loss": 0.2793, + "step": 8655 + }, + { + "epoch": 0.8424330900243309, + "grad_norm": 1.637773196893914, + "learning_rate": 6.371541956466337e-07, + "loss": 0.341, + "step": 8656 + }, + { + "epoch": 0.8425304136253041, + "grad_norm": 1.9097957465920725, + "learning_rate": 6.363844754136328e-07, + "loss": 0.3355, + "step": 8657 + }, + { + "epoch": 0.8426277372262774, + "grad_norm": 2.3671798454135726, + "learning_rate": 6.356151887957901e-07, + "loss": 0.3165, + "step": 8658 + }, + { + "epoch": 0.8427250608272506, + "grad_norm": 2.0678729961436053, + "learning_rate": 6.348463358695473e-07, + "loss": 0.286, + "step": 8659 + }, + { + "epoch": 0.8428223844282239, + "grad_norm": 1.7451722446459736, + "learning_rate": 6.340779167113071e-07, + "loss": 0.1859, + "step": 8660 + }, + { + "epoch": 0.8429197080291971, + "grad_norm": 1.8549885017881802, + "learning_rate": 6.333099313974283e-07, + "loss": 0.4105, + "step": 8661 + }, + { + "epoch": 0.8430170316301703, + "grad_norm": 1.3497120974101795, + "learning_rate": 6.325423800042252e-07, + "loss": 0.3608, + "step": 8662 + }, + { + "epoch": 0.8431143552311435, + "grad_norm": 1.6138464281745304, + "learning_rate": 6.317752626079704e-07, + "loss": 0.3384, + "step": 8663 + }, + { + "epoch": 0.8432116788321168, + "grad_norm": 1.6093773702955847, + "learning_rate": 6.310085792848924e-07, + "loss": 0.428, + "step": 8664 + }, + { + "epoch": 0.84330900243309, + "grad_norm": 1.4164798370283587, + "learning_rate": 6.302423301111777e-07, + "loss": 0.2585, + "step": 8665 + }, + { + "epoch": 0.8434063260340633, + "grad_norm": 1.6569900298938074, + "learning_rate": 6.294765151629689e-07, + "loss": 0.1824, + "step": 8666 + }, + { + "epoch": 0.8435036496350365, + "grad_norm": 1.7339118132820406, + "learning_rate": 6.287111345163649e-07, + "loss": 0.4524, + "step": 8667 + }, + { + "epoch": 0.8436009732360097, + "grad_norm": 1.808129401361493, + "learning_rate": 6.279461882474214e-07, + "loss": 0.2911, + "step": 8668 + }, + { + "epoch": 0.8436982968369829, + "grad_norm": 1.7913937767449668, + "learning_rate": 6.271816764321542e-07, + "loss": 0.3571, + "step": 8669 + }, + { + "epoch": 0.8437956204379562, + "grad_norm": 1.4660603671687962, + "learning_rate": 6.264175991465304e-07, + "loss": 0.3053, + "step": 8670 + }, + { + "epoch": 0.8438929440389294, + "grad_norm": 2.2072884468846032, + "learning_rate": 6.256539564664776e-07, + "loss": 0.3066, + "step": 8671 + }, + { + "epoch": 0.8439902676399027, + "grad_norm": 1.8100316570738135, + "learning_rate": 6.248907484678801e-07, + "loss": 0.3393, + "step": 8672 + }, + { + "epoch": 0.844087591240876, + "grad_norm": 1.953409647226595, + "learning_rate": 6.241279752265783e-07, + "loss": 0.2886, + "step": 8673 + }, + { + "epoch": 0.8441849148418491, + "grad_norm": 1.5071637433050533, + "learning_rate": 6.233656368183688e-07, + "loss": 0.3963, + "step": 8674 + }, + { + "epoch": 0.8442822384428224, + "grad_norm": 1.8204806663536373, + "learning_rate": 6.226037333190065e-07, + "loss": 0.5126, + "step": 8675 + }, + { + "epoch": 0.8443795620437956, + "grad_norm": 1.7944925851272784, + "learning_rate": 6.218422648042027e-07, + "loss": 0.2735, + "step": 8676 + }, + { + "epoch": 0.8444768856447689, + "grad_norm": 1.805865477185065, + "learning_rate": 6.210812313496229e-07, + "loss": 0.4151, + "step": 8677 + }, + { + "epoch": 0.8445742092457421, + "grad_norm": 1.5512683017090658, + "learning_rate": 6.203206330308926e-07, + "loss": 0.3925, + "step": 8678 + }, + { + "epoch": 0.8446715328467154, + "grad_norm": 1.994498195590955, + "learning_rate": 6.19560469923593e-07, + "loss": 0.2901, + "step": 8679 + }, + { + "epoch": 0.8447688564476885, + "grad_norm": 1.757429293082463, + "learning_rate": 6.188007421032632e-07, + "loss": 0.2712, + "step": 8680 + }, + { + "epoch": 0.8448661800486618, + "grad_norm": 1.4443362676427953, + "learning_rate": 6.180414496453951e-07, + "loss": 0.339, + "step": 8681 + }, + { + "epoch": 0.844963503649635, + "grad_norm": 1.5515924057090442, + "learning_rate": 6.172825926254417e-07, + "loss": 0.2303, + "step": 8682 + }, + { + "epoch": 0.8450608272506083, + "grad_norm": 1.7799156277194508, + "learning_rate": 6.165241711188114e-07, + "loss": 0.4688, + "step": 8683 + }, + { + "epoch": 0.8451581508515815, + "grad_norm": 1.6360036521441819, + "learning_rate": 6.157661852008679e-07, + "loss": 0.3107, + "step": 8684 + }, + { + "epoch": 0.8452554744525548, + "grad_norm": 1.5162143151470333, + "learning_rate": 6.150086349469336e-07, + "loss": 0.4376, + "step": 8685 + }, + { + "epoch": 0.845352798053528, + "grad_norm": 1.810649189943814, + "learning_rate": 6.142515204322863e-07, + "loss": 0.299, + "step": 8686 + }, + { + "epoch": 0.8454501216545012, + "grad_norm": 1.670041228373651, + "learning_rate": 6.134948417321623e-07, + "loss": 0.4847, + "step": 8687 + }, + { + "epoch": 0.8455474452554744, + "grad_norm": 1.5518235708219206, + "learning_rate": 6.12738598921751e-07, + "loss": 0.5369, + "step": 8688 + }, + { + "epoch": 0.8456447688564477, + "grad_norm": 1.7920504609905181, + "learning_rate": 6.119827920762017e-07, + "loss": 0.307, + "step": 8689 + }, + { + "epoch": 0.8457420924574209, + "grad_norm": 1.4321443054224512, + "learning_rate": 6.112274212706193e-07, + "loss": 0.3271, + "step": 8690 + }, + { + "epoch": 0.8458394160583942, + "grad_norm": 1.4221589360378877, + "learning_rate": 6.104724865800665e-07, + "loss": 0.3681, + "step": 8691 + }, + { + "epoch": 0.8459367396593674, + "grad_norm": 1.6544745663958784, + "learning_rate": 6.097179880795601e-07, + "loss": 0.3538, + "step": 8692 + }, + { + "epoch": 0.8460340632603406, + "grad_norm": 1.5641953997783475, + "learning_rate": 6.089639258440754e-07, + "loss": 0.5182, + "step": 8693 + }, + { + "epoch": 0.8461313868613138, + "grad_norm": 1.7431617373496398, + "learning_rate": 6.082102999485445e-07, + "loss": 0.5463, + "step": 8694 + }, + { + "epoch": 0.8462287104622871, + "grad_norm": 1.8641315815124582, + "learning_rate": 6.074571104678556e-07, + "loss": 0.4805, + "step": 8695 + }, + { + "epoch": 0.8463260340632603, + "grad_norm": 2.1602093377927885, + "learning_rate": 6.067043574768533e-07, + "loss": 0.4439, + "step": 8696 + }, + { + "epoch": 0.8464233576642336, + "grad_norm": 2.1300122109363113, + "learning_rate": 6.059520410503395e-07, + "loss": 0.5429, + "step": 8697 + }, + { + "epoch": 0.8465206812652069, + "grad_norm": 1.640862002738204, + "learning_rate": 6.052001612630731e-07, + "loss": 0.3275, + "step": 8698 + }, + { + "epoch": 0.84661800486618, + "grad_norm": 1.7542324654870431, + "learning_rate": 6.044487181897674e-07, + "loss": 0.5116, + "step": 8699 + }, + { + "epoch": 0.8467153284671532, + "grad_norm": 1.6879613033894207, + "learning_rate": 6.036977119050936e-07, + "loss": 0.4633, + "step": 8700 + }, + { + "epoch": 0.8468126520681265, + "grad_norm": 1.9596722537070996, + "learning_rate": 6.029471424836808e-07, + "loss": 0.3206, + "step": 8701 + }, + { + "epoch": 0.8469099756690998, + "grad_norm": 1.5325755395987553, + "learning_rate": 6.021970100001135e-07, + "loss": 0.2544, + "step": 8702 + }, + { + "epoch": 0.847007299270073, + "grad_norm": 1.6391677924447912, + "learning_rate": 6.014473145289318e-07, + "loss": 0.2718, + "step": 8703 + }, + { + "epoch": 0.8471046228710463, + "grad_norm": 1.5110720648504852, + "learning_rate": 6.00698056144634e-07, + "loss": 0.3074, + "step": 8704 + }, + { + "epoch": 0.8472019464720195, + "grad_norm": 1.5924148202629524, + "learning_rate": 5.999492349216746e-07, + "loss": 0.2183, + "step": 8705 + }, + { + "epoch": 0.8472992700729927, + "grad_norm": 1.356232705423511, + "learning_rate": 5.992008509344638e-07, + "loss": 0.3114, + "step": 8706 + }, + { + "epoch": 0.8473965936739659, + "grad_norm": 1.5098439501028276, + "learning_rate": 5.9845290425737e-07, + "loss": 0.3307, + "step": 8707 + }, + { + "epoch": 0.8474939172749392, + "grad_norm": 1.5089746453674888, + "learning_rate": 5.97705394964716e-07, + "loss": 0.2009, + "step": 8708 + }, + { + "epoch": 0.8475912408759124, + "grad_norm": 1.620347331453457, + "learning_rate": 5.969583231307841e-07, + "loss": 0.3435, + "step": 8709 + }, + { + "epoch": 0.8476885644768857, + "grad_norm": 1.5586220492516234, + "learning_rate": 5.962116888298086e-07, + "loss": 0.3895, + "step": 8710 + }, + { + "epoch": 0.8477858880778589, + "grad_norm": 1.784878030878728, + "learning_rate": 5.954654921359843e-07, + "loss": 0.399, + "step": 8711 + }, + { + "epoch": 0.8478832116788321, + "grad_norm": 1.692067992550951, + "learning_rate": 5.947197331234628e-07, + "loss": 0.2609, + "step": 8712 + }, + { + "epoch": 0.8479805352798053, + "grad_norm": 1.4989161549303245, + "learning_rate": 5.939744118663476e-07, + "loss": 0.2442, + "step": 8713 + }, + { + "epoch": 0.8480778588807786, + "grad_norm": 1.85361961981458, + "learning_rate": 5.932295284387035e-07, + "loss": 0.3494, + "step": 8714 + }, + { + "epoch": 0.8481751824817518, + "grad_norm": 1.6337530174315067, + "learning_rate": 5.924850829145501e-07, + "loss": 0.2885, + "step": 8715 + }, + { + "epoch": 0.8482725060827251, + "grad_norm": 1.3656275978325547, + "learning_rate": 5.917410753678626e-07, + "loss": 0.2181, + "step": 8716 + }, + { + "epoch": 0.8483698296836983, + "grad_norm": 1.6060737156891838, + "learning_rate": 5.909975058725742e-07, + "loss": 0.1981, + "step": 8717 + }, + { + "epoch": 0.8484671532846715, + "grad_norm": 1.8895423248226388, + "learning_rate": 5.902543745025735e-07, + "loss": 0.5245, + "step": 8718 + }, + { + "epoch": 0.8485644768856447, + "grad_norm": 1.9666935913484278, + "learning_rate": 5.895116813317059e-07, + "loss": 0.3497, + "step": 8719 + }, + { + "epoch": 0.848661800486618, + "grad_norm": 1.514094258104269, + "learning_rate": 5.887694264337745e-07, + "loss": 0.3482, + "step": 8720 + }, + { + "epoch": 0.8487591240875912, + "grad_norm": 1.6652726622011444, + "learning_rate": 5.88027609882536e-07, + "loss": 0.4366, + "step": 8721 + }, + { + "epoch": 0.8488564476885645, + "grad_norm": 1.763051478365167, + "learning_rate": 5.872862317517053e-07, + "loss": 0.2143, + "step": 8722 + }, + { + "epoch": 0.8489537712895378, + "grad_norm": 1.610683686827001, + "learning_rate": 5.865452921149551e-07, + "loss": 0.3979, + "step": 8723 + }, + { + "epoch": 0.8490510948905109, + "grad_norm": 1.5467193216919528, + "learning_rate": 5.858047910459109e-07, + "loss": 0.3871, + "step": 8724 + }, + { + "epoch": 0.8491484184914841, + "grad_norm": 2.1120888665460997, + "learning_rate": 5.850647286181576e-07, + "loss": 0.5089, + "step": 8725 + }, + { + "epoch": 0.8492457420924574, + "grad_norm": 2.0670794884799024, + "learning_rate": 5.843251049052356e-07, + "loss": 0.4069, + "step": 8726 + }, + { + "epoch": 0.8493430656934307, + "grad_norm": 1.7292601625221924, + "learning_rate": 5.835859199806431e-07, + "loss": 0.3312, + "step": 8727 + }, + { + "epoch": 0.8494403892944039, + "grad_norm": 1.493750427756787, + "learning_rate": 5.828471739178321e-07, + "loss": 0.3598, + "step": 8728 + }, + { + "epoch": 0.8495377128953772, + "grad_norm": 2.0665294704223367, + "learning_rate": 5.821088667902119e-07, + "loss": 0.3139, + "step": 8729 + }, + { + "epoch": 0.8496350364963504, + "grad_norm": 1.8152139011161796, + "learning_rate": 5.813709986711503e-07, + "loss": 0.4639, + "step": 8730 + }, + { + "epoch": 0.8497323600973236, + "grad_norm": 1.5784778766984866, + "learning_rate": 5.806335696339671e-07, + "loss": 0.3695, + "step": 8731 + }, + { + "epoch": 0.8498296836982968, + "grad_norm": 1.5149004275897062, + "learning_rate": 5.798965797519429e-07, + "loss": 0.2947, + "step": 8732 + }, + { + "epoch": 0.8499270072992701, + "grad_norm": 2.2222022479537156, + "learning_rate": 5.791600290983124e-07, + "loss": 0.2237, + "step": 8733 + }, + { + "epoch": 0.8500243309002433, + "grad_norm": 1.704617187840155, + "learning_rate": 5.784239177462675e-07, + "loss": 0.3286, + "step": 8734 + }, + { + "epoch": 0.8501216545012166, + "grad_norm": 1.5110543143749702, + "learning_rate": 5.776882457689547e-07, + "loss": 0.3977, + "step": 8735 + }, + { + "epoch": 0.8502189781021898, + "grad_norm": 1.9781079187604855, + "learning_rate": 5.769530132394796e-07, + "loss": 0.5381, + "step": 8736 + }, + { + "epoch": 0.850316301703163, + "grad_norm": 1.382161994990279, + "learning_rate": 5.762182202309008e-07, + "loss": 0.2159, + "step": 8737 + }, + { + "epoch": 0.8504136253041362, + "grad_norm": 1.665055386191977, + "learning_rate": 5.754838668162388e-07, + "loss": 0.4189, + "step": 8738 + }, + { + "epoch": 0.8505109489051095, + "grad_norm": 1.8400309927030214, + "learning_rate": 5.747499530684636e-07, + "loss": 0.568, + "step": 8739 + }, + { + "epoch": 0.8506082725060827, + "grad_norm": 1.9776062151021112, + "learning_rate": 5.740164790605051e-07, + "loss": 0.407, + "step": 8740 + }, + { + "epoch": 0.850705596107056, + "grad_norm": 1.6624480103129264, + "learning_rate": 5.732834448652508e-07, + "loss": 0.4396, + "step": 8741 + }, + { + "epoch": 0.8508029197080292, + "grad_norm": 1.9496439926690807, + "learning_rate": 5.725508505555405e-07, + "loss": 0.2231, + "step": 8742 + }, + { + "epoch": 0.8509002433090024, + "grad_norm": 1.6989128451296964, + "learning_rate": 5.718186962041733e-07, + "loss": 0.3906, + "step": 8743 + }, + { + "epoch": 0.8509975669099756, + "grad_norm": 1.718645959652932, + "learning_rate": 5.710869818839038e-07, + "loss": 0.4678, + "step": 8744 + }, + { + "epoch": 0.8510948905109489, + "grad_norm": 1.4202276894875634, + "learning_rate": 5.703557076674448e-07, + "loss": 0.2452, + "step": 8745 + }, + { + "epoch": 0.8511922141119221, + "grad_norm": 1.5988122167105736, + "learning_rate": 5.696248736274601e-07, + "loss": 0.3634, + "step": 8746 + }, + { + "epoch": 0.8512895377128954, + "grad_norm": 1.559093576849084, + "learning_rate": 5.688944798365742e-07, + "loss": 0.3892, + "step": 8747 + }, + { + "epoch": 0.8513868613138686, + "grad_norm": 1.9819888689669738, + "learning_rate": 5.681645263673685e-07, + "loss": 0.5602, + "step": 8748 + }, + { + "epoch": 0.8514841849148419, + "grad_norm": 1.3611532835146964, + "learning_rate": 5.67435013292379e-07, + "loss": 0.2401, + "step": 8749 + }, + { + "epoch": 0.851581508515815, + "grad_norm": 1.936111567986883, + "learning_rate": 5.667059406840952e-07, + "loss": 0.3121, + "step": 8750 + }, + { + "epoch": 0.8516788321167883, + "grad_norm": 1.4310907033701892, + "learning_rate": 5.659773086149672e-07, + "loss": 0.2085, + "step": 8751 + }, + { + "epoch": 0.8517761557177616, + "grad_norm": 1.6728361700255066, + "learning_rate": 5.652491171574004e-07, + "loss": 0.4861, + "step": 8752 + }, + { + "epoch": 0.8518734793187348, + "grad_norm": 1.7799211866096747, + "learning_rate": 5.645213663837534e-07, + "loss": 0.2781, + "step": 8753 + }, + { + "epoch": 0.8519708029197081, + "grad_norm": 1.5273559257626894, + "learning_rate": 5.637940563663452e-07, + "loss": 0.3808, + "step": 8754 + }, + { + "epoch": 0.8520681265206813, + "grad_norm": 2.0963925439074225, + "learning_rate": 5.630671871774479e-07, + "loss": 0.5135, + "step": 8755 + }, + { + "epoch": 0.8521654501216545, + "grad_norm": 1.5693254547623599, + "learning_rate": 5.623407588892921e-07, + "loss": 0.3833, + "step": 8756 + }, + { + "epoch": 0.8522627737226277, + "grad_norm": 1.605303159467135, + "learning_rate": 5.616147715740611e-07, + "loss": 0.2691, + "step": 8757 + }, + { + "epoch": 0.852360097323601, + "grad_norm": 1.8627970644204648, + "learning_rate": 5.608892253038989e-07, + "loss": 0.4211, + "step": 8758 + }, + { + "epoch": 0.8524574209245742, + "grad_norm": 1.508663668560556, + "learning_rate": 5.601641201509044e-07, + "loss": 0.3778, + "step": 8759 + }, + { + "epoch": 0.8525547445255475, + "grad_norm": 1.713970405100551, + "learning_rate": 5.594394561871286e-07, + "loss": 0.4704, + "step": 8760 + }, + { + "epoch": 0.8526520681265207, + "grad_norm": 1.453235991146147, + "learning_rate": 5.587152334845841e-07, + "loss": 0.3942, + "step": 8761 + }, + { + "epoch": 0.8527493917274939, + "grad_norm": 1.538184830650136, + "learning_rate": 5.579914521152363e-07, + "loss": 0.4101, + "step": 8762 + }, + { + "epoch": 0.8528467153284671, + "grad_norm": 1.5492490795281115, + "learning_rate": 5.572681121510092e-07, + "loss": 0.2211, + "step": 8763 + }, + { + "epoch": 0.8529440389294404, + "grad_norm": 1.7447765596409501, + "learning_rate": 5.565452136637795e-07, + "loss": 0.3335, + "step": 8764 + }, + { + "epoch": 0.8530413625304136, + "grad_norm": 1.7118736736714468, + "learning_rate": 5.558227567253832e-07, + "loss": 0.5926, + "step": 8765 + }, + { + "epoch": 0.8531386861313869, + "grad_norm": 1.6035237283072123, + "learning_rate": 5.551007414076109e-07, + "loss": 0.3877, + "step": 8766 + }, + { + "epoch": 0.8532360097323601, + "grad_norm": 1.6339025190124201, + "learning_rate": 5.543791677822102e-07, + "loss": 0.3642, + "step": 8767 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 1.6292867604876653, + "learning_rate": 5.536580359208843e-07, + "loss": 0.3324, + "step": 8768 + }, + { + "epoch": 0.8534306569343065, + "grad_norm": 3.4952030006142603, + "learning_rate": 5.529373458952919e-07, + "loss": 0.4137, + "step": 8769 + }, + { + "epoch": 0.8535279805352798, + "grad_norm": 1.778383324065555, + "learning_rate": 5.522170977770497e-07, + "loss": 0.4613, + "step": 8770 + }, + { + "epoch": 0.853625304136253, + "grad_norm": 1.7178584474106025, + "learning_rate": 5.514972916377276e-07, + "loss": 0.4278, + "step": 8771 + }, + { + "epoch": 0.8537226277372263, + "grad_norm": 1.9528528863176984, + "learning_rate": 5.507779275488539e-07, + "loss": 0.4395, + "step": 8772 + }, + { + "epoch": 0.8538199513381995, + "grad_norm": 1.3673889447714809, + "learning_rate": 5.500590055819127e-07, + "loss": 0.3406, + "step": 8773 + }, + { + "epoch": 0.8539172749391728, + "grad_norm": 2.620482599495798, + "learning_rate": 5.493405258083434e-07, + "loss": 0.4559, + "step": 8774 + }, + { + "epoch": 0.8540145985401459, + "grad_norm": 1.70972542527029, + "learning_rate": 5.486224882995411e-07, + "loss": 0.595, + "step": 8775 + }, + { + "epoch": 0.8541119221411192, + "grad_norm": 1.4870068157273293, + "learning_rate": 5.479048931268583e-07, + "loss": 0.2856, + "step": 8776 + }, + { + "epoch": 0.8542092457420924, + "grad_norm": 1.5594555759444115, + "learning_rate": 5.471877403616027e-07, + "loss": 0.2355, + "step": 8777 + }, + { + "epoch": 0.8543065693430657, + "grad_norm": 1.7824291542601813, + "learning_rate": 5.464710300750381e-07, + "loss": 0.5072, + "step": 8778 + }, + { + "epoch": 0.854403892944039, + "grad_norm": 1.9102795644947803, + "learning_rate": 5.457547623383846e-07, + "loss": 0.5419, + "step": 8779 + }, + { + "epoch": 0.8545012165450122, + "grad_norm": 1.8185255587283928, + "learning_rate": 5.450389372228182e-07, + "loss": 0.403, + "step": 8780 + }, + { + "epoch": 0.8545985401459854, + "grad_norm": 1.6599048379255443, + "learning_rate": 5.443235547994719e-07, + "loss": 0.307, + "step": 8781 + }, + { + "epoch": 0.8546958637469586, + "grad_norm": 1.593288878508357, + "learning_rate": 5.436086151394316e-07, + "loss": 0.3563, + "step": 8782 + }, + { + "epoch": 0.8547931873479319, + "grad_norm": 1.5899535195136578, + "learning_rate": 5.42894118313742e-07, + "loss": 0.3852, + "step": 8783 + }, + { + "epoch": 0.8548905109489051, + "grad_norm": 2.1571189124764256, + "learning_rate": 5.421800643934039e-07, + "loss": 0.3865, + "step": 8784 + }, + { + "epoch": 0.8549878345498784, + "grad_norm": 1.4354175742948825, + "learning_rate": 5.414664534493735e-07, + "loss": 0.3112, + "step": 8785 + }, + { + "epoch": 0.8550851581508516, + "grad_norm": 1.8111215150699422, + "learning_rate": 5.407532855525615e-07, + "loss": 0.2408, + "step": 8786 + }, + { + "epoch": 0.8551824817518248, + "grad_norm": 1.409632789059209, + "learning_rate": 5.400405607738357e-07, + "loss": 0.3922, + "step": 8787 + }, + { + "epoch": 0.855279805352798, + "grad_norm": 1.7251536978098196, + "learning_rate": 5.393282791840216e-07, + "loss": 0.4101, + "step": 8788 + }, + { + "epoch": 0.8553771289537713, + "grad_norm": 1.734118330925152, + "learning_rate": 5.386164408538975e-07, + "loss": 0.3072, + "step": 8789 + }, + { + "epoch": 0.8554744525547445, + "grad_norm": 1.360279440080453, + "learning_rate": 5.379050458541995e-07, + "loss": 0.3679, + "step": 8790 + }, + { + "epoch": 0.8555717761557178, + "grad_norm": 1.7776093336943595, + "learning_rate": 5.371940942556203e-07, + "loss": 0.3874, + "step": 8791 + }, + { + "epoch": 0.855669099756691, + "grad_norm": 1.5578960491950855, + "learning_rate": 5.364835861288081e-07, + "loss": 0.2533, + "step": 8792 + }, + { + "epoch": 0.8557664233576643, + "grad_norm": 1.8539073276958717, + "learning_rate": 5.357735215443644e-07, + "loss": 0.3481, + "step": 8793 + }, + { + "epoch": 0.8558637469586374, + "grad_norm": 1.617676591858573, + "learning_rate": 5.350639005728492e-07, + "loss": 0.4833, + "step": 8794 + }, + { + "epoch": 0.8559610705596107, + "grad_norm": 1.6717777402164158, + "learning_rate": 5.343547232847801e-07, + "loss": 0.3129, + "step": 8795 + }, + { + "epoch": 0.8560583941605839, + "grad_norm": 1.5593040587776128, + "learning_rate": 5.336459897506258e-07, + "loss": 0.3677, + "step": 8796 + }, + { + "epoch": 0.8561557177615572, + "grad_norm": 1.7066309129923647, + "learning_rate": 5.329377000408148e-07, + "loss": 0.4511, + "step": 8797 + }, + { + "epoch": 0.8562530413625304, + "grad_norm": 1.7546439541638608, + "learning_rate": 5.322298542257304e-07, + "loss": 0.4548, + "step": 8798 + }, + { + "epoch": 0.8563503649635037, + "grad_norm": 1.6513436539663309, + "learning_rate": 5.315224523757112e-07, + "loss": 0.5255, + "step": 8799 + }, + { + "epoch": 0.8564476885644768, + "grad_norm": 1.6658667870209818, + "learning_rate": 5.308154945610522e-07, + "loss": 0.4809, + "step": 8800 + }, + { + "epoch": 0.8565450121654501, + "grad_norm": 1.788342993367399, + "learning_rate": 5.301089808520049e-07, + "loss": 0.4374, + "step": 8801 + }, + { + "epoch": 0.8566423357664233, + "grad_norm": 1.574741324117876, + "learning_rate": 5.294029113187754e-07, + "loss": 0.3136, + "step": 8802 + }, + { + "epoch": 0.8567396593673966, + "grad_norm": 1.729328270526574, + "learning_rate": 5.286972860315276e-07, + "loss": 0.4465, + "step": 8803 + }, + { + "epoch": 0.8568369829683699, + "grad_norm": 1.8085131369765666, + "learning_rate": 5.279921050603781e-07, + "loss": 0.3461, + "step": 8804 + }, + { + "epoch": 0.8569343065693431, + "grad_norm": 1.697755450998722, + "learning_rate": 5.272873684754015e-07, + "loss": 0.4831, + "step": 8805 + }, + { + "epoch": 0.8570316301703162, + "grad_norm": 2.452050526501316, + "learning_rate": 5.265830763466289e-07, + "loss": 0.2705, + "step": 8806 + }, + { + "epoch": 0.8571289537712895, + "grad_norm": 1.6111111951513744, + "learning_rate": 5.25879228744045e-07, + "loss": 0.4544, + "step": 8807 + }, + { + "epoch": 0.8572262773722628, + "grad_norm": 2.3165694849867924, + "learning_rate": 5.251758257375922e-07, + "loss": 0.3849, + "step": 8808 + }, + { + "epoch": 0.857323600973236, + "grad_norm": 1.3551324872493675, + "learning_rate": 5.244728673971678e-07, + "loss": 0.2982, + "step": 8809 + }, + { + "epoch": 0.8574209245742093, + "grad_norm": 1.7507024445133026, + "learning_rate": 5.237703537926253e-07, + "loss": 0.2792, + "step": 8810 + }, + { + "epoch": 0.8575182481751825, + "grad_norm": 1.4933879876495202, + "learning_rate": 5.230682849937746e-07, + "loss": 0.4945, + "step": 8811 + }, + { + "epoch": 0.8576155717761558, + "grad_norm": 1.8396207184267073, + "learning_rate": 5.223666610703798e-07, + "loss": 0.3295, + "step": 8812 + }, + { + "epoch": 0.8577128953771289, + "grad_norm": 1.6294573027877337, + "learning_rate": 5.21665482092163e-07, + "loss": 0.2098, + "step": 8813 + }, + { + "epoch": 0.8578102189781022, + "grad_norm": 1.9068993885002294, + "learning_rate": 5.209647481287989e-07, + "loss": 0.6018, + "step": 8814 + }, + { + "epoch": 0.8579075425790754, + "grad_norm": 1.4885911994240344, + "learning_rate": 5.202644592499212e-07, + "loss": 0.3148, + "step": 8815 + }, + { + "epoch": 0.8580048661800487, + "grad_norm": 1.549705458927577, + "learning_rate": 5.195646155251172e-07, + "loss": 0.4297, + "step": 8816 + }, + { + "epoch": 0.8581021897810219, + "grad_norm": 1.6434307266969113, + "learning_rate": 5.188652170239322e-07, + "loss": 0.3301, + "step": 8817 + }, + { + "epoch": 0.8581995133819952, + "grad_norm": 1.6288138164393307, + "learning_rate": 5.181662638158641e-07, + "loss": 0.3955, + "step": 8818 + }, + { + "epoch": 0.8582968369829683, + "grad_norm": 1.8563703093410087, + "learning_rate": 5.174677559703695e-07, + "loss": 0.4389, + "step": 8819 + }, + { + "epoch": 0.8583941605839416, + "grad_norm": 1.7806108900898978, + "learning_rate": 5.167696935568583e-07, + "loss": 0.3724, + "step": 8820 + }, + { + "epoch": 0.8584914841849148, + "grad_norm": 1.4827520231529974, + "learning_rate": 5.160720766446992e-07, + "loss": 0.4275, + "step": 8821 + }, + { + "epoch": 0.8585888077858881, + "grad_norm": 1.624551124198022, + "learning_rate": 5.153749053032131e-07, + "loss": 0.4574, + "step": 8822 + }, + { + "epoch": 0.8586861313868613, + "grad_norm": 1.7110760776365506, + "learning_rate": 5.146781796016798e-07, + "loss": 0.3797, + "step": 8823 + }, + { + "epoch": 0.8587834549878346, + "grad_norm": 1.7607856529316872, + "learning_rate": 5.139818996093332e-07, + "loss": 0.5163, + "step": 8824 + }, + { + "epoch": 0.8588807785888077, + "grad_norm": 1.5502438322675702, + "learning_rate": 5.132860653953614e-07, + "loss": 0.4569, + "step": 8825 + }, + { + "epoch": 0.858978102189781, + "grad_norm": 1.6628863695953597, + "learning_rate": 5.125906770289113e-07, + "loss": 0.4094, + "step": 8826 + }, + { + "epoch": 0.8590754257907542, + "grad_norm": 1.8810469711093158, + "learning_rate": 5.118957345790842e-07, + "loss": 0.4445, + "step": 8827 + }, + { + "epoch": 0.8591727493917275, + "grad_norm": 1.625872817753556, + "learning_rate": 5.112012381149373e-07, + "loss": 0.3748, + "step": 8828 + }, + { + "epoch": 0.8592700729927008, + "grad_norm": 1.8194629176450128, + "learning_rate": 5.105071877054812e-07, + "loss": 0.5056, + "step": 8829 + }, + { + "epoch": 0.859367396593674, + "grad_norm": 1.631774791619786, + "learning_rate": 5.098135834196855e-07, + "loss": 0.4232, + "step": 8830 + }, + { + "epoch": 0.8594647201946471, + "grad_norm": 1.7693335820628846, + "learning_rate": 5.091204253264736e-07, + "loss": 0.4588, + "step": 8831 + }, + { + "epoch": 0.8595620437956204, + "grad_norm": 1.9853628508194825, + "learning_rate": 5.084277134947257e-07, + "loss": 0.46, + "step": 8832 + }, + { + "epoch": 0.8596593673965937, + "grad_norm": 1.5574878411713937, + "learning_rate": 5.077354479932766e-07, + "loss": 0.3631, + "step": 8833 + }, + { + "epoch": 0.8597566909975669, + "grad_norm": 1.4147611967844615, + "learning_rate": 5.070436288909169e-07, + "loss": 0.297, + "step": 8834 + }, + { + "epoch": 0.8598540145985402, + "grad_norm": 1.5876895641176385, + "learning_rate": 5.063522562563949e-07, + "loss": 0.4344, + "step": 8835 + }, + { + "epoch": 0.8599513381995134, + "grad_norm": 1.4038271058562009, + "learning_rate": 5.056613301584096e-07, + "loss": 0.2762, + "step": 8836 + }, + { + "epoch": 0.8600486618004867, + "grad_norm": 1.3463863415688844, + "learning_rate": 5.049708506656209e-07, + "loss": 0.1981, + "step": 8837 + }, + { + "epoch": 0.8601459854014598, + "grad_norm": 2.091644253047093, + "learning_rate": 5.042808178466413e-07, + "loss": 0.3913, + "step": 8838 + }, + { + "epoch": 0.8602433090024331, + "grad_norm": 1.8954314619878783, + "learning_rate": 5.035912317700415e-07, + "loss": 0.4543, + "step": 8839 + }, + { + "epoch": 0.8603406326034063, + "grad_norm": 1.5161312870055996, + "learning_rate": 5.029020925043432e-07, + "loss": 0.2705, + "step": 8840 + }, + { + "epoch": 0.8604379562043796, + "grad_norm": 1.4699045878216703, + "learning_rate": 5.022134001180285e-07, + "loss": 0.316, + "step": 8841 + }, + { + "epoch": 0.8605352798053528, + "grad_norm": 1.6756045375031288, + "learning_rate": 5.015251546795325e-07, + "loss": 0.5622, + "step": 8842 + }, + { + "epoch": 0.8606326034063261, + "grad_norm": 2.4007598866071738, + "learning_rate": 5.008373562572472e-07, + "loss": 0.2828, + "step": 8843 + }, + { + "epoch": 0.8607299270072992, + "grad_norm": 1.5554936298275484, + "learning_rate": 5.001500049195191e-07, + "loss": 0.3383, + "step": 8844 + }, + { + "epoch": 0.8608272506082725, + "grad_norm": 1.838720412026003, + "learning_rate": 4.994631007346506e-07, + "loss": 0.376, + "step": 8845 + }, + { + "epoch": 0.8609245742092457, + "grad_norm": 1.5726990286044449, + "learning_rate": 4.987766437709013e-07, + "loss": 0.3872, + "step": 8846 + }, + { + "epoch": 0.861021897810219, + "grad_norm": 1.7566867868958993, + "learning_rate": 4.980906340964825e-07, + "loss": 0.6055, + "step": 8847 + }, + { + "epoch": 0.8611192214111922, + "grad_norm": 2.0135404468051816, + "learning_rate": 4.974050717795642e-07, + "loss": 0.3014, + "step": 8848 + }, + { + "epoch": 0.8612165450121655, + "grad_norm": 1.7094294900246594, + "learning_rate": 4.96719956888273e-07, + "loss": 0.2878, + "step": 8849 + }, + { + "epoch": 0.8613138686131386, + "grad_norm": 1.6292821509885596, + "learning_rate": 4.960352894906862e-07, + "loss": 0.32, + "step": 8850 + }, + { + "epoch": 0.8614111922141119, + "grad_norm": 1.323582138931058, + "learning_rate": 4.953510696548414e-07, + "loss": 0.3781, + "step": 8851 + }, + { + "epoch": 0.8615085158150851, + "grad_norm": 1.3328777319629628, + "learning_rate": 4.946672974487293e-07, + "loss": 0.2612, + "step": 8852 + }, + { + "epoch": 0.8616058394160584, + "grad_norm": 1.7896066695954034, + "learning_rate": 4.939839729402967e-07, + "loss": 0.4514, + "step": 8853 + }, + { + "epoch": 0.8617031630170316, + "grad_norm": 1.5447798131932555, + "learning_rate": 4.933010961974471e-07, + "loss": 0.4052, + "step": 8854 + }, + { + "epoch": 0.8618004866180049, + "grad_norm": 1.3919017802532592, + "learning_rate": 4.926186672880373e-07, + "loss": 0.2708, + "step": 8855 + }, + { + "epoch": 0.8618978102189782, + "grad_norm": 1.5876418853791705, + "learning_rate": 4.919366862798807e-07, + "loss": 0.2986, + "step": 8856 + }, + { + "epoch": 0.8619951338199513, + "grad_norm": 2.05972345903319, + "learning_rate": 4.912551532407478e-07, + "loss": 0.3959, + "step": 8857 + }, + { + "epoch": 0.8620924574209246, + "grad_norm": 2.122483614801037, + "learning_rate": 4.905740682383603e-07, + "loss": 0.3101, + "step": 8858 + }, + { + "epoch": 0.8621897810218978, + "grad_norm": 1.8614697877787294, + "learning_rate": 4.898934313403997e-07, + "loss": 0.4581, + "step": 8859 + }, + { + "epoch": 0.8622871046228711, + "grad_norm": 1.5307685718048352, + "learning_rate": 4.892132426145018e-07, + "loss": 0.2894, + "step": 8860 + }, + { + "epoch": 0.8623844282238443, + "grad_norm": 1.4754504501983794, + "learning_rate": 4.885335021282556e-07, + "loss": 0.4573, + "step": 8861 + }, + { + "epoch": 0.8624817518248176, + "grad_norm": 2.008702655986808, + "learning_rate": 4.878542099492078e-07, + "loss": 0.3083, + "step": 8862 + }, + { + "epoch": 0.8625790754257907, + "grad_norm": 1.4899670922402488, + "learning_rate": 4.871753661448613e-07, + "loss": 0.3339, + "step": 8863 + }, + { + "epoch": 0.862676399026764, + "grad_norm": 1.5835142366897343, + "learning_rate": 4.864969707826722e-07, + "loss": 0.3481, + "step": 8864 + }, + { + "epoch": 0.8627737226277372, + "grad_norm": 1.5409403542047546, + "learning_rate": 4.858190239300531e-07, + "loss": 0.4016, + "step": 8865 + }, + { + "epoch": 0.8628710462287105, + "grad_norm": 1.3922957766777282, + "learning_rate": 4.851415256543723e-07, + "loss": 0.456, + "step": 8866 + }, + { + "epoch": 0.8629683698296837, + "grad_norm": 1.9443029518720707, + "learning_rate": 4.844644760229545e-07, + "loss": 0.479, + "step": 8867 + }, + { + "epoch": 0.863065693430657, + "grad_norm": 1.6571217527948316, + "learning_rate": 4.837878751030755e-07, + "loss": 0.4101, + "step": 8868 + }, + { + "epoch": 0.8631630170316301, + "grad_norm": 1.6451678640872458, + "learning_rate": 4.83111722961972e-07, + "loss": 0.2689, + "step": 8869 + }, + { + "epoch": 0.8632603406326034, + "grad_norm": 1.5876491686806944, + "learning_rate": 4.82436019666832e-07, + "loss": 0.3487, + "step": 8870 + }, + { + "epoch": 0.8633576642335766, + "grad_norm": 1.374768931306892, + "learning_rate": 4.817607652848034e-07, + "loss": 0.4002, + "step": 8871 + }, + { + "epoch": 0.8634549878345499, + "grad_norm": 1.889150817169666, + "learning_rate": 4.810859598829831e-07, + "loss": 0.2541, + "step": 8872 + }, + { + "epoch": 0.8635523114355231, + "grad_norm": 1.601903590655278, + "learning_rate": 4.80411603528429e-07, + "loss": 0.4019, + "step": 8873 + }, + { + "epoch": 0.8636496350364964, + "grad_norm": 1.5708104193489283, + "learning_rate": 4.797376962881512e-07, + "loss": 0.3565, + "step": 8874 + }, + { + "epoch": 0.8637469586374696, + "grad_norm": 1.4626595858370934, + "learning_rate": 4.790642382291172e-07, + "loss": 0.236, + "step": 8875 + }, + { + "epoch": 0.8638442822384428, + "grad_norm": 1.7937928011901723, + "learning_rate": 4.783912294182486e-07, + "loss": 0.2784, + "step": 8876 + }, + { + "epoch": 0.863941605839416, + "grad_norm": 1.8855765540120717, + "learning_rate": 4.777186699224229e-07, + "loss": 0.4643, + "step": 8877 + }, + { + "epoch": 0.8640389294403893, + "grad_norm": 1.3613127128790408, + "learning_rate": 4.770465598084733e-07, + "loss": 0.2862, + "step": 8878 + }, + { + "epoch": 0.8641362530413625, + "grad_norm": 1.815144221458172, + "learning_rate": 4.76374899143186e-07, + "loss": 0.5712, + "step": 8879 + }, + { + "epoch": 0.8642335766423358, + "grad_norm": 1.5180523498414442, + "learning_rate": 4.757036879933058e-07, + "loss": 0.3949, + "step": 8880 + }, + { + "epoch": 0.864330900243309, + "grad_norm": 1.5832849127074788, + "learning_rate": 4.75032926425531e-07, + "loss": 0.2299, + "step": 8881 + }, + { + "epoch": 0.8644282238442822, + "grad_norm": 2.27221248171749, + "learning_rate": 4.743626145065161e-07, + "loss": 0.3475, + "step": 8882 + }, + { + "epoch": 0.8645255474452554, + "grad_norm": 1.4168086915723712, + "learning_rate": 4.736927523028695e-07, + "loss": 0.3401, + "step": 8883 + }, + { + "epoch": 0.8646228710462287, + "grad_norm": 1.7037658185777274, + "learning_rate": 4.730233398811557e-07, + "loss": 0.3775, + "step": 8884 + }, + { + "epoch": 0.864720194647202, + "grad_norm": 1.6037149990719333, + "learning_rate": 4.7235437730789427e-07, + "loss": 0.2944, + "step": 8885 + }, + { + "epoch": 0.8648175182481752, + "grad_norm": 1.8002481395498335, + "learning_rate": 4.7168586464956314e-07, + "loss": 0.398, + "step": 8886 + }, + { + "epoch": 0.8649148418491485, + "grad_norm": 1.6114134239969953, + "learning_rate": 4.710178019725903e-07, + "loss": 0.1798, + "step": 8887 + }, + { + "epoch": 0.8650121654501216, + "grad_norm": 1.369967962990285, + "learning_rate": 4.703501893433621e-07, + "loss": 0.2115, + "step": 8888 + }, + { + "epoch": 0.8651094890510949, + "grad_norm": 1.9158941730110601, + "learning_rate": 4.696830268282204e-07, + "loss": 0.395, + "step": 8889 + }, + { + "epoch": 0.8652068126520681, + "grad_norm": 1.605323655006366, + "learning_rate": 4.690163144934601e-07, + "loss": 0.3242, + "step": 8890 + }, + { + "epoch": 0.8653041362530414, + "grad_norm": 1.544206417427814, + "learning_rate": 4.683500524053336e-07, + "loss": 0.2019, + "step": 8891 + }, + { + "epoch": 0.8654014598540146, + "grad_norm": 1.6877915872211626, + "learning_rate": 4.6768424063004803e-07, + "loss": 0.4207, + "step": 8892 + }, + { + "epoch": 0.8654987834549879, + "grad_norm": 1.5856595101095103, + "learning_rate": 4.6701887923376597e-07, + "loss": 0.5634, + "step": 8893 + }, + { + "epoch": 0.865596107055961, + "grad_norm": 2.1311263241499465, + "learning_rate": 4.6635396828260295e-07, + "loss": 0.395, + "step": 8894 + }, + { + "epoch": 0.8656934306569343, + "grad_norm": 1.7211334348614997, + "learning_rate": 4.6568950784263213e-07, + "loss": 0.2159, + "step": 8895 + }, + { + "epoch": 0.8657907542579075, + "grad_norm": 1.458726403079041, + "learning_rate": 4.650254979798835e-07, + "loss": 0.3708, + "step": 8896 + }, + { + "epoch": 0.8658880778588808, + "grad_norm": 1.8062718690386481, + "learning_rate": 4.6436193876033764e-07, + "loss": 0.2131, + "step": 8897 + }, + { + "epoch": 0.865985401459854, + "grad_norm": 1.6831953457971054, + "learning_rate": 4.63698830249934e-07, + "loss": 0.4651, + "step": 8898 + }, + { + "epoch": 0.8660827250608273, + "grad_norm": 1.682454655391922, + "learning_rate": 4.6303617251456545e-07, + "loss": 0.3266, + "step": 8899 + }, + { + "epoch": 0.8661800486618005, + "grad_norm": 1.593468884377645, + "learning_rate": 4.623739656200821e-07, + "loss": 0.5092, + "step": 8900 + }, + { + "epoch": 0.8662773722627737, + "grad_norm": 1.7189998098376638, + "learning_rate": 4.6171220963228626e-07, + "loss": 0.4034, + "step": 8901 + }, + { + "epoch": 0.8663746958637469, + "grad_norm": 1.8319686521071987, + "learning_rate": 4.6105090461693704e-07, + "loss": 0.3046, + "step": 8902 + }, + { + "epoch": 0.8664720194647202, + "grad_norm": 1.5358108018094117, + "learning_rate": 4.603900506397496e-07, + "loss": 0.456, + "step": 8903 + }, + { + "epoch": 0.8665693430656934, + "grad_norm": 1.7001582913473696, + "learning_rate": 4.597296477663943e-07, + "loss": 0.2564, + "step": 8904 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.699103691327235, + "learning_rate": 4.590696960624935e-07, + "loss": 0.3841, + "step": 8905 + }, + { + "epoch": 0.86676399026764, + "grad_norm": 1.7747391952398544, + "learning_rate": 4.5841019559362653e-07, + "loss": 0.3237, + "step": 8906 + }, + { + "epoch": 0.8668613138686131, + "grad_norm": 2.1659120443772113, + "learning_rate": 4.5775114642533257e-07, + "loss": 0.3116, + "step": 8907 + }, + { + "epoch": 0.8669586374695863, + "grad_norm": 1.420910770323411, + "learning_rate": 4.570925486230976e-07, + "loss": 0.3229, + "step": 8908 + }, + { + "epoch": 0.8670559610705596, + "grad_norm": 1.5428201724598205, + "learning_rate": 4.564344022523687e-07, + "loss": 0.3948, + "step": 8909 + }, + { + "epoch": 0.8671532846715329, + "grad_norm": 1.6257649601895192, + "learning_rate": 4.5577670737854574e-07, + "loss": 0.519, + "step": 8910 + }, + { + "epoch": 0.8672506082725061, + "grad_norm": 2.011304497890273, + "learning_rate": 4.5511946406698594e-07, + "loss": 0.2177, + "step": 8911 + }, + { + "epoch": 0.8673479318734794, + "grad_norm": 1.9519404977539276, + "learning_rate": 4.54462672382997e-07, + "loss": 0.3613, + "step": 8912 + }, + { + "epoch": 0.8674452554744525, + "grad_norm": 1.6095422454153265, + "learning_rate": 4.538063323918462e-07, + "loss": 0.374, + "step": 8913 + }, + { + "epoch": 0.8675425790754258, + "grad_norm": 1.6582896790757047, + "learning_rate": 4.5315044415875587e-07, + "loss": 0.4024, + "step": 8914 + }, + { + "epoch": 0.867639902676399, + "grad_norm": 2.232463846287704, + "learning_rate": 4.524950077488993e-07, + "loss": 0.3384, + "step": 8915 + }, + { + "epoch": 0.8677372262773723, + "grad_norm": 1.5888208946419526, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.3535, + "step": 8916 + }, + { + "epoch": 0.8678345498783455, + "grad_norm": 1.8929861158455272, + "learning_rate": 4.511854906593699e-07, + "loss": 0.356, + "step": 8917 + }, + { + "epoch": 0.8679318734793188, + "grad_norm": 1.7330009584991042, + "learning_rate": 4.505314101098268e-07, + "loss": 0.3564, + "step": 8918 + }, + { + "epoch": 0.868029197080292, + "grad_norm": 1.7609882066495335, + "learning_rate": 4.4987778164377273e-07, + "loss": 0.3604, + "step": 8919 + }, + { + "epoch": 0.8681265206812652, + "grad_norm": 1.6260007564359926, + "learning_rate": 4.4922460532616007e-07, + "loss": 0.2456, + "step": 8920 + }, + { + "epoch": 0.8682238442822384, + "grad_norm": 1.6417051120393822, + "learning_rate": 4.485718812218959e-07, + "loss": 0.2234, + "step": 8921 + }, + { + "epoch": 0.8683211678832117, + "grad_norm": 1.7136151685477914, + "learning_rate": 4.479196093958421e-07, + "loss": 0.2736, + "step": 8922 + }, + { + "epoch": 0.8684184914841849, + "grad_norm": 1.9294974044264364, + "learning_rate": 4.472677899128136e-07, + "loss": 0.3945, + "step": 8923 + }, + { + "epoch": 0.8685158150851582, + "grad_norm": 1.695651183159963, + "learning_rate": 4.4661642283758356e-07, + "loss": 0.4094, + "step": 8924 + }, + { + "epoch": 0.8686131386861314, + "grad_norm": 1.5936514880083799, + "learning_rate": 4.459655082348785e-07, + "loss": 0.3283, + "step": 8925 + }, + { + "epoch": 0.8687104622871046, + "grad_norm": 1.7419996630759194, + "learning_rate": 4.4531504616937895e-07, + "loss": 0.3819, + "step": 8926 + }, + { + "epoch": 0.8688077858880778, + "grad_norm": 1.6443561089190666, + "learning_rate": 4.4466503670572315e-07, + "loss": 0.5131, + "step": 8927 + }, + { + "epoch": 0.8689051094890511, + "grad_norm": 1.523670041847427, + "learning_rate": 4.440154799085028e-07, + "loss": 0.3626, + "step": 8928 + }, + { + "epoch": 0.8690024330900243, + "grad_norm": 1.3573623314840753, + "learning_rate": 4.433663758422657e-07, + "loss": 0.24, + "step": 8929 + }, + { + "epoch": 0.8690997566909976, + "grad_norm": 2.3699280640191356, + "learning_rate": 4.4271772457151143e-07, + "loss": 0.4307, + "step": 8930 + }, + { + "epoch": 0.8691970802919708, + "grad_norm": 2.5586081002102348, + "learning_rate": 4.420695261606978e-07, + "loss": 0.4638, + "step": 8931 + }, + { + "epoch": 0.869294403892944, + "grad_norm": 1.4219613520632248, + "learning_rate": 4.4142178067423827e-07, + "loss": 0.2435, + "step": 8932 + }, + { + "epoch": 0.8693917274939172, + "grad_norm": 1.8786740228920904, + "learning_rate": 4.407744881764969e-07, + "loss": 0.45, + "step": 8933 + }, + { + "epoch": 0.8694890510948905, + "grad_norm": 1.3218508905326891, + "learning_rate": 4.4012764873179735e-07, + "loss": 0.2877, + "step": 8934 + }, + { + "epoch": 0.8695863746958638, + "grad_norm": 1.5397193397805293, + "learning_rate": 4.394812624044159e-07, + "loss": 0.3395, + "step": 8935 + }, + { + "epoch": 0.869683698296837, + "grad_norm": 1.8906256305283684, + "learning_rate": 4.388353292585845e-07, + "loss": 0.3836, + "step": 8936 + }, + { + "epoch": 0.8697810218978103, + "grad_norm": 1.6108186043376587, + "learning_rate": 4.381898493584902e-07, + "loss": 0.362, + "step": 8937 + }, + { + "epoch": 0.8698783454987834, + "grad_norm": 1.7721774964915464, + "learning_rate": 4.3754482276827435e-07, + "loss": 0.5595, + "step": 8938 + }, + { + "epoch": 0.8699756690997567, + "grad_norm": 1.4978491144064539, + "learning_rate": 4.3690024955203356e-07, + "loss": 0.3859, + "step": 8939 + }, + { + "epoch": 0.8700729927007299, + "grad_norm": 1.612928153276089, + "learning_rate": 4.362561297738205e-07, + "loss": 0.3227, + "step": 8940 + }, + { + "epoch": 0.8701703163017032, + "grad_norm": 1.715721791912206, + "learning_rate": 4.3561246349764055e-07, + "loss": 0.4756, + "step": 8941 + }, + { + "epoch": 0.8702676399026764, + "grad_norm": 1.7570194065887248, + "learning_rate": 4.349692507874553e-07, + "loss": 0.4401, + "step": 8942 + }, + { + "epoch": 0.8703649635036497, + "grad_norm": 1.9226028800789607, + "learning_rate": 4.343264917071821e-07, + "loss": 0.5843, + "step": 8943 + }, + { + "epoch": 0.8704622871046229, + "grad_norm": 1.6968615789567658, + "learning_rate": 4.3368418632069143e-07, + "loss": 0.2729, + "step": 8944 + }, + { + "epoch": 0.8705596107055961, + "grad_norm": 1.6098171478594197, + "learning_rate": 4.3304233469180946e-07, + "loss": 0.2341, + "step": 8945 + }, + { + "epoch": 0.8706569343065693, + "grad_norm": 1.9198327106870343, + "learning_rate": 4.3240093688431737e-07, + "loss": 0.4279, + "step": 8946 + }, + { + "epoch": 0.8707542579075426, + "grad_norm": 1.7286971347142726, + "learning_rate": 4.317599929619526e-07, + "loss": 0.2409, + "step": 8947 + }, + { + "epoch": 0.8708515815085158, + "grad_norm": 1.706900515231185, + "learning_rate": 4.3111950298840466e-07, + "loss": 0.3628, + "step": 8948 + }, + { + "epoch": 0.8709489051094891, + "grad_norm": 2.274531779471638, + "learning_rate": 4.304794670273199e-07, + "loss": 0.3693, + "step": 8949 + }, + { + "epoch": 0.8710462287104623, + "grad_norm": 1.6783718900724887, + "learning_rate": 4.2983988514230026e-07, + "loss": 0.5253, + "step": 8950 + }, + { + "epoch": 0.8711435523114355, + "grad_norm": 1.723869951485436, + "learning_rate": 4.2920075739689926e-07, + "loss": 0.3367, + "step": 8951 + }, + { + "epoch": 0.8712408759124087, + "grad_norm": 1.2567039485649292, + "learning_rate": 4.285620838546284e-07, + "loss": 0.1943, + "step": 8952 + }, + { + "epoch": 0.871338199513382, + "grad_norm": 1.9432092656575435, + "learning_rate": 4.2792386457895297e-07, + "loss": 0.352, + "step": 8953 + }, + { + "epoch": 0.8714355231143552, + "grad_norm": 2.542217934865308, + "learning_rate": 4.272860996332945e-07, + "loss": 0.2756, + "step": 8954 + }, + { + "epoch": 0.8715328467153285, + "grad_norm": 2.126245918785812, + "learning_rate": 4.2664878908102556e-07, + "loss": 0.3924, + "step": 8955 + }, + { + "epoch": 0.8716301703163017, + "grad_norm": 2.5449672176745395, + "learning_rate": 4.2601193298547784e-07, + "loss": 0.2624, + "step": 8956 + }, + { + "epoch": 0.8717274939172749, + "grad_norm": 1.5687978851183497, + "learning_rate": 4.253755314099356e-07, + "loss": 0.2499, + "step": 8957 + }, + { + "epoch": 0.8718248175182481, + "grad_norm": 1.6727921296633723, + "learning_rate": 4.2473958441763886e-07, + "loss": 0.3999, + "step": 8958 + }, + { + "epoch": 0.8719221411192214, + "grad_norm": 1.3142769002470793, + "learning_rate": 4.241040920717815e-07, + "loss": 0.2691, + "step": 8959 + }, + { + "epoch": 0.8720194647201946, + "grad_norm": 1.544120416718892, + "learning_rate": 4.2346905443551356e-07, + "loss": 0.485, + "step": 8960 + }, + { + "epoch": 0.8721167883211679, + "grad_norm": 1.4941659962613072, + "learning_rate": 4.2283447157193904e-07, + "loss": 0.3133, + "step": 8961 + }, + { + "epoch": 0.8722141119221412, + "grad_norm": 1.7334613627442983, + "learning_rate": 4.222003435441158e-07, + "loss": 0.3256, + "step": 8962 + }, + { + "epoch": 0.8723114355231144, + "grad_norm": 1.3375740957681268, + "learning_rate": 4.215666704150578e-07, + "loss": 0.2192, + "step": 8963 + }, + { + "epoch": 0.8724087591240876, + "grad_norm": 1.6136067133730279, + "learning_rate": 4.209334522477343e-07, + "loss": 0.5021, + "step": 8964 + }, + { + "epoch": 0.8725060827250608, + "grad_norm": 1.7940283742083345, + "learning_rate": 4.203006891050687e-07, + "loss": 0.5266, + "step": 8965 + }, + { + "epoch": 0.8726034063260341, + "grad_norm": 1.571516344170533, + "learning_rate": 4.19668381049938e-07, + "loss": 0.508, + "step": 8966 + }, + { + "epoch": 0.8727007299270073, + "grad_norm": 1.7762901453022397, + "learning_rate": 4.190365281451753e-07, + "loss": 0.4659, + "step": 8967 + }, + { + "epoch": 0.8727980535279806, + "grad_norm": 1.6143784751958066, + "learning_rate": 4.184051304535686e-07, + "loss": 0.3301, + "step": 8968 + }, + { + "epoch": 0.8728953771289538, + "grad_norm": 1.7653246598813364, + "learning_rate": 4.1777418803786053e-07, + "loss": 0.5153, + "step": 8969 + }, + { + "epoch": 0.872992700729927, + "grad_norm": 1.359479702830626, + "learning_rate": 4.171437009607471e-07, + "loss": 0.2221, + "step": 8970 + }, + { + "epoch": 0.8730900243309002, + "grad_norm": 1.8800587123468613, + "learning_rate": 4.1651366928488146e-07, + "loss": 0.2931, + "step": 8971 + }, + { + "epoch": 0.8731873479318735, + "grad_norm": 2.185317667514927, + "learning_rate": 4.1588409307287026e-07, + "loss": 0.3675, + "step": 8972 + }, + { + "epoch": 0.8732846715328467, + "grad_norm": 1.8024247421235893, + "learning_rate": 4.1525497238727394e-07, + "loss": 0.3297, + "step": 8973 + }, + { + "epoch": 0.87338199513382, + "grad_norm": 1.49989286675928, + "learning_rate": 4.146263072906087e-07, + "loss": 0.4106, + "step": 8974 + }, + { + "epoch": 0.8734793187347932, + "grad_norm": 1.8581243203601692, + "learning_rate": 4.139980978453456e-07, + "loss": 0.4449, + "step": 8975 + }, + { + "epoch": 0.8735766423357664, + "grad_norm": 1.7414091963334486, + "learning_rate": 4.133703441139114e-07, + "loss": 0.4352, + "step": 8976 + }, + { + "epoch": 0.8736739659367396, + "grad_norm": 1.710119600630878, + "learning_rate": 4.1274304615868454e-07, + "loss": 0.2955, + "step": 8977 + }, + { + "epoch": 0.8737712895377129, + "grad_norm": 1.7094172861284116, + "learning_rate": 4.121162040420007e-07, + "loss": 0.3279, + "step": 8978 + }, + { + "epoch": 0.8738686131386861, + "grad_norm": 1.5530193331289426, + "learning_rate": 4.114898178261495e-07, + "loss": 0.3051, + "step": 8979 + }, + { + "epoch": 0.8739659367396594, + "grad_norm": 1.9659581446378496, + "learning_rate": 4.1086388757337506e-07, + "loss": 0.4733, + "step": 8980 + }, + { + "epoch": 0.8740632603406326, + "grad_norm": 1.6052081785340166, + "learning_rate": 4.1023841334587697e-07, + "loss": 0.3282, + "step": 8981 + }, + { + "epoch": 0.8741605839416058, + "grad_norm": 1.250859441940061, + "learning_rate": 4.09613395205809e-07, + "loss": 0.2084, + "step": 8982 + }, + { + "epoch": 0.874257907542579, + "grad_norm": 1.9399466294881544, + "learning_rate": 4.0898883321528085e-07, + "loss": 0.4795, + "step": 8983 + }, + { + "epoch": 0.8743552311435523, + "grad_norm": 1.3733324863529988, + "learning_rate": 4.0836472743635223e-07, + "loss": 0.3991, + "step": 8984 + }, + { + "epoch": 0.8744525547445255, + "grad_norm": 2.122250854283508, + "learning_rate": 4.0774107793104365e-07, + "loss": 0.3147, + "step": 8985 + }, + { + "epoch": 0.8745498783454988, + "grad_norm": 1.6287325027708741, + "learning_rate": 4.0711788476132773e-07, + "loss": 0.5318, + "step": 8986 + }, + { + "epoch": 0.874647201946472, + "grad_norm": 1.7607649358971411, + "learning_rate": 4.0649514798912936e-07, + "loss": 0.3897, + "step": 8987 + }, + { + "epoch": 0.8747445255474453, + "grad_norm": 1.8907879845852569, + "learning_rate": 4.058728676763313e-07, + "loss": 0.3986, + "step": 8988 + }, + { + "epoch": 0.8748418491484184, + "grad_norm": 1.4544576461253722, + "learning_rate": 4.052510438847707e-07, + "loss": 0.2297, + "step": 8989 + }, + { + "epoch": 0.8749391727493917, + "grad_norm": 1.6972652035834488, + "learning_rate": 4.046296766762375e-07, + "loss": 0.649, + "step": 8990 + }, + { + "epoch": 0.875036496350365, + "grad_norm": 1.2750684906758172, + "learning_rate": 4.040087661124786e-07, + "loss": 0.2968, + "step": 8991 + }, + { + "epoch": 0.8751338199513382, + "grad_norm": 1.678123704281099, + "learning_rate": 4.033883122551929e-07, + "loss": 0.5107, + "step": 8992 + }, + { + "epoch": 0.8752311435523115, + "grad_norm": 1.8621779400201937, + "learning_rate": 4.027683151660361e-07, + "loss": 0.5474, + "step": 8993 + }, + { + "epoch": 0.8753284671532847, + "grad_norm": 1.7158225357484174, + "learning_rate": 4.021487749066183e-07, + "loss": 0.4139, + "step": 8994 + }, + { + "epoch": 0.8754257907542579, + "grad_norm": 1.7209911646596765, + "learning_rate": 4.0152969153850195e-07, + "loss": 0.2804, + "step": 8995 + }, + { + "epoch": 0.8755231143552311, + "grad_norm": 1.6285468786822423, + "learning_rate": 4.0091106512320677e-07, + "loss": 0.4597, + "step": 8996 + }, + { + "epoch": 0.8756204379562044, + "grad_norm": 1.3212230183743494, + "learning_rate": 4.002928957222069e-07, + "loss": 0.2172, + "step": 8997 + }, + { + "epoch": 0.8757177615571776, + "grad_norm": 1.7502469842190336, + "learning_rate": 3.9967518339692815e-07, + "loss": 0.4522, + "step": 8998 + }, + { + "epoch": 0.8758150851581509, + "grad_norm": 2.001885479040049, + "learning_rate": 3.990579282087537e-07, + "loss": 0.4867, + "step": 8999 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 1.7387833116687497, + "learning_rate": 3.984411302190211e-07, + "loss": 0.403, + "step": 9000 + }, + { + "epoch": 0.8760097323600973, + "grad_norm": 1.7465329567251235, + "learning_rate": 3.9782478948902193e-07, + "loss": 0.4453, + "step": 9001 + }, + { + "epoch": 0.8761070559610705, + "grad_norm": 1.5661813474619781, + "learning_rate": 3.972089060800016e-07, + "loss": 0.2124, + "step": 9002 + }, + { + "epoch": 0.8762043795620438, + "grad_norm": 1.4779275709892998, + "learning_rate": 3.965934800531618e-07, + "loss": 0.1739, + "step": 9003 + }, + { + "epoch": 0.876301703163017, + "grad_norm": 1.7624155768337695, + "learning_rate": 3.959785114696574e-07, + "loss": 0.3872, + "step": 9004 + }, + { + "epoch": 0.8763990267639903, + "grad_norm": 1.6617805702607937, + "learning_rate": 3.95364000390599e-07, + "loss": 0.2558, + "step": 9005 + }, + { + "epoch": 0.8764963503649635, + "grad_norm": 1.6745915669339024, + "learning_rate": 3.947499468770494e-07, + "loss": 0.4923, + "step": 9006 + }, + { + "epoch": 0.8765936739659368, + "grad_norm": 1.5825901127570392, + "learning_rate": 3.9413635099002757e-07, + "loss": 0.3163, + "step": 9007 + }, + { + "epoch": 0.8766909975669099, + "grad_norm": 1.9118450165187293, + "learning_rate": 3.935232127905092e-07, + "loss": 0.263, + "step": 9008 + }, + { + "epoch": 0.8767883211678832, + "grad_norm": 1.5191640532794235, + "learning_rate": 3.929105323394189e-07, + "loss": 0.3201, + "step": 9009 + }, + { + "epoch": 0.8768856447688564, + "grad_norm": 1.48726765717403, + "learning_rate": 3.9229830969764126e-07, + "loss": 0.4013, + "step": 9010 + }, + { + "epoch": 0.8769829683698297, + "grad_norm": 1.5018207308155405, + "learning_rate": 3.916865449260127e-07, + "loss": 0.2423, + "step": 9011 + }, + { + "epoch": 0.877080291970803, + "grad_norm": 1.5919584040215649, + "learning_rate": 3.9107523808532445e-07, + "loss": 0.2813, + "step": 9012 + }, + { + "epoch": 0.8771776155717762, + "grad_norm": 1.6678384555786152, + "learning_rate": 3.9046438923632246e-07, + "loss": 0.3986, + "step": 9013 + }, + { + "epoch": 0.8772749391727493, + "grad_norm": 1.5882917697481183, + "learning_rate": 3.8985399843970815e-07, + "loss": 0.342, + "step": 9014 + }, + { + "epoch": 0.8773722627737226, + "grad_norm": 1.991616320539, + "learning_rate": 3.892440657561358e-07, + "loss": 0.57, + "step": 9015 + }, + { + "epoch": 0.8774695863746959, + "grad_norm": 1.6434889727559954, + "learning_rate": 3.8863459124621415e-07, + "loss": 0.3822, + "step": 9016 + }, + { + "epoch": 0.8775669099756691, + "grad_norm": 1.4253547762217462, + "learning_rate": 3.8802557497050695e-07, + "loss": 0.277, + "step": 9017 + }, + { + "epoch": 0.8776642335766424, + "grad_norm": 1.6596928207723018, + "learning_rate": 3.8741701698953363e-07, + "loss": 0.2942, + "step": 9018 + }, + { + "epoch": 0.8777615571776156, + "grad_norm": 1.577602413256697, + "learning_rate": 3.8680891736376735e-07, + "loss": 0.2633, + "step": 9019 + }, + { + "epoch": 0.8778588807785888, + "grad_norm": 1.8126575138445067, + "learning_rate": 3.862012761536332e-07, + "loss": 0.4914, + "step": 9020 + }, + { + "epoch": 0.877956204379562, + "grad_norm": 1.7881954040853976, + "learning_rate": 3.855940934195146e-07, + "loss": 0.4499, + "step": 9021 + }, + { + "epoch": 0.8780535279805353, + "grad_norm": 1.8905545056024078, + "learning_rate": 3.849873692217465e-07, + "loss": 0.4606, + "step": 9022 + }, + { + "epoch": 0.8781508515815085, + "grad_norm": 1.8712435286109956, + "learning_rate": 3.8438110362062076e-07, + "loss": 0.4319, + "step": 9023 + }, + { + "epoch": 0.8782481751824818, + "grad_norm": 1.251000385519162, + "learning_rate": 3.837752966763814e-07, + "loss": 0.2394, + "step": 9024 + }, + { + "epoch": 0.878345498783455, + "grad_norm": 1.8356486641147882, + "learning_rate": 3.831699484492285e-07, + "loss": 0.436, + "step": 9025 + }, + { + "epoch": 0.8784428223844283, + "grad_norm": 1.4085313729234155, + "learning_rate": 3.8256505899931627e-07, + "loss": 0.2685, + "step": 9026 + }, + { + "epoch": 0.8785401459854014, + "grad_norm": 1.4789291065704782, + "learning_rate": 3.819606283867511e-07, + "loss": 0.3948, + "step": 9027 + }, + { + "epoch": 0.8786374695863747, + "grad_norm": 1.5455482264546911, + "learning_rate": 3.8135665667159703e-07, + "loss": 0.3048, + "step": 9028 + }, + { + "epoch": 0.8787347931873479, + "grad_norm": 1.7041611625501667, + "learning_rate": 3.8075314391387066e-07, + "loss": 0.3005, + "step": 9029 + }, + { + "epoch": 0.8788321167883212, + "grad_norm": 1.7839212966502227, + "learning_rate": 3.8015009017354443e-07, + "loss": 0.564, + "step": 9030 + }, + { + "epoch": 0.8789294403892944, + "grad_norm": 1.683037685958326, + "learning_rate": 3.795474955105427e-07, + "loss": 0.633, + "step": 9031 + }, + { + "epoch": 0.8790267639902677, + "grad_norm": 1.5051912440965538, + "learning_rate": 3.7894535998474646e-07, + "loss": 0.3375, + "step": 9032 + }, + { + "epoch": 0.8791240875912408, + "grad_norm": 1.6300292317864338, + "learning_rate": 3.783436836559895e-07, + "loss": 0.3909, + "step": 9033 + }, + { + "epoch": 0.8792214111922141, + "grad_norm": 1.593824497052541, + "learning_rate": 3.7774246658406177e-07, + "loss": 0.3463, + "step": 9034 + }, + { + "epoch": 0.8793187347931873, + "grad_norm": 1.5144489877647738, + "learning_rate": 3.771417088287066e-07, + "loss": 0.3582, + "step": 9035 + }, + { + "epoch": 0.8794160583941606, + "grad_norm": 1.3240601211105127, + "learning_rate": 3.765414104496207e-07, + "loss": 0.2609, + "step": 9036 + }, + { + "epoch": 0.8795133819951338, + "grad_norm": 1.2223600083530024, + "learning_rate": 3.7594157150645736e-07, + "loss": 0.174, + "step": 9037 + }, + { + "epoch": 0.8796107055961071, + "grad_norm": 1.820265347766445, + "learning_rate": 3.753421920588218e-07, + "loss": 0.4641, + "step": 9038 + }, + { + "epoch": 0.8797080291970802, + "grad_norm": 1.630969015307784, + "learning_rate": 3.7474327216627526e-07, + "loss": 0.3386, + "step": 9039 + }, + { + "epoch": 0.8798053527980535, + "grad_norm": 1.302353833393497, + "learning_rate": 3.741448118883323e-07, + "loss": 0.3, + "step": 9040 + }, + { + "epoch": 0.8799026763990268, + "grad_norm": 1.6163492287653818, + "learning_rate": 3.7354681128446366e-07, + "loss": 0.3266, + "step": 9041 + }, + { + "epoch": 0.88, + "grad_norm": 1.690874610849509, + "learning_rate": 3.729492704140908e-07, + "loss": 0.2776, + "step": 9042 + }, + { + "epoch": 0.8800973236009733, + "grad_norm": 1.6948654358824187, + "learning_rate": 3.7235218933659334e-07, + "loss": 0.3548, + "step": 9043 + }, + { + "epoch": 0.8801946472019465, + "grad_norm": 1.7198295670724502, + "learning_rate": 3.717555681113033e-07, + "loss": 0.456, + "step": 9044 + }, + { + "epoch": 0.8802919708029197, + "grad_norm": 1.513783781775627, + "learning_rate": 3.7115940679750715e-07, + "loss": 0.3076, + "step": 9045 + }, + { + "epoch": 0.8803892944038929, + "grad_norm": 2.4265254502375684, + "learning_rate": 3.705637054544459e-07, + "loss": 0.274, + "step": 9046 + }, + { + "epoch": 0.8804866180048662, + "grad_norm": 1.6616444817263811, + "learning_rate": 3.699684641413143e-07, + "loss": 0.5787, + "step": 9047 + }, + { + "epoch": 0.8805839416058394, + "grad_norm": 2.145284773945413, + "learning_rate": 3.693736829172634e-07, + "loss": 0.6248, + "step": 9048 + }, + { + "epoch": 0.8806812652068127, + "grad_norm": 1.9022879778384403, + "learning_rate": 3.687793618413954e-07, + "loss": 0.4206, + "step": 9049 + }, + { + "epoch": 0.8807785888077859, + "grad_norm": 1.5163187228712287, + "learning_rate": 3.6818550097276805e-07, + "loss": 0.3597, + "step": 9050 + }, + { + "epoch": 0.8808759124087592, + "grad_norm": 1.469106914901325, + "learning_rate": 3.6759210037039583e-07, + "loss": 0.2549, + "step": 9051 + }, + { + "epoch": 0.8809732360097323, + "grad_norm": 2.0319212757990703, + "learning_rate": 3.6699916009324267e-07, + "loss": 0.5639, + "step": 9052 + }, + { + "epoch": 0.8810705596107056, + "grad_norm": 1.4382481494018382, + "learning_rate": 3.6640668020023084e-07, + "loss": 0.3361, + "step": 9053 + }, + { + "epoch": 0.8811678832116788, + "grad_norm": 1.4525846891942709, + "learning_rate": 3.6581466075023443e-07, + "loss": 0.1639, + "step": 9054 + }, + { + "epoch": 0.8812652068126521, + "grad_norm": 1.649886664919769, + "learning_rate": 3.652231018020852e-07, + "loss": 0.6188, + "step": 9055 + }, + { + "epoch": 0.8813625304136253, + "grad_norm": 1.4701884915699173, + "learning_rate": 3.6463200341456393e-07, + "loss": 0.2686, + "step": 9056 + }, + { + "epoch": 0.8814598540145986, + "grad_norm": 1.6359678016049655, + "learning_rate": 3.640413656464098e-07, + "loss": 0.4638, + "step": 9057 + }, + { + "epoch": 0.8815571776155717, + "grad_norm": 1.5462197351230702, + "learning_rate": 3.634511885563147e-07, + "loss": 0.3276, + "step": 9058 + }, + { + "epoch": 0.881654501216545, + "grad_norm": 1.613713241264869, + "learning_rate": 3.628614722029261e-07, + "loss": 0.3707, + "step": 9059 + }, + { + "epoch": 0.8817518248175182, + "grad_norm": 1.6381123511660378, + "learning_rate": 3.622722166448417e-07, + "loss": 0.3499, + "step": 9060 + }, + { + "epoch": 0.8818491484184915, + "grad_norm": 1.7478231787734744, + "learning_rate": 3.6168342194061846e-07, + "loss": 0.4698, + "step": 9061 + }, + { + "epoch": 0.8819464720194647, + "grad_norm": 1.8401327048006226, + "learning_rate": 3.610950881487646e-07, + "loss": 0.5005, + "step": 9062 + }, + { + "epoch": 0.882043795620438, + "grad_norm": 1.8045080847132302, + "learning_rate": 3.605072153277428e-07, + "loss": 0.3375, + "step": 9063 + }, + { + "epoch": 0.8821411192214111, + "grad_norm": 1.6474670156786215, + "learning_rate": 3.5991980353596966e-07, + "loss": 0.3197, + "step": 9064 + }, + { + "epoch": 0.8822384428223844, + "grad_norm": 1.614737972934675, + "learning_rate": 3.593328528318185e-07, + "loss": 0.2917, + "step": 9065 + }, + { + "epoch": 0.8823357664233576, + "grad_norm": 1.497862087028239, + "learning_rate": 3.587463632736149e-07, + "loss": 0.2748, + "step": 9066 + }, + { + "epoch": 0.8824330900243309, + "grad_norm": 1.4784119341632818, + "learning_rate": 3.581603349196372e-07, + "loss": 0.2573, + "step": 9067 + }, + { + "epoch": 0.8825304136253042, + "grad_norm": 1.9765740194475032, + "learning_rate": 3.5757476782811994e-07, + "loss": 0.3966, + "step": 9068 + }, + { + "epoch": 0.8826277372262774, + "grad_norm": 1.6749932274752672, + "learning_rate": 3.569896620572522e-07, + "loss": 0.337, + "step": 9069 + }, + { + "epoch": 0.8827250608272507, + "grad_norm": 1.634251959346853, + "learning_rate": 3.564050176651751e-07, + "loss": 0.4117, + "step": 9070 + }, + { + "epoch": 0.8828223844282238, + "grad_norm": 1.7628002695553, + "learning_rate": 3.5582083470998564e-07, + "loss": 0.4307, + "step": 9071 + }, + { + "epoch": 0.8829197080291971, + "grad_norm": 1.5194538949923977, + "learning_rate": 3.5523711324973396e-07, + "loss": 0.2847, + "step": 9072 + }, + { + "epoch": 0.8830170316301703, + "grad_norm": 1.6273275725265193, + "learning_rate": 3.5465385334242653e-07, + "loss": 0.3105, + "step": 9073 + }, + { + "epoch": 0.8831143552311436, + "grad_norm": 1.7073075618628495, + "learning_rate": 3.540710550460197e-07, + "loss": 0.2291, + "step": 9074 + }, + { + "epoch": 0.8832116788321168, + "grad_norm": 1.5253964080896296, + "learning_rate": 3.534887184184271e-07, + "loss": 0.3674, + "step": 9075 + }, + { + "epoch": 0.8833090024330901, + "grad_norm": 1.951184582979137, + "learning_rate": 3.5290684351751705e-07, + "loss": 0.3578, + "step": 9076 + }, + { + "epoch": 0.8834063260340632, + "grad_norm": 1.7633226625567928, + "learning_rate": 3.5232543040111147e-07, + "loss": 0.4841, + "step": 9077 + }, + { + "epoch": 0.8835036496350365, + "grad_norm": 1.8932993236137774, + "learning_rate": 3.517444791269836e-07, + "loss": 0.3373, + "step": 9078 + }, + { + "epoch": 0.8836009732360097, + "grad_norm": 1.998724113234692, + "learning_rate": 3.5116398975286405e-07, + "loss": 0.3778, + "step": 9079 + }, + { + "epoch": 0.883698296836983, + "grad_norm": 1.749393698703804, + "learning_rate": 3.505839623364371e-07, + "loss": 0.3623, + "step": 9080 + }, + { + "epoch": 0.8837956204379562, + "grad_norm": 1.5806156559702278, + "learning_rate": 3.500043969353384e-07, + "loss": 0.2478, + "step": 9081 + }, + { + "epoch": 0.8838929440389295, + "grad_norm": 1.618841532472243, + "learning_rate": 3.4942529360716126e-07, + "loss": 0.3894, + "step": 9082 + }, + { + "epoch": 0.8839902676399026, + "grad_norm": 1.7121246956886744, + "learning_rate": 3.4884665240945084e-07, + "loss": 0.3882, + "step": 9083 + }, + { + "epoch": 0.8840875912408759, + "grad_norm": 1.800775689304728, + "learning_rate": 3.482684733997083e-07, + "loss": 0.5229, + "step": 9084 + }, + { + "epoch": 0.8841849148418491, + "grad_norm": 1.8169008299491063, + "learning_rate": 3.476907566353854e-07, + "loss": 0.3397, + "step": 9085 + }, + { + "epoch": 0.8842822384428224, + "grad_norm": 1.7354285674433172, + "learning_rate": 3.4711350217389193e-07, + "loss": 0.4052, + "step": 9086 + }, + { + "epoch": 0.8843795620437956, + "grad_norm": 1.473746546255114, + "learning_rate": 3.4653671007259084e-07, + "loss": 0.3565, + "step": 9087 + }, + { + "epoch": 0.8844768856447689, + "grad_norm": 1.727610067457722, + "learning_rate": 3.459603803887962e-07, + "loss": 0.3789, + "step": 9088 + }, + { + "epoch": 0.884574209245742, + "grad_norm": 1.681907854103073, + "learning_rate": 3.4538451317977893e-07, + "loss": 0.528, + "step": 9089 + }, + { + "epoch": 0.8846715328467153, + "grad_norm": 1.8151512486514607, + "learning_rate": 3.4480910850276384e-07, + "loss": 0.7132, + "step": 9090 + }, + { + "epoch": 0.8847688564476885, + "grad_norm": 1.6661497109087697, + "learning_rate": 3.4423416641492955e-07, + "loss": 0.443, + "step": 9091 + }, + { + "epoch": 0.8848661800486618, + "grad_norm": 1.6664359489330403, + "learning_rate": 3.436596869734071e-07, + "loss": 0.3361, + "step": 9092 + }, + { + "epoch": 0.884963503649635, + "grad_norm": 1.5029199313614874, + "learning_rate": 3.430856702352836e-07, + "loss": 0.4399, + "step": 9093 + }, + { + "epoch": 0.8850608272506083, + "grad_norm": 1.4931129661384899, + "learning_rate": 3.425121162575995e-07, + "loss": 0.2794, + "step": 9094 + }, + { + "epoch": 0.8851581508515816, + "grad_norm": 3.014116453456156, + "learning_rate": 3.4193902509734866e-07, + "loss": 0.3236, + "step": 9095 + }, + { + "epoch": 0.8852554744525547, + "grad_norm": 1.6922870131209518, + "learning_rate": 3.4136639681148053e-07, + "loss": 0.3658, + "step": 9096 + }, + { + "epoch": 0.885352798053528, + "grad_norm": 1.8344467928872858, + "learning_rate": 3.4079423145689626e-07, + "loss": 0.4291, + "step": 9097 + }, + { + "epoch": 0.8854501216545012, + "grad_norm": 1.4247471233659503, + "learning_rate": 3.402225290904543e-07, + "loss": 0.305, + "step": 9098 + }, + { + "epoch": 0.8855474452554745, + "grad_norm": 1.482022077002162, + "learning_rate": 3.39651289768963e-07, + "loss": 0.3578, + "step": 9099 + }, + { + "epoch": 0.8856447688564477, + "grad_norm": 1.7085839374611063, + "learning_rate": 3.3908051354918757e-07, + "loss": 0.2994, + "step": 9100 + }, + { + "epoch": 0.885742092457421, + "grad_norm": 2.00591975551161, + "learning_rate": 3.38510200487846e-07, + "loss": 0.6162, + "step": 9101 + }, + { + "epoch": 0.8858394160583941, + "grad_norm": 1.3939051571685541, + "learning_rate": 3.3794035064161234e-07, + "loss": 0.3454, + "step": 9102 + }, + { + "epoch": 0.8859367396593674, + "grad_norm": 1.8612932840390803, + "learning_rate": 3.373709640671102e-07, + "loss": 0.5618, + "step": 9103 + }, + { + "epoch": 0.8860340632603406, + "grad_norm": 2.151574968511042, + "learning_rate": 3.368020408209216e-07, + "loss": 0.4568, + "step": 9104 + }, + { + "epoch": 0.8861313868613139, + "grad_norm": 1.6024720377275257, + "learning_rate": 3.362335809595801e-07, + "loss": 0.3057, + "step": 9105 + }, + { + "epoch": 0.8862287104622871, + "grad_norm": 1.8097065412665474, + "learning_rate": 3.3566558453957455e-07, + "loss": 0.2896, + "step": 9106 + }, + { + "epoch": 0.8863260340632604, + "grad_norm": 1.7369648610164115, + "learning_rate": 3.350980516173463e-07, + "loss": 0.4104, + "step": 9107 + }, + { + "epoch": 0.8864233576642335, + "grad_norm": 1.83707564767274, + "learning_rate": 3.345309822492926e-07, + "loss": 0.2828, + "step": 9108 + }, + { + "epoch": 0.8865206812652068, + "grad_norm": 1.9316451918313684, + "learning_rate": 3.339643764917633e-07, + "loss": 0.5468, + "step": 9109 + }, + { + "epoch": 0.88661800486618, + "grad_norm": 1.4716594368111537, + "learning_rate": 3.333982344010611e-07, + "loss": 0.2916, + "step": 9110 + }, + { + "epoch": 0.8867153284671533, + "grad_norm": 1.5773766131140017, + "learning_rate": 3.3283255603344446e-07, + "loss": 0.4253, + "step": 9111 + }, + { + "epoch": 0.8868126520681265, + "grad_norm": 1.6761369661115555, + "learning_rate": 3.322673414451255e-07, + "loss": 0.4859, + "step": 9112 + }, + { + "epoch": 0.8869099756690998, + "grad_norm": 1.7470794558085925, + "learning_rate": 3.3170259069227104e-07, + "loss": 0.3984, + "step": 9113 + }, + { + "epoch": 0.887007299270073, + "grad_norm": 1.764763343333673, + "learning_rate": 3.311383038309984e-07, + "loss": 0.4786, + "step": 9114 + }, + { + "epoch": 0.8871046228710462, + "grad_norm": 1.4167523171207508, + "learning_rate": 3.305744809173822e-07, + "loss": 0.2656, + "step": 9115 + }, + { + "epoch": 0.8872019464720194, + "grad_norm": 1.4060003694856682, + "learning_rate": 3.300111220074498e-07, + "loss": 0.3296, + "step": 9116 + }, + { + "epoch": 0.8872992700729927, + "grad_norm": 1.2579898116907582, + "learning_rate": 3.2944822715718306e-07, + "loss": 0.1784, + "step": 9117 + }, + { + "epoch": 0.887396593673966, + "grad_norm": 1.5244960921685669, + "learning_rate": 3.2888579642251615e-07, + "loss": 0.3471, + "step": 9118 + }, + { + "epoch": 0.8874939172749392, + "grad_norm": 1.1440309856860158, + "learning_rate": 3.2832382985933884e-07, + "loss": 0.2042, + "step": 9119 + }, + { + "epoch": 0.8875912408759125, + "grad_norm": 2.0623008169460673, + "learning_rate": 3.2776232752349536e-07, + "loss": 0.4595, + "step": 9120 + }, + { + "epoch": 0.8876885644768856, + "grad_norm": 1.5234848406356416, + "learning_rate": 3.272012894707799e-07, + "loss": 0.3877, + "step": 9121 + }, + { + "epoch": 0.8877858880778589, + "grad_norm": 1.6989976059401126, + "learning_rate": 3.2664071575694455e-07, + "loss": 0.3786, + "step": 9122 + }, + { + "epoch": 0.8878832116788321, + "grad_norm": 1.6025158533511297, + "learning_rate": 3.260806064376942e-07, + "loss": 0.5405, + "step": 9123 + }, + { + "epoch": 0.8879805352798054, + "grad_norm": 2.196950715280963, + "learning_rate": 3.255209615686866e-07, + "loss": 0.4193, + "step": 9124 + }, + { + "epoch": 0.8880778588807786, + "grad_norm": 1.5287874623010604, + "learning_rate": 3.249617812055339e-07, + "loss": 0.3226, + "step": 9125 + }, + { + "epoch": 0.8881751824817519, + "grad_norm": 1.8464667784044262, + "learning_rate": 3.2440306540380217e-07, + "loss": 0.4414, + "step": 9126 + }, + { + "epoch": 0.888272506082725, + "grad_norm": 1.7925715120700234, + "learning_rate": 3.2384481421901203e-07, + "loss": 0.3691, + "step": 9127 + }, + { + "epoch": 0.8883698296836983, + "grad_norm": 1.723308967231154, + "learning_rate": 3.2328702770663744e-07, + "loss": 0.3652, + "step": 9128 + }, + { + "epoch": 0.8884671532846715, + "grad_norm": 1.8916682998157235, + "learning_rate": 3.227297059221046e-07, + "loss": 0.4879, + "step": 9129 + }, + { + "epoch": 0.8885644768856448, + "grad_norm": 1.5412998106642846, + "learning_rate": 3.221728489207959e-07, + "loss": 0.3765, + "step": 9130 + }, + { + "epoch": 0.888661800486618, + "grad_norm": 1.5301586076392444, + "learning_rate": 3.2161645675804764e-07, + "loss": 0.2679, + "step": 9131 + }, + { + "epoch": 0.8887591240875913, + "grad_norm": 1.76691585378843, + "learning_rate": 3.2106052948914666e-07, + "loss": 0.3567, + "step": 9132 + }, + { + "epoch": 0.8888564476885644, + "grad_norm": 1.6002726501377984, + "learning_rate": 3.205050671693366e-07, + "loss": 0.404, + "step": 9133 + }, + { + "epoch": 0.8889537712895377, + "grad_norm": 1.4441165409241568, + "learning_rate": 3.199500698538149e-07, + "loss": 0.2846, + "step": 9134 + }, + { + "epoch": 0.8890510948905109, + "grad_norm": 1.4334349167482208, + "learning_rate": 3.193955375977309e-07, + "loss": 0.3984, + "step": 9135 + }, + { + "epoch": 0.8891484184914842, + "grad_norm": 1.5502845872066735, + "learning_rate": 3.188414704561893e-07, + "loss": 0.407, + "step": 9136 + }, + { + "epoch": 0.8892457420924574, + "grad_norm": 1.663549926329642, + "learning_rate": 3.1828786848424775e-07, + "loss": 0.3392, + "step": 9137 + }, + { + "epoch": 0.8893430656934307, + "grad_norm": 1.6710969059055198, + "learning_rate": 3.177347317369189e-07, + "loss": 0.4116, + "step": 9138 + }, + { + "epoch": 0.8894403892944039, + "grad_norm": 1.6817833887479985, + "learning_rate": 3.171820602691672e-07, + "loss": 0.2289, + "step": 9139 + }, + { + "epoch": 0.8895377128953771, + "grad_norm": 1.8527672304879188, + "learning_rate": 3.166298541359125e-07, + "loss": 0.3699, + "step": 9140 + }, + { + "epoch": 0.8896350364963503, + "grad_norm": 1.4694639864853833, + "learning_rate": 3.1607811339202765e-07, + "loss": 0.3212, + "step": 9141 + }, + { + "epoch": 0.8897323600973236, + "grad_norm": 1.5118120347055846, + "learning_rate": 3.15526838092341e-07, + "loss": 0.3278, + "step": 9142 + }, + { + "epoch": 0.8898296836982968, + "grad_norm": 1.7788914910643654, + "learning_rate": 3.1497602829163034e-07, + "loss": 0.2653, + "step": 9143 + }, + { + "epoch": 0.8899270072992701, + "grad_norm": 1.7043740827937748, + "learning_rate": 3.144256840446319e-07, + "loss": 0.5288, + "step": 9144 + }, + { + "epoch": 0.8900243309002434, + "grad_norm": 1.4050219365779737, + "learning_rate": 3.138758054060337e-07, + "loss": 0.2148, + "step": 9145 + }, + { + "epoch": 0.8901216545012165, + "grad_norm": 1.5777131100208739, + "learning_rate": 3.1332639243047633e-07, + "loss": 0.3379, + "step": 9146 + }, + { + "epoch": 0.8902189781021898, + "grad_norm": 1.9367033027996186, + "learning_rate": 3.1277744517255625e-07, + "loss": 0.4695, + "step": 9147 + }, + { + "epoch": 0.890316301703163, + "grad_norm": 1.5376693694325716, + "learning_rate": 3.12228963686822e-07, + "loss": 0.3524, + "step": 9148 + }, + { + "epoch": 0.8904136253041363, + "grad_norm": 1.614416872693255, + "learning_rate": 3.1168094802777717e-07, + "loss": 0.2848, + "step": 9149 + }, + { + "epoch": 0.8905109489051095, + "grad_norm": 1.5949271847358346, + "learning_rate": 3.111333982498782e-07, + "loss": 0.5459, + "step": 9150 + }, + { + "epoch": 0.8906082725060828, + "grad_norm": 1.3923387575878752, + "learning_rate": 3.105863144075355e-07, + "loss": 0.4825, + "step": 9151 + }, + { + "epoch": 0.8907055961070559, + "grad_norm": 1.4849020574361875, + "learning_rate": 3.100396965551139e-07, + "loss": 0.3025, + "step": 9152 + }, + { + "epoch": 0.8908029197080292, + "grad_norm": 1.7357386821128944, + "learning_rate": 3.0949354474692937e-07, + "loss": 0.3882, + "step": 9153 + }, + { + "epoch": 0.8909002433090024, + "grad_norm": 1.6591875907974658, + "learning_rate": 3.0894785903725467e-07, + "loss": 0.2886, + "step": 9154 + }, + { + "epoch": 0.8909975669099757, + "grad_norm": 1.3628761542206116, + "learning_rate": 3.0840263948031414e-07, + "loss": 0.2651, + "step": 9155 + }, + { + "epoch": 0.8910948905109489, + "grad_norm": 1.8751857665543792, + "learning_rate": 3.0785788613028776e-07, + "loss": 0.4253, + "step": 9156 + }, + { + "epoch": 0.8911922141119222, + "grad_norm": 1.8948864790599382, + "learning_rate": 3.073135990413068e-07, + "loss": 0.3, + "step": 9157 + }, + { + "epoch": 0.8912895377128954, + "grad_norm": 1.767411939701554, + "learning_rate": 3.067697782674578e-07, + "loss": 0.4425, + "step": 9158 + }, + { + "epoch": 0.8913868613138686, + "grad_norm": 1.8120567832201102, + "learning_rate": 3.0622642386278046e-07, + "loss": 0.3709, + "step": 9159 + }, + { + "epoch": 0.8914841849148418, + "grad_norm": 2.7063762538435348, + "learning_rate": 3.056835358812682e-07, + "loss": 0.4427, + "step": 9160 + }, + { + "epoch": 0.8915815085158151, + "grad_norm": 1.5352150320976024, + "learning_rate": 3.05141114376869e-07, + "loss": 0.3215, + "step": 9161 + }, + { + "epoch": 0.8916788321167883, + "grad_norm": 1.6665789422154553, + "learning_rate": 3.045991594034825e-07, + "loss": 0.342, + "step": 9162 + }, + { + "epoch": 0.8917761557177616, + "grad_norm": 1.6087236289921085, + "learning_rate": 3.0405767101496454e-07, + "loss": 0.3342, + "step": 9163 + }, + { + "epoch": 0.8918734793187348, + "grad_norm": 1.399799642531154, + "learning_rate": 3.035166492651209e-07, + "loss": 0.2883, + "step": 9164 + }, + { + "epoch": 0.891970802919708, + "grad_norm": 1.6795318531401184, + "learning_rate": 3.0297609420771534e-07, + "loss": 0.5727, + "step": 9165 + }, + { + "epoch": 0.8920681265206812, + "grad_norm": 1.5647631467799303, + "learning_rate": 3.024360058964615e-07, + "loss": 0.3283, + "step": 9166 + }, + { + "epoch": 0.8921654501216545, + "grad_norm": 1.6390319355895933, + "learning_rate": 3.0189638438503035e-07, + "loss": 0.365, + "step": 9167 + }, + { + "epoch": 0.8922627737226277, + "grad_norm": 1.6198457257080392, + "learning_rate": 3.013572297270423e-07, + "loss": 0.4035, + "step": 9168 + }, + { + "epoch": 0.892360097323601, + "grad_norm": 1.5221444144090437, + "learning_rate": 3.008185419760745e-07, + "loss": 0.3265, + "step": 9169 + }, + { + "epoch": 0.8924574209245743, + "grad_norm": 1.4684633015770647, + "learning_rate": 3.002803211856564e-07, + "loss": 0.2488, + "step": 9170 + }, + { + "epoch": 0.8925547445255474, + "grad_norm": 1.8435670794250265, + "learning_rate": 2.9974256740927134e-07, + "loss": 0.2716, + "step": 9171 + }, + { + "epoch": 0.8926520681265206, + "grad_norm": 1.6683920830066175, + "learning_rate": 2.992052807003565e-07, + "loss": 0.375, + "step": 9172 + }, + { + "epoch": 0.8927493917274939, + "grad_norm": 1.5240984925460292, + "learning_rate": 2.9866846111230195e-07, + "loss": 0.3951, + "step": 9173 + }, + { + "epoch": 0.8928467153284672, + "grad_norm": 1.9735470412924887, + "learning_rate": 2.981321086984529e-07, + "loss": 0.2793, + "step": 9174 + }, + { + "epoch": 0.8929440389294404, + "grad_norm": 1.6166653734706675, + "learning_rate": 2.9759622351210604e-07, + "loss": 0.1554, + "step": 9175 + }, + { + "epoch": 0.8930413625304137, + "grad_norm": 1.540771155553017, + "learning_rate": 2.9706080560651217e-07, + "loss": 0.3869, + "step": 9176 + }, + { + "epoch": 0.8931386861313869, + "grad_norm": 1.3655591150098954, + "learning_rate": 2.965258550348771e-07, + "loss": 0.2786, + "step": 9177 + }, + { + "epoch": 0.8932360097323601, + "grad_norm": 1.7268940188740871, + "learning_rate": 2.959913718503593e-07, + "loss": 0.3855, + "step": 9178 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 1.907714656285984, + "learning_rate": 2.954573561060697e-07, + "loss": 0.5679, + "step": 9179 + }, + { + "epoch": 0.8934306569343066, + "grad_norm": 1.9938927626245189, + "learning_rate": 2.9492380785507366e-07, + "loss": 0.2872, + "step": 9180 + }, + { + "epoch": 0.8935279805352798, + "grad_norm": 2.0341673808062075, + "learning_rate": 2.943907271503915e-07, + "loss": 0.1867, + "step": 9181 + }, + { + "epoch": 0.8936253041362531, + "grad_norm": 1.630172420345821, + "learning_rate": 2.9385811404499476e-07, + "loss": 0.1508, + "step": 9182 + }, + { + "epoch": 0.8937226277372263, + "grad_norm": 1.4687448866734878, + "learning_rate": 2.9332596859180993e-07, + "loss": 0.4466, + "step": 9183 + }, + { + "epoch": 0.8938199513381995, + "grad_norm": 1.64983782115716, + "learning_rate": 2.9279429084371646e-07, + "loss": 0.5035, + "step": 9184 + }, + { + "epoch": 0.8939172749391727, + "grad_norm": 1.5953273821070282, + "learning_rate": 2.9226308085354817e-07, + "loss": 0.3891, + "step": 9185 + }, + { + "epoch": 0.894014598540146, + "grad_norm": 1.6831368448128925, + "learning_rate": 2.9173233867409057e-07, + "loss": 0.3331, + "step": 9186 + }, + { + "epoch": 0.8941119221411192, + "grad_norm": 1.2954317817939076, + "learning_rate": 2.9120206435808486e-07, + "loss": 0.2744, + "step": 9187 + }, + { + "epoch": 0.8942092457420925, + "grad_norm": 1.3729589225197656, + "learning_rate": 2.906722579582244e-07, + "loss": 0.4519, + "step": 9188 + }, + { + "epoch": 0.8943065693430657, + "grad_norm": 1.569852163680121, + "learning_rate": 2.90142919527156e-07, + "loss": 0.2961, + "step": 9189 + }, + { + "epoch": 0.8944038929440389, + "grad_norm": 2.422869275177092, + "learning_rate": 2.896140491174798e-07, + "loss": 0.2981, + "step": 9190 + }, + { + "epoch": 0.8945012165450121, + "grad_norm": 1.998746359835537, + "learning_rate": 2.890856467817516e-07, + "loss": 0.4456, + "step": 9191 + }, + { + "epoch": 0.8945985401459854, + "grad_norm": 1.6829104707582905, + "learning_rate": 2.885577125724781e-07, + "loss": 0.2868, + "step": 9192 + }, + { + "epoch": 0.8946958637469586, + "grad_norm": 2.8838976960045772, + "learning_rate": 2.880302465421203e-07, + "loss": 0.3749, + "step": 9193 + }, + { + "epoch": 0.8947931873479319, + "grad_norm": 1.798833691715735, + "learning_rate": 2.875032487430929e-07, + "loss": 0.4536, + "step": 9194 + }, + { + "epoch": 0.8948905109489051, + "grad_norm": 1.712609366941201, + "learning_rate": 2.869767192277645e-07, + "loss": 0.6356, + "step": 9195 + }, + { + "epoch": 0.8949878345498783, + "grad_norm": 1.8824865288524737, + "learning_rate": 2.864506580484572e-07, + "loss": 0.3469, + "step": 9196 + }, + { + "epoch": 0.8950851581508515, + "grad_norm": 2.1387590503779825, + "learning_rate": 2.8592506525744466e-07, + "loss": 0.7102, + "step": 9197 + }, + { + "epoch": 0.8951824817518248, + "grad_norm": 1.88893695221631, + "learning_rate": 2.8539994090695523e-07, + "loss": 0.3691, + "step": 9198 + }, + { + "epoch": 0.895279805352798, + "grad_norm": 2.3397357387163122, + "learning_rate": 2.8487528504917263e-07, + "loss": 0.3036, + "step": 9199 + }, + { + "epoch": 0.8953771289537713, + "grad_norm": 1.5840223051485127, + "learning_rate": 2.843510977362307e-07, + "loss": 0.2888, + "step": 9200 + }, + { + "epoch": 0.8954744525547446, + "grad_norm": 1.763956682732983, + "learning_rate": 2.838273790202184e-07, + "loss": 0.4738, + "step": 9201 + }, + { + "epoch": 0.8955717761557178, + "grad_norm": 2.009141534083523, + "learning_rate": 2.833041289531779e-07, + "loss": 0.3668, + "step": 9202 + }, + { + "epoch": 0.895669099756691, + "grad_norm": 1.5425816303830613, + "learning_rate": 2.827813475871055e-07, + "loss": 0.3764, + "step": 9203 + }, + { + "epoch": 0.8957664233576642, + "grad_norm": 1.8600381902650238, + "learning_rate": 2.822590349739501e-07, + "loss": 0.2779, + "step": 9204 + }, + { + "epoch": 0.8958637469586375, + "grad_norm": 1.6995202509545277, + "learning_rate": 2.817371911656136e-07, + "loss": 0.411, + "step": 9205 + }, + { + "epoch": 0.8959610705596107, + "grad_norm": 1.7317292937457103, + "learning_rate": 2.812158162139539e-07, + "loss": 0.4568, + "step": 9206 + }, + { + "epoch": 0.896058394160584, + "grad_norm": 1.759147848910771, + "learning_rate": 2.8069491017077786e-07, + "loss": 0.3843, + "step": 9207 + }, + { + "epoch": 0.8961557177615572, + "grad_norm": 3.0970920274984457, + "learning_rate": 2.8017447308784915e-07, + "loss": 0.4052, + "step": 9208 + }, + { + "epoch": 0.8962530413625304, + "grad_norm": 1.784471977524771, + "learning_rate": 2.7965450501688473e-07, + "loss": 0.3637, + "step": 9209 + }, + { + "epoch": 0.8963503649635036, + "grad_norm": 1.8823259286835619, + "learning_rate": 2.791350060095538e-07, + "loss": 0.48, + "step": 9210 + }, + { + "epoch": 0.8964476885644769, + "grad_norm": 1.527320256747665, + "learning_rate": 2.786159761174784e-07, + "loss": 0.3775, + "step": 9211 + }, + { + "epoch": 0.8965450121654501, + "grad_norm": 1.9302012883109412, + "learning_rate": 2.78097415392235e-07, + "loss": 0.4761, + "step": 9212 + }, + { + "epoch": 0.8966423357664234, + "grad_norm": 1.6035594864490716, + "learning_rate": 2.7757932388535357e-07, + "loss": 0.4492, + "step": 9213 + }, + { + "epoch": 0.8967396593673966, + "grad_norm": 1.7127748839575634, + "learning_rate": 2.770617016483185e-07, + "loss": 0.3938, + "step": 9214 + }, + { + "epoch": 0.8968369829683698, + "grad_norm": 1.7007223558019864, + "learning_rate": 2.7654454873256464e-07, + "loss": 0.3669, + "step": 9215 + }, + { + "epoch": 0.896934306569343, + "grad_norm": 1.3558535510624703, + "learning_rate": 2.760278651894821e-07, + "loss": 0.1617, + "step": 9216 + }, + { + "epoch": 0.8970316301703163, + "grad_norm": 1.7152279920813256, + "learning_rate": 2.7551165107041477e-07, + "loss": 0.3861, + "step": 9217 + }, + { + "epoch": 0.8971289537712895, + "grad_norm": 1.3481134053926764, + "learning_rate": 2.7499590642665773e-07, + "loss": 0.3645, + "step": 9218 + }, + { + "epoch": 0.8972262773722628, + "grad_norm": 1.8097103618481378, + "learning_rate": 2.7448063130946224e-07, + "loss": 0.5263, + "step": 9219 + }, + { + "epoch": 0.897323600973236, + "grad_norm": 1.6867769599683455, + "learning_rate": 2.739658257700306e-07, + "loss": 0.2651, + "step": 9220 + }, + { + "epoch": 0.8974209245742093, + "grad_norm": 1.5036256999125885, + "learning_rate": 2.734514898595203e-07, + "loss": 0.3425, + "step": 9221 + }, + { + "epoch": 0.8975182481751824, + "grad_norm": 1.8396023148464262, + "learning_rate": 2.729376236290399e-07, + "loss": 0.5518, + "step": 9222 + }, + { + "epoch": 0.8976155717761557, + "grad_norm": 1.6814270238317088, + "learning_rate": 2.724242271296529e-07, + "loss": 0.3968, + "step": 9223 + }, + { + "epoch": 0.897712895377129, + "grad_norm": 1.8994487063035996, + "learning_rate": 2.7191130041237747e-07, + "loss": 0.4937, + "step": 9224 + }, + { + "epoch": 0.8978102189781022, + "grad_norm": 1.668657075959302, + "learning_rate": 2.713988435281817e-07, + "loss": 0.4164, + "step": 9225 + }, + { + "epoch": 0.8979075425790755, + "grad_norm": 1.715582825189221, + "learning_rate": 2.708868565279893e-07, + "loss": 0.4615, + "step": 9226 + }, + { + "epoch": 0.8980048661800487, + "grad_norm": 1.7116568111493766, + "learning_rate": 2.7037533946267623e-07, + "loss": 0.4112, + "step": 9227 + }, + { + "epoch": 0.8981021897810219, + "grad_norm": 1.9229364332820473, + "learning_rate": 2.698642923830741e-07, + "loss": 0.3661, + "step": 9228 + }, + { + "epoch": 0.8981995133819951, + "grad_norm": 1.8095015354745532, + "learning_rate": 2.6935371533996333e-07, + "loss": 0.3059, + "step": 9229 + }, + { + "epoch": 0.8982968369829684, + "grad_norm": 1.4307466346425188, + "learning_rate": 2.688436083840817e-07, + "loss": 0.3293, + "step": 9230 + }, + { + "epoch": 0.8983941605839416, + "grad_norm": 1.5308283011087775, + "learning_rate": 2.683339715661193e-07, + "loss": 0.4176, + "step": 9231 + }, + { + "epoch": 0.8984914841849149, + "grad_norm": 1.6399629256081933, + "learning_rate": 2.6782480493671826e-07, + "loss": 0.2767, + "step": 9232 + }, + { + "epoch": 0.8985888077858881, + "grad_norm": 1.6351665702454048, + "learning_rate": 2.6731610854647427e-07, + "loss": 0.5433, + "step": 9233 + }, + { + "epoch": 0.8986861313868613, + "grad_norm": 1.4460995567365942, + "learning_rate": 2.66807882445938e-07, + "loss": 0.3486, + "step": 9234 + }, + { + "epoch": 0.8987834549878345, + "grad_norm": 1.6316277983610856, + "learning_rate": 2.663001266856124e-07, + "loss": 0.4658, + "step": 9235 + }, + { + "epoch": 0.8988807785888078, + "grad_norm": 1.4075275975396337, + "learning_rate": 2.6579284131595164e-07, + "loss": 0.1408, + "step": 9236 + }, + { + "epoch": 0.898978102189781, + "grad_norm": 1.8309657530485572, + "learning_rate": 2.652860263873669e-07, + "loss": 0.484, + "step": 9237 + }, + { + "epoch": 0.8990754257907543, + "grad_norm": 2.2653088579268297, + "learning_rate": 2.6477968195021906e-07, + "loss": 0.17, + "step": 9238 + }, + { + "epoch": 0.8991727493917275, + "grad_norm": 1.4201176441513501, + "learning_rate": 2.642738080548263e-07, + "loss": 0.3165, + "step": 9239 + }, + { + "epoch": 0.8992700729927007, + "grad_norm": 2.700072400570539, + "learning_rate": 2.6376840475145493e-07, + "loss": 0.3367, + "step": 9240 + }, + { + "epoch": 0.8993673965936739, + "grad_norm": 1.6546914306824478, + "learning_rate": 2.632634720903282e-07, + "loss": 0.4153, + "step": 9241 + }, + { + "epoch": 0.8994647201946472, + "grad_norm": 1.7313117391546693, + "learning_rate": 2.627590101216221e-07, + "loss": 0.4383, + "step": 9242 + }, + { + "epoch": 0.8995620437956204, + "grad_norm": 2.0205152464444565, + "learning_rate": 2.6225501889546423e-07, + "loss": 0.42, + "step": 9243 + }, + { + "epoch": 0.8996593673965937, + "grad_norm": 1.9627783292466146, + "learning_rate": 2.6175149846193736e-07, + "loss": 0.3923, + "step": 9244 + }, + { + "epoch": 0.8997566909975669, + "grad_norm": 1.842141176891876, + "learning_rate": 2.612484488710765e-07, + "loss": 0.3958, + "step": 9245 + }, + { + "epoch": 0.8998540145985402, + "grad_norm": 1.6143799520395188, + "learning_rate": 2.60745870172871e-07, + "loss": 0.4507, + "step": 9246 + }, + { + "epoch": 0.8999513381995133, + "grad_norm": 1.301713763877542, + "learning_rate": 2.6024376241726044e-07, + "loss": 0.1957, + "step": 9247 + }, + { + "epoch": 0.9000486618004866, + "grad_norm": 1.518684366950848, + "learning_rate": 2.5974212565414046e-07, + "loss": 0.3225, + "step": 9248 + }, + { + "epoch": 0.9001459854014598, + "grad_norm": 1.3105400528708893, + "learning_rate": 2.5924095993335897e-07, + "loss": 0.2223, + "step": 9249 + }, + { + "epoch": 0.9002433090024331, + "grad_norm": 1.6435030443461012, + "learning_rate": 2.5874026530471773e-07, + "loss": 0.2424, + "step": 9250 + }, + { + "epoch": 0.9003406326034064, + "grad_norm": 1.7725156833401412, + "learning_rate": 2.5824004181797035e-07, + "loss": 0.4377, + "step": 9251 + }, + { + "epoch": 0.9004379562043796, + "grad_norm": 2.0132220234743117, + "learning_rate": 2.577402895228243e-07, + "loss": 0.3757, + "step": 9252 + }, + { + "epoch": 0.9005352798053528, + "grad_norm": 1.56075830662509, + "learning_rate": 2.5724100846894084e-07, + "loss": 0.3274, + "step": 9253 + }, + { + "epoch": 0.900632603406326, + "grad_norm": 1.5804692598833288, + "learning_rate": 2.5674219870593265e-07, + "loss": 0.3032, + "step": 9254 + }, + { + "epoch": 0.9007299270072993, + "grad_norm": 1.5987537596943948, + "learning_rate": 2.562438602833678e-07, + "loss": 0.3642, + "step": 9255 + }, + { + "epoch": 0.9008272506082725, + "grad_norm": 1.871884682555463, + "learning_rate": 2.5574599325076667e-07, + "loss": 0.3684, + "step": 9256 + }, + { + "epoch": 0.9009245742092458, + "grad_norm": 1.3984022935239633, + "learning_rate": 2.5524859765760303e-07, + "loss": 0.2326, + "step": 9257 + }, + { + "epoch": 0.901021897810219, + "grad_norm": 1.7370439218997562, + "learning_rate": 2.547516735533018e-07, + "loss": 0.6109, + "step": 9258 + }, + { + "epoch": 0.9011192214111922, + "grad_norm": 1.358898649392919, + "learning_rate": 2.5425522098724287e-07, + "loss": 0.2244, + "step": 9259 + }, + { + "epoch": 0.9012165450121654, + "grad_norm": 1.5967244833165672, + "learning_rate": 2.537592400087602e-07, + "loss": 0.3438, + "step": 9260 + }, + { + "epoch": 0.9013138686131387, + "grad_norm": 2.1232286812320513, + "learning_rate": 2.5326373066713984e-07, + "loss": 0.3673, + "step": 9261 + }, + { + "epoch": 0.9014111922141119, + "grad_norm": 1.8804057714156643, + "learning_rate": 2.527686930116191e-07, + "loss": 0.3298, + "step": 9262 + }, + { + "epoch": 0.9015085158150852, + "grad_norm": 1.3857685971045344, + "learning_rate": 2.522741270913914e-07, + "loss": 0.3309, + "step": 9263 + }, + { + "epoch": 0.9016058394160584, + "grad_norm": 1.5418043934242753, + "learning_rate": 2.5178003295560193e-07, + "loss": 0.2878, + "step": 9264 + }, + { + "epoch": 0.9017031630170317, + "grad_norm": 1.6168593658882855, + "learning_rate": 2.5128641065334915e-07, + "loss": 0.5183, + "step": 9265 + }, + { + "epoch": 0.9018004866180048, + "grad_norm": 1.638528556148835, + "learning_rate": 2.507932602336843e-07, + "loss": 0.357, + "step": 9266 + }, + { + "epoch": 0.9018978102189781, + "grad_norm": 1.8614067709480249, + "learning_rate": 2.5030058174561277e-07, + "loss": 0.5347, + "step": 9267 + }, + { + "epoch": 0.9019951338199513, + "grad_norm": 1.928542627644314, + "learning_rate": 2.49808375238092e-07, + "loss": 0.3421, + "step": 9268 + }, + { + "epoch": 0.9020924574209246, + "grad_norm": 1.3855622102777647, + "learning_rate": 2.4931664076003223e-07, + "loss": 0.2013, + "step": 9269 + }, + { + "epoch": 0.9021897810218978, + "grad_norm": 1.7504393843414814, + "learning_rate": 2.488253783602973e-07, + "loss": 0.2077, + "step": 9270 + }, + { + "epoch": 0.9022871046228711, + "grad_norm": 1.7929938819470252, + "learning_rate": 2.4833458808770636e-07, + "loss": 0.3597, + "step": 9271 + }, + { + "epoch": 0.9023844282238442, + "grad_norm": 1.7893450651286409, + "learning_rate": 2.4784426999102664e-07, + "loss": 0.3397, + "step": 9272 + }, + { + "epoch": 0.9024817518248175, + "grad_norm": 4.747547922158697, + "learning_rate": 2.4735442411898347e-07, + "loss": 0.3285, + "step": 9273 + }, + { + "epoch": 0.9025790754257907, + "grad_norm": 1.8588939653580514, + "learning_rate": 2.468650505202519e-07, + "loss": 0.3404, + "step": 9274 + }, + { + "epoch": 0.902676399026764, + "grad_norm": 1.7161327896046963, + "learning_rate": 2.4637614924346185e-07, + "loss": 0.4519, + "step": 9275 + }, + { + "epoch": 0.9027737226277373, + "grad_norm": 1.564869419293021, + "learning_rate": 2.4588772033719564e-07, + "loss": 0.5066, + "step": 9276 + }, + { + "epoch": 0.9028710462287105, + "grad_norm": 1.5518695078094675, + "learning_rate": 2.453997638499883e-07, + "loss": 0.3746, + "step": 9277 + }, + { + "epoch": 0.9029683698296836, + "grad_norm": 1.492041296062711, + "learning_rate": 2.449122798303294e-07, + "loss": 0.3104, + "step": 9278 + }, + { + "epoch": 0.9030656934306569, + "grad_norm": 1.6635304348096076, + "learning_rate": 2.444252683266607e-07, + "loss": 0.2661, + "step": 9279 + }, + { + "epoch": 0.9031630170316302, + "grad_norm": 1.6592993825001505, + "learning_rate": 2.4393872938737516e-07, + "loss": 0.4649, + "step": 9280 + }, + { + "epoch": 0.9032603406326034, + "grad_norm": 1.6307520665482136, + "learning_rate": 2.4345266306082136e-07, + "loss": 0.273, + "step": 9281 + }, + { + "epoch": 0.9033576642335767, + "grad_norm": 1.735519650057081, + "learning_rate": 2.4296706939530124e-07, + "loss": 0.3876, + "step": 9282 + }, + { + "epoch": 0.9034549878345499, + "grad_norm": 1.489173441513752, + "learning_rate": 2.4248194843906616e-07, + "loss": 0.3477, + "step": 9283 + }, + { + "epoch": 0.9035523114355231, + "grad_norm": 1.7490945926166768, + "learning_rate": 2.419973002403242e-07, + "loss": 0.3156, + "step": 9284 + }, + { + "epoch": 0.9036496350364963, + "grad_norm": 1.6907172440555884, + "learning_rate": 2.4151312484723465e-07, + "loss": 0.5542, + "step": 9285 + }, + { + "epoch": 0.9037469586374696, + "grad_norm": 1.5530159556987124, + "learning_rate": 2.4102942230791125e-07, + "loss": 0.3828, + "step": 9286 + }, + { + "epoch": 0.9038442822384428, + "grad_norm": 1.9130997708391517, + "learning_rate": 2.405461926704189e-07, + "loss": 0.5453, + "step": 9287 + }, + { + "epoch": 0.9039416058394161, + "grad_norm": 2.019339517312936, + "learning_rate": 2.400634359827769e-07, + "loss": 0.5213, + "step": 9288 + }, + { + "epoch": 0.9040389294403893, + "grad_norm": 1.9798696239973337, + "learning_rate": 2.3958115229295754e-07, + "loss": 0.3503, + "step": 9289 + }, + { + "epoch": 0.9041362530413626, + "grad_norm": 1.5616733653204091, + "learning_rate": 2.390993416488846e-07, + "loss": 0.3677, + "step": 9290 + }, + { + "epoch": 0.9042335766423357, + "grad_norm": 1.5521258235295425, + "learning_rate": 2.3861800409843594e-07, + "loss": 0.5086, + "step": 9291 + }, + { + "epoch": 0.904330900243309, + "grad_norm": 1.8364370112949306, + "learning_rate": 2.381371396894433e-07, + "loss": 0.4332, + "step": 9292 + }, + { + "epoch": 0.9044282238442822, + "grad_norm": 1.824889359974314, + "learning_rate": 2.376567484696901e-07, + "loss": 0.3331, + "step": 9293 + }, + { + "epoch": 0.9045255474452555, + "grad_norm": 1.6067909309535817, + "learning_rate": 2.3717683048691265e-07, + "loss": 0.3544, + "step": 9294 + }, + { + "epoch": 0.9046228710462287, + "grad_norm": 2.178252263777568, + "learning_rate": 2.3669738578880109e-07, + "loss": 0.376, + "step": 9295 + }, + { + "epoch": 0.904720194647202, + "grad_norm": 1.8672249762313546, + "learning_rate": 2.3621841442299788e-07, + "loss": 0.5543, + "step": 9296 + }, + { + "epoch": 0.9048175182481751, + "grad_norm": 1.4946775581145664, + "learning_rate": 2.3573991643709937e-07, + "loss": 0.1879, + "step": 9297 + }, + { + "epoch": 0.9049148418491484, + "grad_norm": 1.5688157421182054, + "learning_rate": 2.3526189187865312e-07, + "loss": 0.3575, + "step": 9298 + }, + { + "epoch": 0.9050121654501216, + "grad_norm": 1.7586989138733848, + "learning_rate": 2.3478434079516166e-07, + "loss": 0.3411, + "step": 9299 + }, + { + "epoch": 0.9051094890510949, + "grad_norm": 1.6655047976293895, + "learning_rate": 2.3430726323407981e-07, + "loss": 0.2401, + "step": 9300 + }, + { + "epoch": 0.9052068126520681, + "grad_norm": 1.828603046220522, + "learning_rate": 2.3383065924281357e-07, + "loss": 0.2797, + "step": 9301 + }, + { + "epoch": 0.9053041362530414, + "grad_norm": 1.6293150025307948, + "learning_rate": 2.333545288687239e-07, + "loss": 0.2752, + "step": 9302 + }, + { + "epoch": 0.9054014598540145, + "grad_norm": 1.7553232836413308, + "learning_rate": 2.328788721591252e-07, + "loss": 0.2443, + "step": 9303 + }, + { + "epoch": 0.9054987834549878, + "grad_norm": 1.65021076012306, + "learning_rate": 2.3240368916128352e-07, + "loss": 0.3587, + "step": 9304 + }, + { + "epoch": 0.905596107055961, + "grad_norm": 2.066432325346469, + "learning_rate": 2.319289799224167e-07, + "loss": 0.2973, + "step": 9305 + }, + { + "epoch": 0.9056934306569343, + "grad_norm": 1.8090382765035964, + "learning_rate": 2.3145474448969807e-07, + "loss": 0.4803, + "step": 9306 + }, + { + "epoch": 0.9057907542579076, + "grad_norm": 1.555921285452464, + "learning_rate": 2.3098098291025272e-07, + "loss": 0.3612, + "step": 9307 + }, + { + "epoch": 0.9058880778588808, + "grad_norm": 1.8196772830027095, + "learning_rate": 2.30507695231158e-07, + "loss": 0.3317, + "step": 9308 + }, + { + "epoch": 0.9059854014598541, + "grad_norm": 1.5955020773218702, + "learning_rate": 2.300348814994452e-07, + "loss": 0.3187, + "step": 9309 + }, + { + "epoch": 0.9060827250608272, + "grad_norm": 1.7250452063345196, + "learning_rate": 2.2956254176209836e-07, + "loss": 0.2642, + "step": 9310 + }, + { + "epoch": 0.9061800486618005, + "grad_norm": 1.5576655556753518, + "learning_rate": 2.290906760660544e-07, + "loss": 0.2587, + "step": 9311 + }, + { + "epoch": 0.9062773722627737, + "grad_norm": 2.0025368337303506, + "learning_rate": 2.286192844582019e-07, + "loss": 0.5338, + "step": 9312 + }, + { + "epoch": 0.906374695863747, + "grad_norm": 1.9053083501654542, + "learning_rate": 2.2814836698538346e-07, + "loss": 0.3914, + "step": 9313 + }, + { + "epoch": 0.9064720194647202, + "grad_norm": 1.7333985062720056, + "learning_rate": 2.276779236943949e-07, + "loss": 0.6232, + "step": 9314 + }, + { + "epoch": 0.9065693430656935, + "grad_norm": 1.5879599167836074, + "learning_rate": 2.2720795463198496e-07, + "loss": 0.4493, + "step": 9315 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 1.7300951965688691, + "learning_rate": 2.2673845984485353e-07, + "loss": 0.442, + "step": 9316 + }, + { + "epoch": 0.9067639902676399, + "grad_norm": 1.642686547727744, + "learning_rate": 2.262694393796555e-07, + "loss": 0.2533, + "step": 9317 + }, + { + "epoch": 0.9068613138686131, + "grad_norm": 2.1289972968274715, + "learning_rate": 2.2580089328299747e-07, + "loss": 0.3492, + "step": 9318 + }, + { + "epoch": 0.9069586374695864, + "grad_norm": 2.0200598373615066, + "learning_rate": 2.2533282160143888e-07, + "loss": 0.3633, + "step": 9319 + }, + { + "epoch": 0.9070559610705596, + "grad_norm": 1.403123474120264, + "learning_rate": 2.248652243814925e-07, + "loss": 0.32, + "step": 9320 + }, + { + "epoch": 0.9071532846715329, + "grad_norm": 2.5881212124569966, + "learning_rate": 2.2439810166962395e-07, + "loss": 0.4403, + "step": 9321 + }, + { + "epoch": 0.907250608272506, + "grad_norm": 1.8506303847317362, + "learning_rate": 2.2393145351225164e-07, + "loss": 0.4974, + "step": 9322 + }, + { + "epoch": 0.9073479318734793, + "grad_norm": 1.8981143378095517, + "learning_rate": 2.234652799557463e-07, + "loss": 0.3265, + "step": 9323 + }, + { + "epoch": 0.9074452554744525, + "grad_norm": 1.7262490104434902, + "learning_rate": 2.2299958104643192e-07, + "loss": 0.2773, + "step": 9324 + }, + { + "epoch": 0.9075425790754258, + "grad_norm": 1.7262775997355921, + "learning_rate": 2.22534356830586e-07, + "loss": 0.2699, + "step": 9325 + }, + { + "epoch": 0.907639902676399, + "grad_norm": 1.8273155833238959, + "learning_rate": 2.220696073544365e-07, + "loss": 0.3955, + "step": 9326 + }, + { + "epoch": 0.9077372262773723, + "grad_norm": 1.7296283136319377, + "learning_rate": 2.2160533266416705e-07, + "loss": 0.5662, + "step": 9327 + }, + { + "epoch": 0.9078345498783456, + "grad_norm": 1.6695721890361843, + "learning_rate": 2.2114153280591244e-07, + "loss": 0.2198, + "step": 9328 + }, + { + "epoch": 0.9079318734793187, + "grad_norm": 1.8276852461915094, + "learning_rate": 2.2067820782576133e-07, + "loss": 0.4167, + "step": 9329 + }, + { + "epoch": 0.908029197080292, + "grad_norm": 1.4392033312349215, + "learning_rate": 2.2021535776975468e-07, + "loss": 0.2916, + "step": 9330 + }, + { + "epoch": 0.9081265206812652, + "grad_norm": 1.6594024023749672, + "learning_rate": 2.1975298268388512e-07, + "loss": 0.3016, + "step": 9331 + }, + { + "epoch": 0.9082238442822385, + "grad_norm": 1.643706996340691, + "learning_rate": 2.1929108261410038e-07, + "loss": 0.2866, + "step": 9332 + }, + { + "epoch": 0.9083211678832117, + "grad_norm": 1.5532175140151832, + "learning_rate": 2.188296576062998e-07, + "loss": 0.3965, + "step": 9333 + }, + { + "epoch": 0.908418491484185, + "grad_norm": 1.4193602740835338, + "learning_rate": 2.18368707706334e-07, + "loss": 0.3599, + "step": 9334 + }, + { + "epoch": 0.9085158150851581, + "grad_norm": 2.057500733645414, + "learning_rate": 2.1790823296000908e-07, + "loss": 0.409, + "step": 9335 + }, + { + "epoch": 0.9086131386861314, + "grad_norm": 2.0932215692768734, + "learning_rate": 2.174482334130823e-07, + "loss": 0.4256, + "step": 9336 + }, + { + "epoch": 0.9087104622871046, + "grad_norm": 1.6839335866413347, + "learning_rate": 2.1698870911126436e-07, + "loss": 0.4425, + "step": 9337 + }, + { + "epoch": 0.9088077858880779, + "grad_norm": 1.4680029409831261, + "learning_rate": 2.1652966010021758e-07, + "loss": 0.3226, + "step": 9338 + }, + { + "epoch": 0.9089051094890511, + "grad_norm": 1.5721372563813318, + "learning_rate": 2.1607108642555885e-07, + "loss": 0.3814, + "step": 9339 + }, + { + "epoch": 0.9090024330900244, + "grad_norm": 1.4854676090331194, + "learning_rate": 2.1561298813285725e-07, + "loss": 0.4781, + "step": 9340 + }, + { + "epoch": 0.9090997566909975, + "grad_norm": 1.6792614773597596, + "learning_rate": 2.1515536526763303e-07, + "loss": 0.5261, + "step": 9341 + }, + { + "epoch": 0.9091970802919708, + "grad_norm": 1.5096688184014606, + "learning_rate": 2.146982178753615e-07, + "loss": 0.2705, + "step": 9342 + }, + { + "epoch": 0.909294403892944, + "grad_norm": 1.741118372469455, + "learning_rate": 2.1424154600146962e-07, + "loss": 0.4251, + "step": 9343 + }, + { + "epoch": 0.9093917274939173, + "grad_norm": 1.4082580640342885, + "learning_rate": 2.137853496913367e-07, + "loss": 0.4253, + "step": 9344 + }, + { + "epoch": 0.9094890510948905, + "grad_norm": 1.7634277174747768, + "learning_rate": 2.133296289902953e-07, + "loss": 0.2424, + "step": 9345 + }, + { + "epoch": 0.9095863746958638, + "grad_norm": 2.0254545431618047, + "learning_rate": 2.128743839436309e-07, + "loss": 0.4038, + "step": 9346 + }, + { + "epoch": 0.9096836982968369, + "grad_norm": 1.7019404920626158, + "learning_rate": 2.124196145965818e-07, + "loss": 0.3242, + "step": 9347 + }, + { + "epoch": 0.9097810218978102, + "grad_norm": 1.4741808540232275, + "learning_rate": 2.119653209943373e-07, + "loss": 0.275, + "step": 9348 + }, + { + "epoch": 0.9098783454987834, + "grad_norm": 1.8020436130034434, + "learning_rate": 2.1151150318204249e-07, + "loss": 0.3782, + "step": 9349 + }, + { + "epoch": 0.9099756690997567, + "grad_norm": 1.5520132249534144, + "learning_rate": 2.1105816120479238e-07, + "loss": 0.3223, + "step": 9350 + }, + { + "epoch": 0.9100729927007299, + "grad_norm": 1.573826061710417, + "learning_rate": 2.106052951076365e-07, + "loss": 0.2915, + "step": 9351 + }, + { + "epoch": 0.9101703163017032, + "grad_norm": 1.6091075786144453, + "learning_rate": 2.1015290493557604e-07, + "loss": 0.2342, + "step": 9352 + }, + { + "epoch": 0.9102676399026765, + "grad_norm": 1.8035823856760125, + "learning_rate": 2.0970099073356565e-07, + "loss": 0.2709, + "step": 9353 + }, + { + "epoch": 0.9103649635036496, + "grad_norm": 1.7363976034503348, + "learning_rate": 2.0924955254651269e-07, + "loss": 0.3902, + "step": 9354 + }, + { + "epoch": 0.9104622871046228, + "grad_norm": 1.785332802487314, + "learning_rate": 2.087985904192763e-07, + "loss": 0.3954, + "step": 9355 + }, + { + "epoch": 0.9105596107055961, + "grad_norm": 1.7205755768871764, + "learning_rate": 2.0834810439666787e-07, + "loss": 0.3334, + "step": 9356 + }, + { + "epoch": 0.9106569343065694, + "grad_norm": 1.537225391706508, + "learning_rate": 2.0789809452345434e-07, + "loss": 0.3098, + "step": 9357 + }, + { + "epoch": 0.9107542579075426, + "grad_norm": 1.78743366571867, + "learning_rate": 2.074485608443527e-07, + "loss": 0.3603, + "step": 9358 + }, + { + "epoch": 0.9108515815085159, + "grad_norm": 1.6553334273098752, + "learning_rate": 2.0699950340403285e-07, + "loss": 0.4786, + "step": 9359 + }, + { + "epoch": 0.910948905109489, + "grad_norm": 1.7104304707659728, + "learning_rate": 2.065509222471185e-07, + "loss": 0.2915, + "step": 9360 + }, + { + "epoch": 0.9110462287104623, + "grad_norm": 1.8924119559485049, + "learning_rate": 2.0610281741818506e-07, + "loss": 0.4196, + "step": 9361 + }, + { + "epoch": 0.9111435523114355, + "grad_norm": 1.244008247363482, + "learning_rate": 2.056551889617614e-07, + "loss": 0.1717, + "step": 9362 + }, + { + "epoch": 0.9112408759124088, + "grad_norm": 1.7890117254923106, + "learning_rate": 2.052080369223286e-07, + "loss": 0.4543, + "step": 9363 + }, + { + "epoch": 0.911338199513382, + "grad_norm": 1.8404521867489478, + "learning_rate": 2.0476136134432e-07, + "loss": 0.321, + "step": 9364 + }, + { + "epoch": 0.9114355231143553, + "grad_norm": 1.7241865700405237, + "learning_rate": 2.0431516227212288e-07, + "loss": 0.4103, + "step": 9365 + }, + { + "epoch": 0.9115328467153284, + "grad_norm": 1.7433556715857343, + "learning_rate": 2.0386943975007567e-07, + "loss": 0.3446, + "step": 9366 + }, + { + "epoch": 0.9116301703163017, + "grad_norm": 1.971070269589573, + "learning_rate": 2.034241938224696e-07, + "loss": 0.4841, + "step": 9367 + }, + { + "epoch": 0.9117274939172749, + "grad_norm": 1.9294947477745186, + "learning_rate": 2.029794245335498e-07, + "loss": 0.4272, + "step": 9368 + }, + { + "epoch": 0.9118248175182482, + "grad_norm": 1.7950434098799035, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.4037, + "step": 9369 + }, + { + "epoch": 0.9119221411192214, + "grad_norm": 1.5539771522698174, + "learning_rate": 2.020913160485094e-07, + "loss": 0.4713, + "step": 9370 + }, + { + "epoch": 0.9120194647201947, + "grad_norm": 1.5155447712536625, + "learning_rate": 2.016479769406404e-07, + "loss": 0.3286, + "step": 9371 + }, + { + "epoch": 0.9121167883211679, + "grad_norm": 1.8360767717495974, + "learning_rate": 2.0120511464796098e-07, + "loss": 0.3344, + "step": 9372 + }, + { + "epoch": 0.9122141119221411, + "grad_norm": 1.959754252500576, + "learning_rate": 2.0076272921447926e-07, + "loss": 0.3496, + "step": 9373 + }, + { + "epoch": 0.9123114355231143, + "grad_norm": 1.6235544671078996, + "learning_rate": 2.003208206841545e-07, + "loss": 0.2955, + "step": 9374 + }, + { + "epoch": 0.9124087591240876, + "grad_norm": 1.8871546650440083, + "learning_rate": 1.9987938910090044e-07, + "loss": 0.5786, + "step": 9375 + }, + { + "epoch": 0.9125060827250608, + "grad_norm": 1.6997340330868365, + "learning_rate": 1.9943843450858204e-07, + "loss": 0.5222, + "step": 9376 + }, + { + "epoch": 0.9126034063260341, + "grad_norm": 1.6967465009561917, + "learning_rate": 1.9899795695101698e-07, + "loss": 0.3287, + "step": 9377 + }, + { + "epoch": 0.9127007299270073, + "grad_norm": 1.5626096305533794, + "learning_rate": 1.9855795647197528e-07, + "loss": 0.2987, + "step": 9378 + }, + { + "epoch": 0.9127980535279805, + "grad_norm": 1.6845468182238468, + "learning_rate": 1.9811843311518143e-07, + "loss": 0.3775, + "step": 9379 + }, + { + "epoch": 0.9128953771289537, + "grad_norm": 1.7679714039349834, + "learning_rate": 1.9767938692430988e-07, + "loss": 0.3306, + "step": 9380 + }, + { + "epoch": 0.912992700729927, + "grad_norm": 1.805394682278994, + "learning_rate": 1.972408179429891e-07, + "loss": 0.3567, + "step": 9381 + }, + { + "epoch": 0.9130900243309003, + "grad_norm": 1.6073262034389517, + "learning_rate": 1.9680272621479978e-07, + "loss": 0.284, + "step": 9382 + }, + { + "epoch": 0.9131873479318735, + "grad_norm": 1.7876222635350516, + "learning_rate": 1.963651117832771e-07, + "loss": 0.1839, + "step": 9383 + }, + { + "epoch": 0.9132846715328468, + "grad_norm": 1.4480664383889308, + "learning_rate": 1.959279746919057e-07, + "loss": 0.3666, + "step": 9384 + }, + { + "epoch": 0.9133819951338199, + "grad_norm": 1.6898479375200948, + "learning_rate": 1.9549131498412366e-07, + "loss": 0.4714, + "step": 9385 + }, + { + "epoch": 0.9134793187347932, + "grad_norm": 1.4022330662818048, + "learning_rate": 1.9505513270332287e-07, + "loss": 0.2418, + "step": 9386 + }, + { + "epoch": 0.9135766423357664, + "grad_norm": 1.6133295749326282, + "learning_rate": 1.946194278928476e-07, + "loss": 0.4014, + "step": 9387 + }, + { + "epoch": 0.9136739659367397, + "grad_norm": 1.9884916242960717, + "learning_rate": 1.9418420059599263e-07, + "loss": 0.4151, + "step": 9388 + }, + { + "epoch": 0.9137712895377129, + "grad_norm": 1.6922176256254444, + "learning_rate": 1.9374945085600728e-07, + "loss": 0.3856, + "step": 9389 + }, + { + "epoch": 0.9138686131386862, + "grad_norm": 1.5762759414220149, + "learning_rate": 1.9331517871609417e-07, + "loss": 0.3266, + "step": 9390 + }, + { + "epoch": 0.9139659367396593, + "grad_norm": 1.5221204493178078, + "learning_rate": 1.928813842194055e-07, + "loss": 0.3998, + "step": 9391 + }, + { + "epoch": 0.9140632603406326, + "grad_norm": 1.8895565828773213, + "learning_rate": 1.9244806740904786e-07, + "loss": 0.298, + "step": 9392 + }, + { + "epoch": 0.9141605839416058, + "grad_norm": 1.8108743414673247, + "learning_rate": 1.920152283280813e-07, + "loss": 0.2857, + "step": 9393 + }, + { + "epoch": 0.9142579075425791, + "grad_norm": 1.7639904727202753, + "learning_rate": 1.9158286701951746e-07, + "loss": 0.1903, + "step": 9394 + }, + { + "epoch": 0.9143552311435523, + "grad_norm": 1.5513605145749492, + "learning_rate": 1.9115098352631867e-07, + "loss": 0.3623, + "step": 9395 + }, + { + "epoch": 0.9144525547445256, + "grad_norm": 1.600050719768567, + "learning_rate": 1.9071957789140228e-07, + "loss": 0.2307, + "step": 9396 + }, + { + "epoch": 0.9145498783454988, + "grad_norm": 1.5956154172365282, + "learning_rate": 1.9028865015763788e-07, + "loss": 0.5025, + "step": 9397 + }, + { + "epoch": 0.914647201946472, + "grad_norm": 1.3005493232235161, + "learning_rate": 1.8985820036784676e-07, + "loss": 0.2574, + "step": 9398 + }, + { + "epoch": 0.9147445255474452, + "grad_norm": 1.922651862705624, + "learning_rate": 1.8942822856480247e-07, + "loss": 0.4686, + "step": 9399 + }, + { + "epoch": 0.9148418491484185, + "grad_norm": 1.747562277343142, + "learning_rate": 1.8899873479123198e-07, + "loss": 0.1949, + "step": 9400 + }, + { + "epoch": 0.9149391727493917, + "grad_norm": 1.4718177897679678, + "learning_rate": 1.8856971908981502e-07, + "loss": 0.2272, + "step": 9401 + }, + { + "epoch": 0.915036496350365, + "grad_norm": 1.823458004454558, + "learning_rate": 1.8814118150318083e-07, + "loss": 0.3655, + "step": 9402 + }, + { + "epoch": 0.9151338199513382, + "grad_norm": 2.1562464617271693, + "learning_rate": 1.877131220739159e-07, + "loss": 0.4843, + "step": 9403 + }, + { + "epoch": 0.9152311435523114, + "grad_norm": 2.243138022099341, + "learning_rate": 1.872855408445562e-07, + "loss": 0.4687, + "step": 9404 + }, + { + "epoch": 0.9153284671532846, + "grad_norm": 1.826168741641697, + "learning_rate": 1.868584378575905e-07, + "loss": 0.2668, + "step": 9405 + }, + { + "epoch": 0.9154257907542579, + "grad_norm": 1.5394336233619508, + "learning_rate": 1.864318131554599e-07, + "loss": 0.3782, + "step": 9406 + }, + { + "epoch": 0.9155231143552311, + "grad_norm": 1.6006733729073583, + "learning_rate": 1.860056667805582e-07, + "loss": 0.5178, + "step": 9407 + }, + { + "epoch": 0.9156204379562044, + "grad_norm": 1.576146008587708, + "learning_rate": 1.855799987752338e-07, + "loss": 0.4655, + "step": 9408 + }, + { + "epoch": 0.9157177615571777, + "grad_norm": 1.5992563158476776, + "learning_rate": 1.8515480918178287e-07, + "loss": 0.3325, + "step": 9409 + }, + { + "epoch": 0.9158150851581508, + "grad_norm": 1.5292127642149458, + "learning_rate": 1.8473009804245824e-07, + "loss": 0.2867, + "step": 9410 + }, + { + "epoch": 0.915912408759124, + "grad_norm": 1.7575933701306377, + "learning_rate": 1.8430586539946283e-07, + "loss": 0.3647, + "step": 9411 + }, + { + "epoch": 0.9160097323600973, + "grad_norm": 2.0550078515786865, + "learning_rate": 1.8388211129495404e-07, + "loss": 0.4104, + "step": 9412 + }, + { + "epoch": 0.9161070559610706, + "grad_norm": 1.7978184752395665, + "learning_rate": 1.834588357710404e-07, + "loss": 0.5214, + "step": 9413 + }, + { + "epoch": 0.9162043795620438, + "grad_norm": 1.883524526608025, + "learning_rate": 1.8303603886978217e-07, + "loss": 0.4326, + "step": 9414 + }, + { + "epoch": 0.9163017031630171, + "grad_norm": 1.5481478915872586, + "learning_rate": 1.8261372063319404e-07, + "loss": 0.304, + "step": 9415 + }, + { + "epoch": 0.9163990267639903, + "grad_norm": 1.7281670342794881, + "learning_rate": 1.8219188110324136e-07, + "loss": 0.3833, + "step": 9416 + }, + { + "epoch": 0.9164963503649635, + "grad_norm": 1.5396185320735132, + "learning_rate": 1.8177052032184285e-07, + "loss": 0.4083, + "step": 9417 + }, + { + "epoch": 0.9165936739659367, + "grad_norm": 1.5962122428329892, + "learning_rate": 1.8134963833086883e-07, + "loss": 0.3284, + "step": 9418 + }, + { + "epoch": 0.91669099756691, + "grad_norm": 1.5290388374360997, + "learning_rate": 1.8092923517214367e-07, + "loss": 0.5214, + "step": 9419 + }, + { + "epoch": 0.9167883211678832, + "grad_norm": 1.52806928724209, + "learning_rate": 1.8050931088744227e-07, + "loss": 0.3808, + "step": 9420 + }, + { + "epoch": 0.9168856447688565, + "grad_norm": 1.5713327613951449, + "learning_rate": 1.8008986551849238e-07, + "loss": 0.3071, + "step": 9421 + }, + { + "epoch": 0.9169829683698297, + "grad_norm": 2.205291872840256, + "learning_rate": 1.796708991069751e-07, + "loss": 0.298, + "step": 9422 + }, + { + "epoch": 0.9170802919708029, + "grad_norm": 1.7912254751354024, + "learning_rate": 1.792524116945238e-07, + "loss": 0.1846, + "step": 9423 + }, + { + "epoch": 0.9171776155717761, + "grad_norm": 1.615730181872773, + "learning_rate": 1.788344033227235e-07, + "loss": 0.5047, + "step": 9424 + }, + { + "epoch": 0.9172749391727494, + "grad_norm": 1.6211786087070559, + "learning_rate": 1.784168740331116e-07, + "loss": 0.2458, + "step": 9425 + }, + { + "epoch": 0.9173722627737226, + "grad_norm": 1.7477447419150904, + "learning_rate": 1.7799982386717872e-07, + "loss": 0.4485, + "step": 9426 + }, + { + "epoch": 0.9174695863746959, + "grad_norm": 1.5718397618130184, + "learning_rate": 1.7758325286636734e-07, + "loss": 0.2657, + "step": 9427 + }, + { + "epoch": 0.9175669099756691, + "grad_norm": 1.5202724345652894, + "learning_rate": 1.7716716107207153e-07, + "loss": 0.2879, + "step": 9428 + }, + { + "epoch": 0.9176642335766423, + "grad_norm": 1.5623146710158495, + "learning_rate": 1.7675154852563937e-07, + "loss": 0.4309, + "step": 9429 + }, + { + "epoch": 0.9177615571776155, + "grad_norm": 1.4421642760352462, + "learning_rate": 1.7633641526837164e-07, + "loss": 0.1958, + "step": 9430 + }, + { + "epoch": 0.9178588807785888, + "grad_norm": 1.5875839932066083, + "learning_rate": 1.7592176134151816e-07, + "loss": 0.4138, + "step": 9431 + }, + { + "epoch": 0.917956204379562, + "grad_norm": 1.5301596983297456, + "learning_rate": 1.7550758678628432e-07, + "loss": 0.1713, + "step": 9432 + }, + { + "epoch": 0.9180535279805353, + "grad_norm": 1.6838088462418688, + "learning_rate": 1.7509389164382717e-07, + "loss": 0.3988, + "step": 9433 + }, + { + "epoch": 0.9181508515815086, + "grad_norm": 1.4300238208520675, + "learning_rate": 1.746806759552555e-07, + "loss": 0.2813, + "step": 9434 + }, + { + "epoch": 0.9182481751824818, + "grad_norm": 2.163756421199198, + "learning_rate": 1.7426793976163093e-07, + "loss": 0.2163, + "step": 9435 + }, + { + "epoch": 0.918345498783455, + "grad_norm": 1.7373311049890365, + "learning_rate": 1.7385568310396727e-07, + "loss": 0.2734, + "step": 9436 + }, + { + "epoch": 0.9184428223844282, + "grad_norm": 1.992987976818884, + "learning_rate": 1.7344390602323123e-07, + "loss": 0.5, + "step": 9437 + }, + { + "epoch": 0.9185401459854015, + "grad_norm": 1.5788688794922618, + "learning_rate": 1.7303260856034066e-07, + "loss": 0.2743, + "step": 9438 + }, + { + "epoch": 0.9186374695863747, + "grad_norm": 1.3208984338900542, + "learning_rate": 1.7262179075616614e-07, + "loss": 0.2739, + "step": 9439 + }, + { + "epoch": 0.918734793187348, + "grad_norm": 1.5186634086130733, + "learning_rate": 1.722114526515317e-07, + "loss": 0.3856, + "step": 9440 + }, + { + "epoch": 0.9188321167883212, + "grad_norm": 1.5201712782880852, + "learning_rate": 1.7180159428721365e-07, + "loss": 0.3807, + "step": 9441 + }, + { + "epoch": 0.9189294403892944, + "grad_norm": 1.3559582620628192, + "learning_rate": 1.713922157039377e-07, + "loss": 0.3235, + "step": 9442 + }, + { + "epoch": 0.9190267639902676, + "grad_norm": 1.6701064693752528, + "learning_rate": 1.7098331694238524e-07, + "loss": 0.196, + "step": 9443 + }, + { + "epoch": 0.9191240875912409, + "grad_norm": 1.3518761067026575, + "learning_rate": 1.7057489804318873e-07, + "loss": 0.1807, + "step": 9444 + }, + { + "epoch": 0.9192214111922141, + "grad_norm": 1.7684875509240763, + "learning_rate": 1.7016695904693293e-07, + "loss": 0.4115, + "step": 9445 + }, + { + "epoch": 0.9193187347931874, + "grad_norm": 1.4744048324095942, + "learning_rate": 1.697594999941554e-07, + "loss": 0.3183, + "step": 9446 + }, + { + "epoch": 0.9194160583941606, + "grad_norm": 1.7335947016970827, + "learning_rate": 1.6935252092534493e-07, + "loss": 0.4883, + "step": 9447 + }, + { + "epoch": 0.9195133819951338, + "grad_norm": 1.5336382400347863, + "learning_rate": 1.689460218809441e-07, + "loss": 0.3628, + "step": 9448 + }, + { + "epoch": 0.919610705596107, + "grad_norm": 1.7841871707218548, + "learning_rate": 1.685400029013462e-07, + "loss": 0.4276, + "step": 9449 + }, + { + "epoch": 0.9197080291970803, + "grad_norm": 1.7024330343332628, + "learning_rate": 1.6813446402689783e-07, + "loss": 0.2482, + "step": 9450 + }, + { + "epoch": 0.9198053527980535, + "grad_norm": 1.4904083510371198, + "learning_rate": 1.6772940529789783e-07, + "loss": 0.2286, + "step": 9451 + }, + { + "epoch": 0.9199026763990268, + "grad_norm": 1.706728840803088, + "learning_rate": 1.6732482675459738e-07, + "loss": 0.3371, + "step": 9452 + }, + { + "epoch": 0.92, + "grad_norm": 2.1913052839931386, + "learning_rate": 1.6692072843719876e-07, + "loss": 0.3492, + "step": 9453 + }, + { + "epoch": 0.9200973236009732, + "grad_norm": 1.440502431293463, + "learning_rate": 1.665171103858576e-07, + "loss": 0.2892, + "step": 9454 + }, + { + "epoch": 0.9201946472019464, + "grad_norm": 1.6939866467234457, + "learning_rate": 1.661139726406824e-07, + "loss": 0.4437, + "step": 9455 + }, + { + "epoch": 0.9202919708029197, + "grad_norm": 1.4593188452689092, + "learning_rate": 1.6571131524173277e-07, + "loss": 0.4501, + "step": 9456 + }, + { + "epoch": 0.9203892944038929, + "grad_norm": 1.6313605903174409, + "learning_rate": 1.6530913822902118e-07, + "loss": 0.2591, + "step": 9457 + }, + { + "epoch": 0.9204866180048662, + "grad_norm": 1.4305026539441597, + "learning_rate": 1.6490744164251226e-07, + "loss": 0.2671, + "step": 9458 + }, + { + "epoch": 0.9205839416058395, + "grad_norm": 1.5063439367610076, + "learning_rate": 1.6450622552212358e-07, + "loss": 0.3954, + "step": 9459 + }, + { + "epoch": 0.9206812652068127, + "grad_norm": 1.5155638849580173, + "learning_rate": 1.6410548990772212e-07, + "loss": 0.3054, + "step": 9460 + }, + { + "epoch": 0.9207785888077858, + "grad_norm": 1.7330062551501089, + "learning_rate": 1.6370523483913103e-07, + "loss": 0.3797, + "step": 9461 + }, + { + "epoch": 0.9208759124087591, + "grad_norm": 1.5662535785655478, + "learning_rate": 1.6330546035612404e-07, + "loss": 0.4485, + "step": 9462 + }, + { + "epoch": 0.9209732360097324, + "grad_norm": 1.4266820283015202, + "learning_rate": 1.629061664984255e-07, + "loss": 0.3708, + "step": 9463 + }, + { + "epoch": 0.9210705596107056, + "grad_norm": 1.9533283585538215, + "learning_rate": 1.625073533057142e-07, + "loss": 0.3704, + "step": 9464 + }, + { + "epoch": 0.9211678832116789, + "grad_norm": 1.5845286308005868, + "learning_rate": 1.6210902081762069e-07, + "loss": 0.2878, + "step": 9465 + }, + { + "epoch": 0.9212652068126521, + "grad_norm": 1.7397576737747853, + "learning_rate": 1.6171116907372775e-07, + "loss": 0.2989, + "step": 9466 + }, + { + "epoch": 0.9213625304136253, + "grad_norm": 1.5327474317826242, + "learning_rate": 1.613137981135693e-07, + "loss": 0.2604, + "step": 9467 + }, + { + "epoch": 0.9214598540145985, + "grad_norm": 1.4265574393543008, + "learning_rate": 1.6091690797663263e-07, + "loss": 0.189, + "step": 9468 + }, + { + "epoch": 0.9215571776155718, + "grad_norm": 1.612290121806367, + "learning_rate": 1.6052049870235786e-07, + "loss": 0.4226, + "step": 9469 + }, + { + "epoch": 0.921654501216545, + "grad_norm": 1.5479883368198146, + "learning_rate": 1.6012457033013574e-07, + "loss": 0.3513, + "step": 9470 + }, + { + "epoch": 0.9217518248175183, + "grad_norm": 1.7470961729281898, + "learning_rate": 1.5972912289930976e-07, + "loss": 0.2394, + "step": 9471 + }, + { + "epoch": 0.9218491484184915, + "grad_norm": 1.8267080570436904, + "learning_rate": 1.5933415644917517e-07, + "loss": 0.4832, + "step": 9472 + }, + { + "epoch": 0.9219464720194647, + "grad_norm": 1.503737957938693, + "learning_rate": 1.589396710189822e-07, + "loss": 0.2829, + "step": 9473 + }, + { + "epoch": 0.9220437956204379, + "grad_norm": 1.6016326888916643, + "learning_rate": 1.5854566664792848e-07, + "loss": 0.2272, + "step": 9474 + }, + { + "epoch": 0.9221411192214112, + "grad_norm": 1.6643204466717323, + "learning_rate": 1.5815214337516817e-07, + "loss": 0.4127, + "step": 9475 + }, + { + "epoch": 0.9222384428223844, + "grad_norm": 1.7800423393143172, + "learning_rate": 1.5775910123980498e-07, + "loss": 0.5203, + "step": 9476 + }, + { + "epoch": 0.9223357664233577, + "grad_norm": 1.664318512757687, + "learning_rate": 1.5736654028089604e-07, + "loss": 0.5129, + "step": 9477 + }, + { + "epoch": 0.9224330900243309, + "grad_norm": 1.6986091309469427, + "learning_rate": 1.5697446053745123e-07, + "loss": 0.4278, + "step": 9478 + }, + { + "epoch": 0.9225304136253042, + "grad_norm": 1.4995709441403597, + "learning_rate": 1.5658286204843053e-07, + "loss": 0.2563, + "step": 9479 + }, + { + "epoch": 0.9226277372262773, + "grad_norm": 1.8376986234061203, + "learning_rate": 1.5619174485274835e-07, + "loss": 0.3111, + "step": 9480 + }, + { + "epoch": 0.9227250608272506, + "grad_norm": 1.8307380585615058, + "learning_rate": 1.5580110898926916e-07, + "loss": 0.2911, + "step": 9481 + }, + { + "epoch": 0.9228223844282238, + "grad_norm": 1.9550844053306657, + "learning_rate": 1.5541095449681133e-07, + "loss": 0.393, + "step": 9482 + }, + { + "epoch": 0.9229197080291971, + "grad_norm": 1.4607370820500096, + "learning_rate": 1.5502128141414496e-07, + "loss": 0.2649, + "step": 9483 + }, + { + "epoch": 0.9230170316301703, + "grad_norm": 1.746782342026344, + "learning_rate": 1.546320897799919e-07, + "loss": 0.5944, + "step": 9484 + }, + { + "epoch": 0.9231143552311436, + "grad_norm": 1.7514493254034433, + "learning_rate": 1.5424337963302617e-07, + "loss": 0.4873, + "step": 9485 + }, + { + "epoch": 0.9232116788321167, + "grad_norm": 1.7041889332231446, + "learning_rate": 1.538551510118741e-07, + "loss": 0.3063, + "step": 9486 + }, + { + "epoch": 0.92330900243309, + "grad_norm": 1.6952203822269325, + "learning_rate": 1.5346740395511428e-07, + "loss": 0.5692, + "step": 9487 + }, + { + "epoch": 0.9234063260340633, + "grad_norm": 1.781332449092379, + "learning_rate": 1.53080138501277e-07, + "loss": 0.4049, + "step": 9488 + }, + { + "epoch": 0.9235036496350365, + "grad_norm": 2.545211435914416, + "learning_rate": 1.5269335468884593e-07, + "loss": 0.3301, + "step": 9489 + }, + { + "epoch": 0.9236009732360098, + "grad_norm": 1.583650339749936, + "learning_rate": 1.5230705255625587e-07, + "loss": 0.3092, + "step": 9490 + }, + { + "epoch": 0.923698296836983, + "grad_norm": 1.6439729952671, + "learning_rate": 1.5192123214189392e-07, + "loss": 0.3549, + "step": 9491 + }, + { + "epoch": 0.9237956204379562, + "grad_norm": 3.3968019254767934, + "learning_rate": 1.5153589348409825e-07, + "loss": 0.4706, + "step": 9492 + }, + { + "epoch": 0.9238929440389294, + "grad_norm": 1.7055294317894942, + "learning_rate": 1.511510366211616e-07, + "loss": 0.3741, + "step": 9493 + }, + { + "epoch": 0.9239902676399027, + "grad_norm": 1.8722053045356812, + "learning_rate": 1.5076666159132615e-07, + "loss": 0.4574, + "step": 9494 + }, + { + "epoch": 0.9240875912408759, + "grad_norm": 1.4188005514003477, + "learning_rate": 1.5038276843278853e-07, + "loss": 0.2753, + "step": 9495 + }, + { + "epoch": 0.9241849148418492, + "grad_norm": 1.506019039570695, + "learning_rate": 1.4999935718369608e-07, + "loss": 0.2789, + "step": 9496 + }, + { + "epoch": 0.9242822384428224, + "grad_norm": 1.7822854228294007, + "learning_rate": 1.4961642788214825e-07, + "loss": 0.2629, + "step": 9497 + }, + { + "epoch": 0.9243795620437956, + "grad_norm": 1.4707473809016132, + "learning_rate": 1.4923398056619743e-07, + "loss": 0.2837, + "step": 9498 + }, + { + "epoch": 0.9244768856447688, + "grad_norm": 1.6260007564359926, + "learning_rate": 1.4885201527384707e-07, + "loss": 0.354, + "step": 9499 + }, + { + "epoch": 0.9245742092457421, + "grad_norm": 1.9835830670159598, + "learning_rate": 1.4847053204305405e-07, + "loss": 0.51, + "step": 9500 + }, + { + "epoch": 0.9246715328467153, + "grad_norm": 1.4275813196806504, + "learning_rate": 1.480895309117264e-07, + "loss": 0.278, + "step": 9501 + }, + { + "epoch": 0.9247688564476886, + "grad_norm": 1.4954138423495842, + "learning_rate": 1.4770901191772436e-07, + "loss": 0.3423, + "step": 9502 + }, + { + "epoch": 0.9248661800486618, + "grad_norm": 2.2330113230997726, + "learning_rate": 1.473289750988599e-07, + "loss": 0.3661, + "step": 9503 + }, + { + "epoch": 0.9249635036496351, + "grad_norm": 1.5029238972790915, + "learning_rate": 1.4694942049289783e-07, + "loss": 0.4414, + "step": 9504 + }, + { + "epoch": 0.9250608272506082, + "grad_norm": 1.8493323435864393, + "learning_rate": 1.4657034813755523e-07, + "loss": 0.3162, + "step": 9505 + }, + { + "epoch": 0.9251581508515815, + "grad_norm": 1.4800581954133771, + "learning_rate": 1.4619175807050023e-07, + "loss": 0.2879, + "step": 9506 + }, + { + "epoch": 0.9252554744525547, + "grad_norm": 1.7939348796775774, + "learning_rate": 1.4581365032935334e-07, + "loss": 0.225, + "step": 9507 + }, + { + "epoch": 0.925352798053528, + "grad_norm": 1.6122594373003567, + "learning_rate": 1.454360249516873e-07, + "loss": 0.4803, + "step": 9508 + }, + { + "epoch": 0.9254501216545012, + "grad_norm": 2.926491257777502, + "learning_rate": 1.450588819750276e-07, + "loss": 0.4227, + "step": 9509 + }, + { + "epoch": 0.9255474452554745, + "grad_norm": 1.567835281994274, + "learning_rate": 1.4468222143685095e-07, + "loss": 0.2261, + "step": 9510 + }, + { + "epoch": 0.9256447688564476, + "grad_norm": 1.7811762476593467, + "learning_rate": 1.443060433745863e-07, + "loss": 0.4577, + "step": 9511 + }, + { + "epoch": 0.9257420924574209, + "grad_norm": 1.3363611988300181, + "learning_rate": 1.4393034782561488e-07, + "loss": 0.2632, + "step": 9512 + }, + { + "epoch": 0.9258394160583941, + "grad_norm": 1.781892158803006, + "learning_rate": 1.4355513482726958e-07, + "loss": 0.4856, + "step": 9513 + }, + { + "epoch": 0.9259367396593674, + "grad_norm": 1.5287701514499672, + "learning_rate": 1.4318040441683555e-07, + "loss": 0.3399, + "step": 9514 + }, + { + "epoch": 0.9260340632603407, + "grad_norm": 1.719914544666272, + "learning_rate": 1.4280615663154972e-07, + "loss": 0.2705, + "step": 9515 + }, + { + "epoch": 0.9261313868613139, + "grad_norm": 3.755317351319147, + "learning_rate": 1.4243239150860123e-07, + "loss": 0.3247, + "step": 9516 + }, + { + "epoch": 0.926228710462287, + "grad_norm": 1.620906956952849, + "learning_rate": 1.4205910908513254e-07, + "loss": 0.209, + "step": 9517 + }, + { + "epoch": 0.9263260340632603, + "grad_norm": 1.8064617336474211, + "learning_rate": 1.416863093982357e-07, + "loss": 0.5242, + "step": 9518 + }, + { + "epoch": 0.9264233576642336, + "grad_norm": 1.6421138728774511, + "learning_rate": 1.4131399248495602e-07, + "loss": 0.3703, + "step": 9519 + }, + { + "epoch": 0.9265206812652068, + "grad_norm": 1.8777417799579652, + "learning_rate": 1.4094215838229176e-07, + "loss": 0.3916, + "step": 9520 + }, + { + "epoch": 0.9266180048661801, + "grad_norm": 2.006164943520131, + "learning_rate": 1.4057080712719162e-07, + "loss": 0.3254, + "step": 9521 + }, + { + "epoch": 0.9267153284671533, + "grad_norm": 1.7169698339488402, + "learning_rate": 1.4019993875655724e-07, + "loss": 0.2954, + "step": 9522 + }, + { + "epoch": 0.9268126520681266, + "grad_norm": 1.6613562701056876, + "learning_rate": 1.398295533072419e-07, + "loss": 0.3306, + "step": 9523 + }, + { + "epoch": 0.9269099756690997, + "grad_norm": 1.6648152399508505, + "learning_rate": 1.3945965081605172e-07, + "loss": 0.4033, + "step": 9524 + }, + { + "epoch": 0.927007299270073, + "grad_norm": 1.5017296038163848, + "learning_rate": 1.3909023131974342e-07, + "loss": 0.3504, + "step": 9525 + }, + { + "epoch": 0.9271046228710462, + "grad_norm": 1.714574771023859, + "learning_rate": 1.387212948550265e-07, + "loss": 0.3085, + "step": 9526 + }, + { + "epoch": 0.9272019464720195, + "grad_norm": 1.701753256397766, + "learning_rate": 1.3835284145856275e-07, + "loss": 0.5359, + "step": 9527 + }, + { + "epoch": 0.9272992700729927, + "grad_norm": 1.5092459866374177, + "learning_rate": 1.3798487116696458e-07, + "loss": 0.4005, + "step": 9528 + }, + { + "epoch": 0.927396593673966, + "grad_norm": 1.5266337976429425, + "learning_rate": 1.376173840167988e-07, + "loss": 0.2527, + "step": 9529 + }, + { + "epoch": 0.9274939172749391, + "grad_norm": 1.5623912773453328, + "learning_rate": 1.3725038004458124e-07, + "loss": 0.3423, + "step": 9530 + }, + { + "epoch": 0.9275912408759124, + "grad_norm": 1.59036253916566, + "learning_rate": 1.3688385928678327e-07, + "loss": 0.543, + "step": 9531 + }, + { + "epoch": 0.9276885644768856, + "grad_norm": 1.7503276245793182, + "learning_rate": 1.365178217798252e-07, + "loss": 0.4154, + "step": 9532 + }, + { + "epoch": 0.9277858880778589, + "grad_norm": 1.456955731354955, + "learning_rate": 1.3615226756008016e-07, + "loss": 0.1514, + "step": 9533 + }, + { + "epoch": 0.9278832116788321, + "grad_norm": 2.1624606707750607, + "learning_rate": 1.3578719666387408e-07, + "loss": 0.7343, + "step": 9534 + }, + { + "epoch": 0.9279805352798054, + "grad_norm": 1.5318992950866142, + "learning_rate": 1.3542260912748462e-07, + "loss": 0.2815, + "step": 9535 + }, + { + "epoch": 0.9280778588807785, + "grad_norm": 1.5960943146423148, + "learning_rate": 1.3505850498714002e-07, + "loss": 0.2709, + "step": 9536 + }, + { + "epoch": 0.9281751824817518, + "grad_norm": 1.6842893504375411, + "learning_rate": 1.346948842790219e-07, + "loss": 0.2904, + "step": 9537 + }, + { + "epoch": 0.928272506082725, + "grad_norm": 1.4331688524100825, + "learning_rate": 1.343317470392641e-07, + "loss": 0.348, + "step": 9538 + }, + { + "epoch": 0.9283698296836983, + "grad_norm": 1.9025709576601564, + "learning_rate": 1.339690933039506e-07, + "loss": 0.3767, + "step": 9539 + }, + { + "epoch": 0.9284671532846716, + "grad_norm": 3.153130377622164, + "learning_rate": 1.3360692310911915e-07, + "loss": 0.3275, + "step": 9540 + }, + { + "epoch": 0.9285644768856448, + "grad_norm": 1.7024879313978045, + "learning_rate": 1.3324523649075883e-07, + "loss": 0.4279, + "step": 9541 + }, + { + "epoch": 0.928661800486618, + "grad_norm": 2.1556388984159125, + "learning_rate": 1.3288403348481138e-07, + "loss": 0.399, + "step": 9542 + }, + { + "epoch": 0.9287591240875912, + "grad_norm": 1.3823701829992785, + "learning_rate": 1.3252331412716869e-07, + "loss": 0.2495, + "step": 9543 + }, + { + "epoch": 0.9288564476885645, + "grad_norm": 1.8805895460291582, + "learning_rate": 1.3216307845367594e-07, + "loss": 0.4118, + "step": 9544 + }, + { + "epoch": 0.9289537712895377, + "grad_norm": 1.5649114692193105, + "learning_rate": 1.318033265001306e-07, + "loss": 0.4116, + "step": 9545 + }, + { + "epoch": 0.929051094890511, + "grad_norm": 1.4499009986491453, + "learning_rate": 1.3144405830228025e-07, + "loss": 0.3825, + "step": 9546 + }, + { + "epoch": 0.9291484184914842, + "grad_norm": 1.8766863550990718, + "learning_rate": 1.310852738958268e-07, + "loss": 0.4169, + "step": 9547 + }, + { + "epoch": 0.9292457420924575, + "grad_norm": 1.8314897920541182, + "learning_rate": 1.3072697331642182e-07, + "loss": 0.3169, + "step": 9548 + }, + { + "epoch": 0.9293430656934306, + "grad_norm": 1.4541874764746945, + "learning_rate": 1.303691565996712e-07, + "loss": 0.2217, + "step": 9549 + }, + { + "epoch": 0.9294403892944039, + "grad_norm": 1.2876459372317743, + "learning_rate": 1.3001182378113043e-07, + "loss": 0.1843, + "step": 9550 + }, + { + "epoch": 0.9295377128953771, + "grad_norm": 1.4392560101851248, + "learning_rate": 1.2965497489630717e-07, + "loss": 0.3593, + "step": 9551 + }, + { + "epoch": 0.9296350364963504, + "grad_norm": 1.4303638130169893, + "learning_rate": 1.2929860998066312e-07, + "loss": 0.2936, + "step": 9552 + }, + { + "epoch": 0.9297323600973236, + "grad_norm": 1.6222424217749034, + "learning_rate": 1.2894272906961048e-07, + "loss": 0.4201, + "step": 9553 + }, + { + "epoch": 0.9298296836982969, + "grad_norm": 1.6172531040908997, + "learning_rate": 1.2858733219851204e-07, + "loss": 0.4612, + "step": 9554 + }, + { + "epoch": 0.92992700729927, + "grad_norm": 1.6817492229561382, + "learning_rate": 1.2823241940268517e-07, + "loss": 0.3076, + "step": 9555 + }, + { + "epoch": 0.9300243309002433, + "grad_norm": 1.554602079105496, + "learning_rate": 1.2787799071739714e-07, + "loss": 0.3923, + "step": 9556 + }, + { + "epoch": 0.9301216545012165, + "grad_norm": 1.7084923957028069, + "learning_rate": 1.2752404617786763e-07, + "loss": 0.2901, + "step": 9557 + }, + { + "epoch": 0.9302189781021898, + "grad_norm": 1.5757407126936007, + "learning_rate": 1.27170585819269e-07, + "loss": 0.2274, + "step": 9558 + }, + { + "epoch": 0.930316301703163, + "grad_norm": 1.7529714425302017, + "learning_rate": 1.2681760967672374e-07, + "loss": 0.3544, + "step": 9559 + }, + { + "epoch": 0.9304136253041363, + "grad_norm": 1.833370020528216, + "learning_rate": 1.2646511778530824e-07, + "loss": 0.258, + "step": 9560 + }, + { + "epoch": 0.9305109489051094, + "grad_norm": 1.488214686549736, + "learning_rate": 1.261131101800489e-07, + "loss": 0.2847, + "step": 9561 + }, + { + "epoch": 0.9306082725060827, + "grad_norm": 1.3981323335444167, + "learning_rate": 1.257615868959261e-07, + "loss": 0.2849, + "step": 9562 + }, + { + "epoch": 0.9307055961070559, + "grad_norm": 1.5571032594068135, + "learning_rate": 1.2541054796787078e-07, + "loss": 0.3026, + "step": 9563 + }, + { + "epoch": 0.9308029197080292, + "grad_norm": 1.4504921965680089, + "learning_rate": 1.2505999343076447e-07, + "loss": 0.3731, + "step": 9564 + }, + { + "epoch": 0.9309002433090025, + "grad_norm": 1.896481986392999, + "learning_rate": 1.247099233194432e-07, + "loss": 0.2535, + "step": 9565 + }, + { + "epoch": 0.9309975669099757, + "grad_norm": 1.6644114813421689, + "learning_rate": 1.2436033766869415e-07, + "loss": 0.2597, + "step": 9566 + }, + { + "epoch": 0.931094890510949, + "grad_norm": 1.690328629159873, + "learning_rate": 1.2401123651325508e-07, + "loss": 0.484, + "step": 9567 + }, + { + "epoch": 0.9311922141119221, + "grad_norm": 1.684545544428426, + "learning_rate": 1.2366261988781603e-07, + "loss": 0.2757, + "step": 9568 + }, + { + "epoch": 0.9312895377128954, + "grad_norm": 1.6222275778857678, + "learning_rate": 1.2331448782701926e-07, + "loss": 0.5433, + "step": 9569 + }, + { + "epoch": 0.9313868613138686, + "grad_norm": 1.7638230031975677, + "learning_rate": 1.2296684036545993e-07, + "loss": 0.5099, + "step": 9570 + }, + { + "epoch": 0.9314841849148419, + "grad_norm": 2.4013554718985257, + "learning_rate": 1.226196775376831e-07, + "loss": 0.3136, + "step": 9571 + }, + { + "epoch": 0.9315815085158151, + "grad_norm": 1.7863822750290157, + "learning_rate": 1.222729993781868e-07, + "loss": 0.2384, + "step": 9572 + }, + { + "epoch": 0.9316788321167884, + "grad_norm": 1.343365813936208, + "learning_rate": 1.2192680592142013e-07, + "loss": 0.3244, + "step": 9573 + }, + { + "epoch": 0.9317761557177615, + "grad_norm": 1.5493472601471998, + "learning_rate": 1.2158109720178613e-07, + "loss": 0.5631, + "step": 9574 + }, + { + "epoch": 0.9318734793187348, + "grad_norm": 1.668275898653919, + "learning_rate": 1.2123587325363671e-07, + "loss": 0.2898, + "step": 9575 + }, + { + "epoch": 0.931970802919708, + "grad_norm": 1.5399710979474348, + "learning_rate": 1.2089113411127673e-07, + "loss": 0.4305, + "step": 9576 + }, + { + "epoch": 0.9320681265206813, + "grad_norm": 2.042636355607625, + "learning_rate": 1.2054687980896428e-07, + "loss": 0.3074, + "step": 9577 + }, + { + "epoch": 0.9321654501216545, + "grad_norm": 1.4961230720853722, + "learning_rate": 1.202031103809076e-07, + "loss": 0.4325, + "step": 9578 + }, + { + "epoch": 0.9322627737226278, + "grad_norm": 1.9269306087976328, + "learning_rate": 1.1985982586126653e-07, + "loss": 0.3425, + "step": 9579 + }, + { + "epoch": 0.9323600973236009, + "grad_norm": 1.8363101662165608, + "learning_rate": 1.195170262841544e-07, + "loss": 0.4339, + "step": 9580 + }, + { + "epoch": 0.9324574209245742, + "grad_norm": 1.7505564486160565, + "learning_rate": 1.191747116836356e-07, + "loss": 0.3387, + "step": 9581 + }, + { + "epoch": 0.9325547445255474, + "grad_norm": 1.7431995548513823, + "learning_rate": 1.1883288209372512e-07, + "loss": 0.493, + "step": 9582 + }, + { + "epoch": 0.9326520681265207, + "grad_norm": 1.4562947147691805, + "learning_rate": 1.1849153754839138e-07, + "loss": 0.2533, + "step": 9583 + }, + { + "epoch": 0.9327493917274939, + "grad_norm": 1.6753054212275835, + "learning_rate": 1.1815067808155389e-07, + "loss": 0.5323, + "step": 9584 + }, + { + "epoch": 0.9328467153284672, + "grad_norm": 1.7016213455043259, + "learning_rate": 1.1781030372708502e-07, + "loss": 0.3076, + "step": 9585 + }, + { + "epoch": 0.9329440389294404, + "grad_norm": 1.6807201670935805, + "learning_rate": 1.1747041451880658e-07, + "loss": 0.3508, + "step": 9586 + }, + { + "epoch": 0.9330413625304136, + "grad_norm": 1.6382580349377462, + "learning_rate": 1.171310104904938e-07, + "loss": 0.2869, + "step": 9587 + }, + { + "epoch": 0.9331386861313868, + "grad_norm": 1.6152531208763325, + "learning_rate": 1.1679209167587413e-07, + "loss": 0.5627, + "step": 9588 + }, + { + "epoch": 0.9332360097323601, + "grad_norm": 1.527257580289826, + "learning_rate": 1.1645365810862563e-07, + "loss": 0.2633, + "step": 9589 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.438426092625945, + "learning_rate": 1.1611570982237863e-07, + "loss": 0.4531, + "step": 9590 + }, + { + "epoch": 0.9334306569343066, + "grad_norm": 1.763894507465108, + "learning_rate": 1.157782468507157e-07, + "loss": 0.3207, + "step": 9591 + }, + { + "epoch": 0.9335279805352799, + "grad_norm": 1.510723722970145, + "learning_rate": 1.1544126922717003e-07, + "loss": 0.359, + "step": 9592 + }, + { + "epoch": 0.933625304136253, + "grad_norm": 1.8084891435267252, + "learning_rate": 1.1510477698522815e-07, + "loss": 0.2864, + "step": 9593 + }, + { + "epoch": 0.9337226277372263, + "grad_norm": 1.6246385906038752, + "learning_rate": 1.1476877015832666e-07, + "loss": 0.3175, + "step": 9594 + }, + { + "epoch": 0.9338199513381995, + "grad_norm": 1.6921089950202506, + "learning_rate": 1.1443324877985551e-07, + "loss": 0.4625, + "step": 9595 + }, + { + "epoch": 0.9339172749391728, + "grad_norm": 1.6553072135427505, + "learning_rate": 1.1409821288315581e-07, + "loss": 0.2737, + "step": 9596 + }, + { + "epoch": 0.934014598540146, + "grad_norm": 1.7725012908629545, + "learning_rate": 1.1376366250151926e-07, + "loss": 0.3504, + "step": 9597 + }, + { + "epoch": 0.9341119221411193, + "grad_norm": 1.5161900990754502, + "learning_rate": 1.1342959766819095e-07, + "loss": 0.3471, + "step": 9598 + }, + { + "epoch": 0.9342092457420924, + "grad_norm": 1.612815956092802, + "learning_rate": 1.1309601841636709e-07, + "loss": 0.3108, + "step": 9599 + }, + { + "epoch": 0.9343065693430657, + "grad_norm": 1.7268788319785766, + "learning_rate": 1.1276292477919559e-07, + "loss": 0.3771, + "step": 9600 + }, + { + "epoch": 0.9344038929440389, + "grad_norm": 3.983173383643072, + "learning_rate": 1.1243031678977612e-07, + "loss": 0.1529, + "step": 9601 + }, + { + "epoch": 0.9345012165450122, + "grad_norm": 1.5863961745082806, + "learning_rate": 1.1209819448116e-07, + "loss": 0.4878, + "step": 9602 + }, + { + "epoch": 0.9345985401459854, + "grad_norm": 1.571421445173974, + "learning_rate": 1.1176655788635082e-07, + "loss": 0.3307, + "step": 9603 + }, + { + "epoch": 0.9346958637469587, + "grad_norm": 3.479377755406753, + "learning_rate": 1.1143540703830336e-07, + "loss": 0.4739, + "step": 9604 + }, + { + "epoch": 0.9347931873479318, + "grad_norm": 1.9203010875341053, + "learning_rate": 1.1110474196992405e-07, + "loss": 0.5108, + "step": 9605 + }, + { + "epoch": 0.9348905109489051, + "grad_norm": 1.8574289989635662, + "learning_rate": 1.1077456271407161e-07, + "loss": 0.4113, + "step": 9606 + }, + { + "epoch": 0.9349878345498783, + "grad_norm": 1.875455228856436, + "learning_rate": 1.1044486930355647e-07, + "loss": 0.3652, + "step": 9607 + }, + { + "epoch": 0.9350851581508516, + "grad_norm": 1.651147937262718, + "learning_rate": 1.1011566177113963e-07, + "loss": 0.5793, + "step": 9608 + }, + { + "epoch": 0.9351824817518248, + "grad_norm": 1.5093737805600502, + "learning_rate": 1.0978694014953495e-07, + "loss": 0.3102, + "step": 9609 + }, + { + "epoch": 0.9352798053527981, + "grad_norm": 1.6321899103201287, + "learning_rate": 1.094587044714085e-07, + "loss": 0.2645, + "step": 9610 + }, + { + "epoch": 0.9353771289537713, + "grad_norm": 1.5812700865905913, + "learning_rate": 1.0913095476937585e-07, + "loss": 0.3897, + "step": 9611 + }, + { + "epoch": 0.9354744525547445, + "grad_norm": 1.5830566097506724, + "learning_rate": 1.0880369107600652e-07, + "loss": 0.3766, + "step": 9612 + }, + { + "epoch": 0.9355717761557177, + "grad_norm": 1.7026085026653628, + "learning_rate": 1.0847691342382061e-07, + "loss": 0.4445, + "step": 9613 + }, + { + "epoch": 0.935669099756691, + "grad_norm": 1.579412860182592, + "learning_rate": 1.0815062184529046e-07, + "loss": 0.2623, + "step": 9614 + }, + { + "epoch": 0.9357664233576642, + "grad_norm": 1.5048094576236761, + "learning_rate": 1.0782481637284014e-07, + "loss": 0.3588, + "step": 9615 + }, + { + "epoch": 0.9358637469586375, + "grad_norm": 1.5945606787814754, + "learning_rate": 1.0749949703884433e-07, + "loss": 0.313, + "step": 9616 + }, + { + "epoch": 0.9359610705596108, + "grad_norm": 1.6220271720868042, + "learning_rate": 1.0717466387563103e-07, + "loss": 0.3891, + "step": 9617 + }, + { + "epoch": 0.9360583941605839, + "grad_norm": 1.7429763308180264, + "learning_rate": 1.0685031691547886e-07, + "loss": 0.5214, + "step": 9618 + }, + { + "epoch": 0.9361557177615571, + "grad_norm": 1.68169145141777, + "learning_rate": 1.0652645619061763e-07, + "loss": 0.3951, + "step": 9619 + }, + { + "epoch": 0.9362530413625304, + "grad_norm": 1.371022366570152, + "learning_rate": 1.0620308173323046e-07, + "loss": 0.2407, + "step": 9620 + }, + { + "epoch": 0.9363503649635037, + "grad_norm": 1.8605278992871563, + "learning_rate": 1.0588019357545165e-07, + "loss": 0.3899, + "step": 9621 + }, + { + "epoch": 0.9364476885644769, + "grad_norm": 1.2461386167287576, + "learning_rate": 1.0555779174936553e-07, + "loss": 0.2129, + "step": 9622 + }, + { + "epoch": 0.9365450121654502, + "grad_norm": 1.534807238326553, + "learning_rate": 1.0523587628701037e-07, + "loss": 0.2612, + "step": 9623 + }, + { + "epoch": 0.9366423357664233, + "grad_norm": 1.8342953094380856, + "learning_rate": 1.0491444722037447e-07, + "loss": 0.2099, + "step": 9624 + }, + { + "epoch": 0.9367396593673966, + "grad_norm": 2.019196768533808, + "learning_rate": 1.0459350458139839e-07, + "loss": 0.3692, + "step": 9625 + }, + { + "epoch": 0.9368369829683698, + "grad_norm": 1.641301551468277, + "learning_rate": 1.0427304840197494e-07, + "loss": 0.3184, + "step": 9626 + }, + { + "epoch": 0.9369343065693431, + "grad_norm": 1.9283927249362134, + "learning_rate": 1.039530787139481e-07, + "loss": 0.2685, + "step": 9627 + }, + { + "epoch": 0.9370316301703163, + "grad_norm": 1.6741601888336033, + "learning_rate": 1.0363359554911357e-07, + "loss": 0.4488, + "step": 9628 + }, + { + "epoch": 0.9371289537712896, + "grad_norm": 1.5261772364611803, + "learning_rate": 1.0331459893921757e-07, + "loss": 0.3588, + "step": 9629 + }, + { + "epoch": 0.9372262773722628, + "grad_norm": 1.5012349766877646, + "learning_rate": 1.0299608891595924e-07, + "loss": 0.4478, + "step": 9630 + }, + { + "epoch": 0.937323600973236, + "grad_norm": 1.8592638896153006, + "learning_rate": 1.0267806551098991e-07, + "loss": 0.3178, + "step": 9631 + }, + { + "epoch": 0.9374209245742092, + "grad_norm": 1.6902336302214098, + "learning_rate": 1.0236052875591208e-07, + "loss": 0.3179, + "step": 9632 + }, + { + "epoch": 0.9375182481751825, + "grad_norm": 1.66272004471063, + "learning_rate": 1.020434786822777e-07, + "loss": 0.5332, + "step": 9633 + }, + { + "epoch": 0.9376155717761557, + "grad_norm": 1.8044568199261593, + "learning_rate": 1.0172691532159385e-07, + "loss": 0.48, + "step": 9634 + }, + { + "epoch": 0.937712895377129, + "grad_norm": 1.7565302940772842, + "learning_rate": 1.0141083870531698e-07, + "loss": 0.4607, + "step": 9635 + }, + { + "epoch": 0.9378102189781022, + "grad_norm": 1.7152043617314066, + "learning_rate": 1.0109524886485645e-07, + "loss": 0.5674, + "step": 9636 + }, + { + "epoch": 0.9379075425790754, + "grad_norm": 1.5908684205145254, + "learning_rate": 1.0078014583157158e-07, + "loss": 0.2423, + "step": 9637 + }, + { + "epoch": 0.9380048661800486, + "grad_norm": 1.675555162806399, + "learning_rate": 1.0046552963677514e-07, + "loss": 0.3958, + "step": 9638 + }, + { + "epoch": 0.9381021897810219, + "grad_norm": 1.6124136672454958, + "learning_rate": 1.0015140031173098e-07, + "loss": 0.4249, + "step": 9639 + }, + { + "epoch": 0.9381995133819951, + "grad_norm": 2.794575859739693, + "learning_rate": 9.983775788765304e-08, + "loss": 0.3278, + "step": 9640 + }, + { + "epoch": 0.9382968369829684, + "grad_norm": 1.7426104523995762, + "learning_rate": 9.952460239570916e-08, + "loss": 0.3914, + "step": 9641 + }, + { + "epoch": 0.9383941605839416, + "grad_norm": 1.7332346148378244, + "learning_rate": 9.921193386701777e-08, + "loss": 0.428, + "step": 9642 + }, + { + "epoch": 0.9384914841849148, + "grad_norm": 1.7307742432578774, + "learning_rate": 9.889975233264959e-08, + "loss": 0.4223, + "step": 9643 + }, + { + "epoch": 0.938588807785888, + "grad_norm": 1.6145390001999034, + "learning_rate": 9.858805782362424e-08, + "loss": 0.3724, + "step": 9644 + }, + { + "epoch": 0.9386861313868613, + "grad_norm": 1.6165798354016714, + "learning_rate": 9.827685037091694e-08, + "loss": 0.4593, + "step": 9645 + }, + { + "epoch": 0.9387834549878346, + "grad_norm": 1.4797309909378773, + "learning_rate": 9.796613000545129e-08, + "loss": 0.27, + "step": 9646 + }, + { + "epoch": 0.9388807785888078, + "grad_norm": 1.5343284801881822, + "learning_rate": 9.765589675810483e-08, + "loss": 0.3109, + "step": 9647 + }, + { + "epoch": 0.9389781021897811, + "grad_norm": 1.8306917609078297, + "learning_rate": 9.734615065970454e-08, + "loss": 0.3607, + "step": 9648 + }, + { + "epoch": 0.9390754257907542, + "grad_norm": 1.695301108607682, + "learning_rate": 9.703689174103137e-08, + "loss": 0.4311, + "step": 9649 + }, + { + "epoch": 0.9391727493917275, + "grad_norm": 2.028959068107322, + "learning_rate": 9.672812003281573e-08, + "loss": 0.303, + "step": 9650 + }, + { + "epoch": 0.9392700729927007, + "grad_norm": 1.450903558377217, + "learning_rate": 9.64198355657403e-08, + "loss": 0.2894, + "step": 9651 + }, + { + "epoch": 0.939367396593674, + "grad_norm": 1.4093053051956355, + "learning_rate": 9.611203837044003e-08, + "loss": 0.1971, + "step": 9652 + }, + { + "epoch": 0.9394647201946472, + "grad_norm": 1.6954325971826056, + "learning_rate": 9.580472847750045e-08, + "loss": 0.3804, + "step": 9653 + }, + { + "epoch": 0.9395620437956205, + "grad_norm": 1.521920333907738, + "learning_rate": 9.549790591745988e-08, + "loss": 0.2863, + "step": 9654 + }, + { + "epoch": 0.9396593673965937, + "grad_norm": 1.771926639618462, + "learning_rate": 9.51915707208062e-08, + "loss": 0.4075, + "step": 9655 + }, + { + "epoch": 0.9397566909975669, + "grad_norm": 1.9894093009635432, + "learning_rate": 9.488572291798115e-08, + "loss": 0.4613, + "step": 9656 + }, + { + "epoch": 0.9398540145985401, + "grad_norm": 1.4665133714037044, + "learning_rate": 9.458036253937707e-08, + "loss": 0.2658, + "step": 9657 + }, + { + "epoch": 0.9399513381995134, + "grad_norm": 1.526685255692653, + "learning_rate": 9.427548961533695e-08, + "loss": 0.3786, + "step": 9658 + }, + { + "epoch": 0.9400486618004866, + "grad_norm": 1.4876674077052594, + "learning_rate": 9.397110417615707e-08, + "loss": 0.3993, + "step": 9659 + }, + { + "epoch": 0.9401459854014599, + "grad_norm": 1.5471903161701483, + "learning_rate": 9.366720625208381e-08, + "loss": 0.2914, + "step": 9660 + }, + { + "epoch": 0.9402433090024331, + "grad_norm": 1.726129123618453, + "learning_rate": 9.336379587331634e-08, + "loss": 0.3847, + "step": 9661 + }, + { + "epoch": 0.9403406326034063, + "grad_norm": 2.0092470261799904, + "learning_rate": 9.306087307000389e-08, + "loss": 0.4272, + "step": 9662 + }, + { + "epoch": 0.9404379562043795, + "grad_norm": 1.982159556805777, + "learning_rate": 9.275843787224847e-08, + "loss": 0.435, + "step": 9663 + }, + { + "epoch": 0.9405352798053528, + "grad_norm": 1.509201911811012, + "learning_rate": 9.24564903101044e-08, + "loss": 0.2869, + "step": 9664 + }, + { + "epoch": 0.940632603406326, + "grad_norm": 1.8480235857676348, + "learning_rate": 9.215503041357432e-08, + "loss": 0.4535, + "step": 9665 + }, + { + "epoch": 0.9407299270072993, + "grad_norm": 1.9445942238235099, + "learning_rate": 9.185405821261539e-08, + "loss": 0.274, + "step": 9666 + }, + { + "epoch": 0.9408272506082725, + "grad_norm": 1.5902411786587216, + "learning_rate": 9.155357373713591e-08, + "loss": 0.3395, + "step": 9667 + }, + { + "epoch": 0.9409245742092457, + "grad_norm": 1.694929440022765, + "learning_rate": 9.125357701699423e-08, + "loss": 0.2725, + "step": 9668 + }, + { + "epoch": 0.9410218978102189, + "grad_norm": 1.8842227450425788, + "learning_rate": 9.095406808200202e-08, + "loss": 0.212, + "step": 9669 + }, + { + "epoch": 0.9411192214111922, + "grad_norm": 1.8663145442770175, + "learning_rate": 9.065504696192162e-08, + "loss": 0.2826, + "step": 9670 + }, + { + "epoch": 0.9412165450121655, + "grad_norm": 1.6082220299244068, + "learning_rate": 9.035651368646647e-08, + "loss": 0.359, + "step": 9671 + }, + { + "epoch": 0.9413138686131387, + "grad_norm": 1.3690855094057564, + "learning_rate": 9.005846828530284e-08, + "loss": 0.199, + "step": 9672 + }, + { + "epoch": 0.941411192214112, + "grad_norm": 1.5839346279324014, + "learning_rate": 8.97609107880465e-08, + "loss": 0.3461, + "step": 9673 + }, + { + "epoch": 0.9415085158150852, + "grad_norm": 1.4729765283199754, + "learning_rate": 8.946384122426655e-08, + "loss": 0.4353, + "step": 9674 + }, + { + "epoch": 0.9416058394160584, + "grad_norm": 1.613006051077611, + "learning_rate": 8.916725962348328e-08, + "loss": 0.4648, + "step": 9675 + }, + { + "epoch": 0.9417031630170316, + "grad_norm": 1.7079521157031634, + "learning_rate": 8.887116601516754e-08, + "loss": 0.528, + "step": 9676 + }, + { + "epoch": 0.9418004866180049, + "grad_norm": 1.8409606394416778, + "learning_rate": 8.857556042874304e-08, + "loss": 0.3511, + "step": 9677 + }, + { + "epoch": 0.9418978102189781, + "grad_norm": 1.4829276005055219, + "learning_rate": 8.82804428935835e-08, + "loss": 0.2479, + "step": 9678 + }, + { + "epoch": 0.9419951338199514, + "grad_norm": 1.7827255428414222, + "learning_rate": 8.798581343901547e-08, + "loss": 0.3469, + "step": 9679 + }, + { + "epoch": 0.9420924574209246, + "grad_norm": 1.5723818528711158, + "learning_rate": 8.769167209431606e-08, + "loss": 0.4992, + "step": 9680 + }, + { + "epoch": 0.9421897810218978, + "grad_norm": 1.5993231772359646, + "learning_rate": 8.739801888871468e-08, + "loss": 0.568, + "step": 9681 + }, + { + "epoch": 0.942287104622871, + "grad_norm": 1.7806884817507853, + "learning_rate": 8.710485385139245e-08, + "loss": 0.5021, + "step": 9682 + }, + { + "epoch": 0.9423844282238443, + "grad_norm": 1.4275306316235854, + "learning_rate": 8.681217701147992e-08, + "loss": 0.3184, + "step": 9683 + }, + { + "epoch": 0.9424817518248175, + "grad_norm": 1.5747862897931604, + "learning_rate": 8.651998839806109e-08, + "loss": 0.1866, + "step": 9684 + }, + { + "epoch": 0.9425790754257908, + "grad_norm": 1.5901031657404352, + "learning_rate": 8.622828804017102e-08, + "loss": 0.2323, + "step": 9685 + }, + { + "epoch": 0.942676399026764, + "grad_norm": 1.7238978195710375, + "learning_rate": 8.593707596679712e-08, + "loss": 0.3544, + "step": 9686 + }, + { + "epoch": 0.9427737226277372, + "grad_norm": 1.6005702969320474, + "learning_rate": 8.564635220687567e-08, + "loss": 0.4235, + "step": 9687 + }, + { + "epoch": 0.9428710462287104, + "grad_norm": 1.4641717208459533, + "learning_rate": 8.535611678929633e-08, + "loss": 0.4702, + "step": 9688 + }, + { + "epoch": 0.9429683698296837, + "grad_norm": 1.7480441471696593, + "learning_rate": 8.506636974290105e-08, + "loss": 0.3813, + "step": 9689 + }, + { + "epoch": 0.9430656934306569, + "grad_norm": 1.4694601736366648, + "learning_rate": 8.477711109648179e-08, + "loss": 0.304, + "step": 9690 + }, + { + "epoch": 0.9431630170316302, + "grad_norm": 1.8513837639535096, + "learning_rate": 8.448834087878166e-08, + "loss": 0.4912, + "step": 9691 + }, + { + "epoch": 0.9432603406326034, + "grad_norm": 1.5631567528453647, + "learning_rate": 8.42000591184966e-08, + "loss": 0.3544, + "step": 9692 + }, + { + "epoch": 0.9433576642335766, + "grad_norm": 1.5020745554476618, + "learning_rate": 8.391226584427314e-08, + "loss": 0.2881, + "step": 9693 + }, + { + "epoch": 0.9434549878345498, + "grad_norm": 2.167110348781055, + "learning_rate": 8.362496108470952e-08, + "loss": 0.4388, + "step": 9694 + }, + { + "epoch": 0.9435523114355231, + "grad_norm": 1.4586383863830763, + "learning_rate": 8.333814486835567e-08, + "loss": 0.495, + "step": 9695 + }, + { + "epoch": 0.9436496350364963, + "grad_norm": 1.5635219283852024, + "learning_rate": 8.305181722371159e-08, + "loss": 0.3284, + "step": 9696 + }, + { + "epoch": 0.9437469586374696, + "grad_norm": 1.6771754442907894, + "learning_rate": 8.276597817923171e-08, + "loss": 0.2367, + "step": 9697 + }, + { + "epoch": 0.9438442822384429, + "grad_norm": 1.8258163350839067, + "learning_rate": 8.248062776331833e-08, + "loss": 0.4231, + "step": 9698 + }, + { + "epoch": 0.9439416058394161, + "grad_norm": 1.7482073321302287, + "learning_rate": 8.219576600432711e-08, + "loss": 0.4003, + "step": 9699 + }, + { + "epoch": 0.9440389294403893, + "grad_norm": 1.6890589259671802, + "learning_rate": 8.191139293056594e-08, + "loss": 0.2817, + "step": 9700 + }, + { + "epoch": 0.9441362530413625, + "grad_norm": 1.544485153413389, + "learning_rate": 8.162750857029222e-08, + "loss": 0.3237, + "step": 9701 + }, + { + "epoch": 0.9442335766423358, + "grad_norm": 1.633504474522838, + "learning_rate": 8.134411295171619e-08, + "loss": 0.2196, + "step": 9702 + }, + { + "epoch": 0.944330900243309, + "grad_norm": 1.4822800961773586, + "learning_rate": 8.106120610299917e-08, + "loss": 0.2892, + "step": 9703 + }, + { + "epoch": 0.9444282238442823, + "grad_norm": 1.6627969003457146, + "learning_rate": 8.07787880522537e-08, + "loss": 0.4037, + "step": 9704 + }, + { + "epoch": 0.9445255474452555, + "grad_norm": 2.0120434545295085, + "learning_rate": 8.049685882754288e-08, + "loss": 0.4619, + "step": 9705 + }, + { + "epoch": 0.9446228710462287, + "grad_norm": 1.64966808506387, + "learning_rate": 8.021541845688318e-08, + "loss": 0.3687, + "step": 9706 + }, + { + "epoch": 0.9447201946472019, + "grad_norm": 2.0316889361985706, + "learning_rate": 7.99344669682417e-08, + "loss": 0.454, + "step": 9707 + }, + { + "epoch": 0.9448175182481752, + "grad_norm": 1.8119038884102494, + "learning_rate": 7.965400438953607e-08, + "loss": 0.5565, + "step": 9708 + }, + { + "epoch": 0.9449148418491484, + "grad_norm": 1.4892285153368283, + "learning_rate": 7.937403074863625e-08, + "loss": 0.239, + "step": 9709 + }, + { + "epoch": 0.9450121654501217, + "grad_norm": 1.800329718591039, + "learning_rate": 7.909454607336331e-08, + "loss": 0.3126, + "step": 9710 + }, + { + "epoch": 0.9451094890510949, + "grad_norm": 2.005117901500976, + "learning_rate": 7.881555039149059e-08, + "loss": 0.3808, + "step": 9711 + }, + { + "epoch": 0.9452068126520681, + "grad_norm": 1.92776479957944, + "learning_rate": 7.853704373074089e-08, + "loss": 0.4775, + "step": 9712 + }, + { + "epoch": 0.9453041362530413, + "grad_norm": 1.6249941312243783, + "learning_rate": 7.825902611878989e-08, + "loss": 0.3674, + "step": 9713 + }, + { + "epoch": 0.9454014598540146, + "grad_norm": 1.8744822105078447, + "learning_rate": 7.79814975832649e-08, + "loss": 0.2519, + "step": 9714 + }, + { + "epoch": 0.9454987834549878, + "grad_norm": 1.7045908927641071, + "learning_rate": 7.770445815174443e-08, + "loss": 0.4311, + "step": 9715 + }, + { + "epoch": 0.9455961070559611, + "grad_norm": 1.6779799187314537, + "learning_rate": 7.742790785175703e-08, + "loss": 0.3673, + "step": 9716 + }, + { + "epoch": 0.9456934306569343, + "grad_norm": 1.6070766889484076, + "learning_rate": 7.715184671078346e-08, + "loss": 0.3352, + "step": 9717 + }, + { + "epoch": 0.9457907542579076, + "grad_norm": 1.4152113320614492, + "learning_rate": 7.687627475625736e-08, + "loss": 0.2707, + "step": 9718 + }, + { + "epoch": 0.9458880778588807, + "grad_norm": 1.5185779238575927, + "learning_rate": 7.660119201556182e-08, + "loss": 0.4415, + "step": 9719 + }, + { + "epoch": 0.945985401459854, + "grad_norm": 1.5919060605287285, + "learning_rate": 7.632659851603164e-08, + "loss": 0.2827, + "step": 9720 + }, + { + "epoch": 0.9460827250608272, + "grad_norm": 1.5849207733762716, + "learning_rate": 7.605249428495332e-08, + "loss": 0.2145, + "step": 9721 + }, + { + "epoch": 0.9461800486618005, + "grad_norm": 1.2763479475046626, + "learning_rate": 7.577887934956619e-08, + "loss": 0.1714, + "step": 9722 + }, + { + "epoch": 0.9462773722627738, + "grad_norm": 1.4138346114333664, + "learning_rate": 7.550575373705793e-08, + "loss": 0.2244, + "step": 9723 + }, + { + "epoch": 0.946374695863747, + "grad_norm": 3.167199809384997, + "learning_rate": 7.523311747456963e-08, + "loss": 0.3239, + "step": 9724 + }, + { + "epoch": 0.9464720194647201, + "grad_norm": 1.6037621256369168, + "learning_rate": 7.49609705891935e-08, + "loss": 0.404, + "step": 9725 + }, + { + "epoch": 0.9465693430656934, + "grad_norm": 1.9481176044909954, + "learning_rate": 7.468931310797345e-08, + "loss": 0.3491, + "step": 9726 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 1.8007470911720906, + "learning_rate": 7.441814505790346e-08, + "loss": 0.434, + "step": 9727 + }, + { + "epoch": 0.9467639902676399, + "grad_norm": 1.6823869172364394, + "learning_rate": 7.414746646592973e-08, + "loss": 0.3838, + "step": 9728 + }, + { + "epoch": 0.9468613138686132, + "grad_norm": 1.4032827649704585, + "learning_rate": 7.387727735895023e-08, + "loss": 0.2372, + "step": 9729 + }, + { + "epoch": 0.9469586374695864, + "grad_norm": 1.5201200702575695, + "learning_rate": 7.360757776381234e-08, + "loss": 0.2803, + "step": 9730 + }, + { + "epoch": 0.9470559610705596, + "grad_norm": 2.0820264404699373, + "learning_rate": 7.33383677073185e-08, + "loss": 0.4103, + "step": 9731 + }, + { + "epoch": 0.9471532846715328, + "grad_norm": 1.6797542115204251, + "learning_rate": 7.306964721621901e-08, + "loss": 0.4178, + "step": 9732 + }, + { + "epoch": 0.9472506082725061, + "grad_norm": 2.0146020466968597, + "learning_rate": 7.280141631721748e-08, + "loss": 0.2606, + "step": 9733 + }, + { + "epoch": 0.9473479318734793, + "grad_norm": 1.5860218345130064, + "learning_rate": 7.253367503696706e-08, + "loss": 0.2519, + "step": 9734 + }, + { + "epoch": 0.9474452554744526, + "grad_norm": 1.9765939823101104, + "learning_rate": 7.226642340207424e-08, + "loss": 0.5539, + "step": 9735 + }, + { + "epoch": 0.9475425790754258, + "grad_norm": 1.6498029938042538, + "learning_rate": 7.199966143909554e-08, + "loss": 0.445, + "step": 9736 + }, + { + "epoch": 0.9476399026763991, + "grad_norm": 1.9670557270349787, + "learning_rate": 7.173338917453976e-08, + "loss": 0.4757, + "step": 9737 + }, + { + "epoch": 0.9477372262773722, + "grad_norm": 1.7088528130163683, + "learning_rate": 7.14676066348663e-08, + "loss": 0.342, + "step": 9738 + }, + { + "epoch": 0.9478345498783455, + "grad_norm": 1.6365193174942918, + "learning_rate": 7.120231384648569e-08, + "loss": 0.3759, + "step": 9739 + }, + { + "epoch": 0.9479318734793187, + "grad_norm": 1.7427854324435919, + "learning_rate": 7.093751083576073e-08, + "loss": 0.4042, + "step": 9740 + }, + { + "epoch": 0.948029197080292, + "grad_norm": 1.8772550373809933, + "learning_rate": 7.067319762900537e-08, + "loss": 0.2638, + "step": 9741 + }, + { + "epoch": 0.9481265206812652, + "grad_norm": 1.4961982073061926, + "learning_rate": 7.040937425248362e-08, + "loss": 0.3302, + "step": 9742 + }, + { + "epoch": 0.9482238442822385, + "grad_norm": 1.738980133491566, + "learning_rate": 7.014604073241282e-08, + "loss": 0.3963, + "step": 9743 + }, + { + "epoch": 0.9483211678832116, + "grad_norm": 1.5821933992906143, + "learning_rate": 6.988319709495984e-08, + "loss": 0.3703, + "step": 9744 + }, + { + "epoch": 0.9484184914841849, + "grad_norm": 1.4428606836553397, + "learning_rate": 6.962084336624376e-08, + "loss": 0.2657, + "step": 9745 + }, + { + "epoch": 0.9485158150851581, + "grad_norm": 1.693985520770493, + "learning_rate": 6.93589795723354e-08, + "loss": 0.3765, + "step": 9746 + }, + { + "epoch": 0.9486131386861314, + "grad_norm": 1.7015331423410074, + "learning_rate": 6.909760573925561e-08, + "loss": 0.3315, + "step": 9747 + }, + { + "epoch": 0.9487104622871046, + "grad_norm": 1.526402644491804, + "learning_rate": 6.883672189297753e-08, + "loss": 0.2893, + "step": 9748 + }, + { + "epoch": 0.9488077858880779, + "grad_norm": 1.4648089188567248, + "learning_rate": 6.857632805942482e-08, + "loss": 0.3456, + "step": 9749 + }, + { + "epoch": 0.948905109489051, + "grad_norm": 1.4253540235081714, + "learning_rate": 6.831642426447405e-08, + "loss": 0.2607, + "step": 9750 + }, + { + "epoch": 0.9490024330900243, + "grad_norm": 1.548802571609033, + "learning_rate": 6.80570105339512e-08, + "loss": 0.3788, + "step": 9751 + }, + { + "epoch": 0.9490997566909976, + "grad_norm": 1.3560336911733757, + "learning_rate": 6.779808689363455e-08, + "loss": 0.2021, + "step": 9752 + }, + { + "epoch": 0.9491970802919708, + "grad_norm": 1.8029042470792376, + "learning_rate": 6.75396533692535e-08, + "loss": 0.4018, + "step": 9753 + }, + { + "epoch": 0.9492944038929441, + "grad_norm": 2.3290395284584737, + "learning_rate": 6.72817099864892e-08, + "loss": 0.2993, + "step": 9754 + }, + { + "epoch": 0.9493917274939173, + "grad_norm": 1.8160319937437845, + "learning_rate": 6.702425677097335e-08, + "loss": 0.4984, + "step": 9755 + }, + { + "epoch": 0.9494890510948905, + "grad_norm": 2.516219738144879, + "learning_rate": 6.676729374828883e-08, + "loss": 0.33, + "step": 9756 + }, + { + "epoch": 0.9495863746958637, + "grad_norm": 1.8113673387091276, + "learning_rate": 6.651082094397076e-08, + "loss": 0.3438, + "step": 9757 + }, + { + "epoch": 0.949683698296837, + "grad_norm": 1.5594111619934168, + "learning_rate": 6.625483838350489e-08, + "loss": 0.4178, + "step": 9758 + }, + { + "epoch": 0.9497810218978102, + "grad_norm": 1.6622657926408493, + "learning_rate": 6.59993460923275e-08, + "loss": 0.2428, + "step": 9759 + }, + { + "epoch": 0.9498783454987835, + "grad_norm": 1.8694835256629623, + "learning_rate": 6.57443440958283e-08, + "loss": 0.3082, + "step": 9760 + }, + { + "epoch": 0.9499756690997567, + "grad_norm": 1.6308622257413332, + "learning_rate": 6.548983241934648e-08, + "loss": 0.1986, + "step": 9761 + }, + { + "epoch": 0.95007299270073, + "grad_norm": 1.7090675900499104, + "learning_rate": 6.523581108817289e-08, + "loss": 0.3768, + "step": 9762 + }, + { + "epoch": 0.9501703163017031, + "grad_norm": 1.58343594201875, + "learning_rate": 6.498228012755014e-08, + "loss": 0.4057, + "step": 9763 + }, + { + "epoch": 0.9502676399026764, + "grad_norm": 1.5679084253039652, + "learning_rate": 6.472923956267085e-08, + "loss": 0.2375, + "step": 9764 + }, + { + "epoch": 0.9503649635036496, + "grad_norm": 1.8368596784297289, + "learning_rate": 6.447668941868157e-08, + "loss": 0.2759, + "step": 9765 + }, + { + "epoch": 0.9504622871046229, + "grad_norm": 1.861700831241794, + "learning_rate": 6.422462972067667e-08, + "loss": 0.3168, + "step": 9766 + }, + { + "epoch": 0.9505596107055961, + "grad_norm": 1.4039925151255923, + "learning_rate": 6.397306049370389e-08, + "loss": 0.3151, + "step": 9767 + }, + { + "epoch": 0.9506569343065694, + "grad_norm": 1.3457329778585063, + "learning_rate": 6.372198176276212e-08, + "loss": 0.2285, + "step": 9768 + }, + { + "epoch": 0.9507542579075425, + "grad_norm": 1.5004302043704851, + "learning_rate": 6.347139355280141e-08, + "loss": 0.2331, + "step": 9769 + }, + { + "epoch": 0.9508515815085158, + "grad_norm": 1.8792631639062922, + "learning_rate": 6.322129588872239e-08, + "loss": 0.3211, + "step": 9770 + }, + { + "epoch": 0.950948905109489, + "grad_norm": 1.5391456852618615, + "learning_rate": 6.297168879537741e-08, + "loss": 0.323, + "step": 9771 + }, + { + "epoch": 0.9510462287104623, + "grad_norm": 1.7281448914967952, + "learning_rate": 6.27225722975705e-08, + "loss": 0.4145, + "step": 9772 + }, + { + "epoch": 0.9511435523114355, + "grad_norm": 1.861115290350832, + "learning_rate": 6.247394642005577e-08, + "loss": 0.2593, + "step": 9773 + }, + { + "epoch": 0.9512408759124088, + "grad_norm": 1.6346083990439721, + "learning_rate": 6.222581118754067e-08, + "loss": 0.2847, + "step": 9774 + }, + { + "epoch": 0.9513381995133819, + "grad_norm": 1.5410979313016757, + "learning_rate": 6.197816662468104e-08, + "loss": 0.4055, + "step": 9775 + }, + { + "epoch": 0.9514355231143552, + "grad_norm": 1.6839182954661618, + "learning_rate": 6.173101275608661e-08, + "loss": 0.2348, + "step": 9776 + }, + { + "epoch": 0.9515328467153285, + "grad_norm": 1.8327159130926505, + "learning_rate": 6.14843496063161e-08, + "loss": 0.6209, + "step": 9777 + }, + { + "epoch": 0.9516301703163017, + "grad_norm": 1.6673694400490178, + "learning_rate": 6.123817719988157e-08, + "loss": 0.2757, + "step": 9778 + }, + { + "epoch": 0.951727493917275, + "grad_norm": 1.6796230569941655, + "learning_rate": 6.099249556124509e-08, + "loss": 0.4392, + "step": 9779 + }, + { + "epoch": 0.9518248175182482, + "grad_norm": 1.7210120834112579, + "learning_rate": 6.074730471482049e-08, + "loss": 0.4603, + "step": 9780 + }, + { + "epoch": 0.9519221411192215, + "grad_norm": 2.2669090052626437, + "learning_rate": 6.05026046849716e-08, + "loss": 0.2671, + "step": 9781 + }, + { + "epoch": 0.9520194647201946, + "grad_norm": 1.9144622385321428, + "learning_rate": 6.025839549601508e-08, + "loss": 0.3334, + "step": 9782 + }, + { + "epoch": 0.9521167883211679, + "grad_norm": 2.6538754789035663, + "learning_rate": 6.001467717221764e-08, + "loss": 0.393, + "step": 9783 + }, + { + "epoch": 0.9522141119221411, + "grad_norm": 1.4103059557123239, + "learning_rate": 5.977144973779824e-08, + "loss": 0.4001, + "step": 9784 + }, + { + "epoch": 0.9523114355231144, + "grad_norm": 2.9589168528290157, + "learning_rate": 5.9528713216926436e-08, + "loss": 0.2247, + "step": 9785 + }, + { + "epoch": 0.9524087591240876, + "grad_norm": 1.7728013565571275, + "learning_rate": 5.928646763372292e-08, + "loss": 0.2472, + "step": 9786 + }, + { + "epoch": 0.9525060827250609, + "grad_norm": 1.591946947009317, + "learning_rate": 5.904471301226067e-08, + "loss": 0.4016, + "step": 9787 + }, + { + "epoch": 0.952603406326034, + "grad_norm": 1.7821070968093615, + "learning_rate": 5.8803449376561574e-08, + "loss": 0.2947, + "step": 9788 + }, + { + "epoch": 0.9527007299270073, + "grad_norm": 1.6407902225677349, + "learning_rate": 5.85626767506009e-08, + "loss": 0.3414, + "step": 9789 + }, + { + "epoch": 0.9527980535279805, + "grad_norm": 1.5564056563761606, + "learning_rate": 5.832239515830451e-08, + "loss": 0.3307, + "step": 9790 + }, + { + "epoch": 0.9528953771289538, + "grad_norm": 1.6867227530420887, + "learning_rate": 5.808260462354887e-08, + "loss": 0.3294, + "step": 9791 + }, + { + "epoch": 0.952992700729927, + "grad_norm": 1.5611010583182072, + "learning_rate": 5.784330517016268e-08, + "loss": 0.3231, + "step": 9792 + }, + { + "epoch": 0.9530900243309003, + "grad_norm": 1.6248307506746704, + "learning_rate": 5.760449682192415e-08, + "loss": 0.2021, + "step": 9793 + }, + { + "epoch": 0.9531873479318734, + "grad_norm": 1.6804968015350348, + "learning_rate": 5.736617960256541e-08, + "loss": 0.4313, + "step": 9794 + }, + { + "epoch": 0.9532846715328467, + "grad_norm": 1.5018434323260688, + "learning_rate": 5.712835353576696e-08, + "loss": 0.2555, + "step": 9795 + }, + { + "epoch": 0.9533819951338199, + "grad_norm": 1.6371291927375125, + "learning_rate": 5.689101864516211e-08, + "loss": 0.385, + "step": 9796 + }, + { + "epoch": 0.9534793187347932, + "grad_norm": 1.567270549425866, + "learning_rate": 5.665417495433534e-08, + "loss": 0.2547, + "step": 9797 + }, + { + "epoch": 0.9535766423357664, + "grad_norm": 1.7579549095959097, + "learning_rate": 5.64178224868217e-08, + "loss": 0.1862, + "step": 9798 + }, + { + "epoch": 0.9536739659367397, + "grad_norm": 1.4619812526309746, + "learning_rate": 5.618196126610742e-08, + "loss": 0.2722, + "step": 9799 + }, + { + "epoch": 0.9537712895377128, + "grad_norm": 1.6742629351105547, + "learning_rate": 5.5946591315630407e-08, + "loss": 0.3723, + "step": 9800 + }, + { + "epoch": 0.9538686131386861, + "grad_norm": 1.4878994509554617, + "learning_rate": 5.571171265877917e-08, + "loss": 0.1954, + "step": 9801 + }, + { + "epoch": 0.9539659367396593, + "grad_norm": 1.7936589526951254, + "learning_rate": 5.547732531889449e-08, + "loss": 0.4076, + "step": 9802 + }, + { + "epoch": 0.9540632603406326, + "grad_norm": 1.5153179840913809, + "learning_rate": 5.524342931926663e-08, + "loss": 0.3344, + "step": 9803 + }, + { + "epoch": 0.9541605839416059, + "grad_norm": 1.4933350629172073, + "learning_rate": 5.501002468313865e-08, + "loss": 0.2154, + "step": 9804 + }, + { + "epoch": 0.9542579075425791, + "grad_norm": 1.3534508011172486, + "learning_rate": 5.4777111433704236e-08, + "loss": 0.1614, + "step": 9805 + }, + { + "epoch": 0.9543552311435524, + "grad_norm": 1.5845762527525626, + "learning_rate": 5.454468959410764e-08, + "loss": 0.4162, + "step": 9806 + }, + { + "epoch": 0.9544525547445255, + "grad_norm": 1.9101854567839955, + "learning_rate": 5.431275918744483e-08, + "loss": 0.3255, + "step": 9807 + }, + { + "epoch": 0.9545498783454988, + "grad_norm": 1.8361067673591227, + "learning_rate": 5.408132023676349e-08, + "loss": 0.4645, + "step": 9808 + }, + { + "epoch": 0.954647201946472, + "grad_norm": 1.7262256690761473, + "learning_rate": 5.385037276506133e-08, + "loss": 0.4054, + "step": 9809 + }, + { + "epoch": 0.9547445255474453, + "grad_norm": 1.7111732634008543, + "learning_rate": 5.361991679528722e-08, + "loss": 0.4287, + "step": 9810 + }, + { + "epoch": 0.9548418491484185, + "grad_norm": 1.748744582412541, + "learning_rate": 5.338995235034228e-08, + "loss": 0.2668, + "step": 9811 + }, + { + "epoch": 0.9549391727493918, + "grad_norm": 1.7507861279032826, + "learning_rate": 5.316047945307878e-08, + "loss": 0.4201, + "step": 9812 + }, + { + "epoch": 0.9550364963503649, + "grad_norm": 2.0980199289020858, + "learning_rate": 5.2931498126298495e-08, + "loss": 0.4562, + "step": 9813 + }, + { + "epoch": 0.9551338199513382, + "grad_norm": 1.4991666545235318, + "learning_rate": 5.2703008392756546e-08, + "loss": 0.2123, + "step": 9814 + }, + { + "epoch": 0.9552311435523114, + "grad_norm": 1.6726984865930803, + "learning_rate": 5.2475010275157e-08, + "loss": 0.3634, + "step": 9815 + }, + { + "epoch": 0.9553284671532847, + "grad_norm": 1.7410735944238402, + "learning_rate": 5.224750379615673e-08, + "loss": 0.2259, + "step": 9816 + }, + { + "epoch": 0.9554257907542579, + "grad_norm": 1.8803518210596986, + "learning_rate": 5.2020488978363204e-08, + "loss": 0.3215, + "step": 9817 + }, + { + "epoch": 0.9555231143552312, + "grad_norm": 1.985821594649675, + "learning_rate": 5.179396584433449e-08, + "loss": 0.5381, + "step": 9818 + }, + { + "epoch": 0.9556204379562043, + "grad_norm": 1.6454925989189146, + "learning_rate": 5.156793441658148e-08, + "loss": 0.5229, + "step": 9819 + }, + { + "epoch": 0.9557177615571776, + "grad_norm": 1.6655664229694114, + "learning_rate": 5.134239471756397e-08, + "loss": 0.4189, + "step": 9820 + }, + { + "epoch": 0.9558150851581508, + "grad_norm": 1.7412129914019714, + "learning_rate": 5.1117346769694596e-08, + "loss": 0.4331, + "step": 9821 + }, + { + "epoch": 0.9559124087591241, + "grad_norm": 1.6724674209135837, + "learning_rate": 5.089279059533658e-08, + "loss": 0.4191, + "step": 9822 + }, + { + "epoch": 0.9560097323600973, + "grad_norm": 1.8526388009771322, + "learning_rate": 5.066872621680374e-08, + "loss": 0.4356, + "step": 9823 + }, + { + "epoch": 0.9561070559610706, + "grad_norm": 1.738078107643688, + "learning_rate": 5.04451536563616e-08, + "loss": 0.436, + "step": 9824 + }, + { + "epoch": 0.9562043795620438, + "grad_norm": 1.763167975712295, + "learning_rate": 5.022207293622627e-08, + "loss": 0.487, + "step": 9825 + }, + { + "epoch": 0.956301703163017, + "grad_norm": 1.8475275942505796, + "learning_rate": 4.99994840785667e-08, + "loss": 0.3893, + "step": 9826 + }, + { + "epoch": 0.9563990267639902, + "grad_norm": 1.8481247289848968, + "learning_rate": 4.9777387105500174e-08, + "loss": 0.3688, + "step": 9827 + }, + { + "epoch": 0.9564963503649635, + "grad_norm": 1.5557808410147473, + "learning_rate": 4.955578203909794e-08, + "loss": 0.2924, + "step": 9828 + }, + { + "epoch": 0.9565936739659368, + "grad_norm": 1.6593279758263173, + "learning_rate": 4.933466890138017e-08, + "loss": 0.3843, + "step": 9829 + }, + { + "epoch": 0.95669099756691, + "grad_norm": 1.7198532725004956, + "learning_rate": 4.9114047714319267e-08, + "loss": 0.555, + "step": 9830 + }, + { + "epoch": 0.9567883211678833, + "grad_norm": 1.662402977657864, + "learning_rate": 4.889391849983882e-08, + "loss": 0.4463, + "step": 9831 + }, + { + "epoch": 0.9568856447688564, + "grad_norm": 1.4815475840965286, + "learning_rate": 4.867428127981244e-08, + "loss": 0.2648, + "step": 9832 + }, + { + "epoch": 0.9569829683698297, + "grad_norm": 1.2693915187525318, + "learning_rate": 4.845513607606655e-08, + "loss": 0.1456, + "step": 9833 + }, + { + "epoch": 0.9570802919708029, + "grad_norm": 1.6151397566660393, + "learning_rate": 4.823648291037708e-08, + "loss": 0.4091, + "step": 9834 + }, + { + "epoch": 0.9571776155717762, + "grad_norm": 1.8275903425312592, + "learning_rate": 4.801832180447163e-08, + "loss": 0.2092, + "step": 9835 + }, + { + "epoch": 0.9572749391727494, + "grad_norm": 1.603980198050626, + "learning_rate": 4.7800652780029545e-08, + "loss": 0.3691, + "step": 9836 + }, + { + "epoch": 0.9573722627737227, + "grad_norm": 2.3002344592464663, + "learning_rate": 4.758347585868017e-08, + "loss": 0.284, + "step": 9837 + }, + { + "epoch": 0.9574695863746958, + "grad_norm": 2.05701943700914, + "learning_rate": 4.736679106200459e-08, + "loss": 0.5654, + "step": 9838 + }, + { + "epoch": 0.9575669099756691, + "grad_norm": 1.7394315535370477, + "learning_rate": 4.715059841153558e-08, + "loss": 0.3823, + "step": 9839 + }, + { + "epoch": 0.9576642335766423, + "grad_norm": 1.5141401587884566, + "learning_rate": 4.6934897928755943e-08, + "loss": 0.2991, + "step": 9840 + }, + { + "epoch": 0.9577615571776156, + "grad_norm": 1.6214141896680154, + "learning_rate": 4.6719689635099653e-08, + "loss": 0.353, + "step": 9841 + }, + { + "epoch": 0.9578588807785888, + "grad_norm": 1.8569511194076662, + "learning_rate": 4.6504973551952384e-08, + "loss": 0.4173, + "step": 9842 + }, + { + "epoch": 0.9579562043795621, + "grad_norm": 1.660269412503503, + "learning_rate": 4.62907497006504e-08, + "loss": 0.3065, + "step": 9843 + }, + { + "epoch": 0.9580535279805352, + "grad_norm": 1.7276276629872958, + "learning_rate": 4.6077018102481665e-08, + "loss": 0.4756, + "step": 9844 + }, + { + "epoch": 0.9581508515815085, + "grad_norm": 1.5104217879986812, + "learning_rate": 4.5863778778684755e-08, + "loss": 0.2376, + "step": 9845 + }, + { + "epoch": 0.9582481751824817, + "grad_norm": 1.8315556606340109, + "learning_rate": 4.565103175044883e-08, + "loss": 0.3407, + "step": 9846 + }, + { + "epoch": 0.958345498783455, + "grad_norm": 1.7182529077415112, + "learning_rate": 4.5438777038915307e-08, + "loss": 0.3382, + "step": 9847 + }, + { + "epoch": 0.9584428223844282, + "grad_norm": 1.7101483201513556, + "learning_rate": 4.522701466517565e-08, + "loss": 0.223, + "step": 9848 + }, + { + "epoch": 0.9585401459854015, + "grad_norm": 1.6865938367436308, + "learning_rate": 4.501574465027303e-08, + "loss": 0.3083, + "step": 9849 + }, + { + "epoch": 0.9586374695863747, + "grad_norm": 2.0210775509639647, + "learning_rate": 4.4804967015201207e-08, + "loss": 0.3989, + "step": 9850 + }, + { + "epoch": 0.9587347931873479, + "grad_norm": 1.9935098485500062, + "learning_rate": 4.4594681780905645e-08, + "loss": 0.4355, + "step": 9851 + }, + { + "epoch": 0.9588321167883211, + "grad_norm": 1.9006793438945706, + "learning_rate": 4.438488896828297e-08, + "loss": 0.3838, + "step": 9852 + }, + { + "epoch": 0.9589294403892944, + "grad_norm": 1.8697504944018382, + "learning_rate": 4.417558859817927e-08, + "loss": 0.3419, + "step": 9853 + }, + { + "epoch": 0.9590267639902676, + "grad_norm": 1.7662553590340924, + "learning_rate": 4.3966780691393484e-08, + "loss": 0.4552, + "step": 9854 + }, + { + "epoch": 0.9591240875912409, + "grad_norm": 1.70795700145977, + "learning_rate": 4.375846526867511e-08, + "loss": 0.6085, + "step": 9855 + }, + { + "epoch": 0.9592214111922142, + "grad_norm": 1.6705647976549916, + "learning_rate": 4.355064235072426e-08, + "loss": 0.2607, + "step": 9856 + }, + { + "epoch": 0.9593187347931873, + "grad_norm": 1.8085412827331584, + "learning_rate": 4.3343311958192194e-08, + "loss": 0.4392, + "step": 9857 + }, + { + "epoch": 0.9594160583941606, + "grad_norm": 1.261661917997937, + "learning_rate": 4.313647411168187e-08, + "loss": 0.2046, + "step": 9858 + }, + { + "epoch": 0.9595133819951338, + "grad_norm": 1.832542364403613, + "learning_rate": 4.2930128831747406e-08, + "loss": 0.3408, + "step": 9859 + }, + { + "epoch": 0.9596107055961071, + "grad_norm": 2.290535202452011, + "learning_rate": 4.272427613889241e-08, + "loss": 0.3567, + "step": 9860 + }, + { + "epoch": 0.9597080291970803, + "grad_norm": 1.6438535621422306, + "learning_rate": 4.251891605357328e-08, + "loss": 0.3944, + "step": 9861 + }, + { + "epoch": 0.9598053527980536, + "grad_norm": 2.0987505056574873, + "learning_rate": 4.2314048596196475e-08, + "loss": 0.3781, + "step": 9862 + }, + { + "epoch": 0.9599026763990267, + "grad_norm": 1.635230213721504, + "learning_rate": 4.2109673787120166e-08, + "loss": 0.4036, + "step": 9863 + }, + { + "epoch": 0.96, + "grad_norm": 2.7257439114130784, + "learning_rate": 4.190579164665309e-08, + "loss": 0.3375, + "step": 9864 + }, + { + "epoch": 0.9600973236009732, + "grad_norm": 2.007297910064611, + "learning_rate": 4.1702402195054616e-08, + "loss": 0.3373, + "step": 9865 + }, + { + "epoch": 0.9601946472019465, + "grad_norm": 1.5907261154921255, + "learning_rate": 4.149950545253634e-08, + "loss": 0.3125, + "step": 9866 + }, + { + "epoch": 0.9602919708029197, + "grad_norm": 1.6126486066606243, + "learning_rate": 4.129710143925936e-08, + "loss": 0.3969, + "step": 9867 + }, + { + "epoch": 0.960389294403893, + "grad_norm": 1.7575593893291466, + "learning_rate": 4.1095190175337586e-08, + "loss": 0.3686, + "step": 9868 + }, + { + "epoch": 0.9604866180048662, + "grad_norm": 1.9087454924064928, + "learning_rate": 4.0893771680834414e-08, + "loss": 0.3364, + "step": 9869 + }, + { + "epoch": 0.9605839416058394, + "grad_norm": 1.8541954409763288, + "learning_rate": 4.069284597576606e-08, + "loss": 0.5319, + "step": 9870 + }, + { + "epoch": 0.9606812652068126, + "grad_norm": 1.7642721884403707, + "learning_rate": 4.04924130800971e-08, + "loss": 0.2798, + "step": 9871 + }, + { + "epoch": 0.9607785888077859, + "grad_norm": 1.7307785135819032, + "learning_rate": 4.029247301374606e-08, + "loss": 0.3835, + "step": 9872 + }, + { + "epoch": 0.9608759124087591, + "grad_norm": 1.573230442640246, + "learning_rate": 4.009302579657981e-08, + "loss": 0.41, + "step": 9873 + }, + { + "epoch": 0.9609732360097324, + "grad_norm": 1.717797171302427, + "learning_rate": 3.989407144841861e-08, + "loss": 0.3987, + "step": 9874 + }, + { + "epoch": 0.9610705596107056, + "grad_norm": 1.7002738395492552, + "learning_rate": 3.9695609989032215e-08, + "loss": 0.4122, + "step": 9875 + }, + { + "epoch": 0.9611678832116788, + "grad_norm": 1.2003027772397932, + "learning_rate": 3.94976414381415e-08, + "loss": 0.1549, + "step": 9876 + }, + { + "epoch": 0.961265206812652, + "grad_norm": 1.7014620300280212, + "learning_rate": 3.9300165815419624e-08, + "loss": 0.5821, + "step": 9877 + }, + { + "epoch": 0.9613625304136253, + "grad_norm": 1.5642581203811425, + "learning_rate": 3.910318314048922e-08, + "loss": 0.3275, + "step": 9878 + }, + { + "epoch": 0.9614598540145985, + "grad_norm": 1.938545560209398, + "learning_rate": 3.890669343292464e-08, + "loss": 0.4224, + "step": 9879 + }, + { + "epoch": 0.9615571776155718, + "grad_norm": 1.7948291576128046, + "learning_rate": 3.8710696712250806e-08, + "loss": 0.4855, + "step": 9880 + }, + { + "epoch": 0.961654501216545, + "grad_norm": 2.0192025542471277, + "learning_rate": 3.8515192997945485e-08, + "loss": 0.5638, + "step": 9881 + }, + { + "epoch": 0.9617518248175182, + "grad_norm": 1.6656909550245977, + "learning_rate": 3.832018230943424e-08, + "loss": 0.3342, + "step": 9882 + }, + { + "epoch": 0.9618491484184915, + "grad_norm": 1.7018518851591566, + "learning_rate": 3.812566466609657e-08, + "loss": 0.4037, + "step": 9883 + }, + { + "epoch": 0.9619464720194647, + "grad_norm": 1.6699913322772013, + "learning_rate": 3.793164008726147e-08, + "loss": 0.3492, + "step": 9884 + }, + { + "epoch": 0.962043795620438, + "grad_norm": 1.5197912155614353, + "learning_rate": 3.773810859220905e-08, + "loss": 0.3243, + "step": 9885 + }, + { + "epoch": 0.9621411192214112, + "grad_norm": 1.4762939531855295, + "learning_rate": 3.754507020017062e-08, + "loss": 0.2523, + "step": 9886 + }, + { + "epoch": 0.9622384428223845, + "grad_norm": 1.6874629829020105, + "learning_rate": 3.7352524930329146e-08, + "loss": 0.2114, + "step": 9887 + }, + { + "epoch": 0.9623357664233577, + "grad_norm": 1.4512169729544038, + "learning_rate": 3.716047280181712e-08, + "loss": 0.3165, + "step": 9888 + }, + { + "epoch": 0.9624330900243309, + "grad_norm": 1.5674491321639075, + "learning_rate": 3.696891383371926e-08, + "loss": 0.2873, + "step": 9889 + }, + { + "epoch": 0.9625304136253041, + "grad_norm": 1.5617606893521376, + "learning_rate": 3.677784804507145e-08, + "loss": 0.4669, + "step": 9890 + }, + { + "epoch": 0.9626277372262774, + "grad_norm": 1.9835037361265613, + "learning_rate": 3.6587275454859075e-08, + "loss": 0.294, + "step": 9891 + }, + { + "epoch": 0.9627250608272506, + "grad_norm": 1.4330672042911214, + "learning_rate": 3.6397196082020306e-08, + "loss": 0.3996, + "step": 9892 + }, + { + "epoch": 0.9628223844282239, + "grad_norm": 1.639342906083656, + "learning_rate": 3.620760994544281e-08, + "loss": 0.2995, + "step": 9893 + }, + { + "epoch": 0.9629197080291971, + "grad_norm": 1.4309475041856383, + "learning_rate": 3.601851706396598e-08, + "loss": 0.25, + "step": 9894 + }, + { + "epoch": 0.9630170316301703, + "grad_norm": 1.5523318746695212, + "learning_rate": 3.582991745638087e-08, + "loss": 0.3242, + "step": 9895 + }, + { + "epoch": 0.9631143552311435, + "grad_norm": 1.8181221681260038, + "learning_rate": 3.564181114142751e-08, + "loss": 0.2435, + "step": 9896 + }, + { + "epoch": 0.9632116788321168, + "grad_norm": 1.6854399363165287, + "learning_rate": 3.545419813779871e-08, + "loss": 0.2825, + "step": 9897 + }, + { + "epoch": 0.96330900243309, + "grad_norm": 1.9295821064573289, + "learning_rate": 3.526707846413735e-08, + "loss": 0.295, + "step": 9898 + }, + { + "epoch": 0.9634063260340633, + "grad_norm": 1.6548811816294484, + "learning_rate": 3.5080452139038545e-08, + "loss": 0.4009, + "step": 9899 + }, + { + "epoch": 0.9635036496350365, + "grad_norm": 1.5077902836719443, + "learning_rate": 3.489431918104635e-08, + "loss": 0.196, + "step": 9900 + }, + { + "epoch": 0.9636009732360097, + "grad_norm": 1.6034848469279006, + "learning_rate": 3.47086796086582e-08, + "loss": 0.285, + "step": 9901 + }, + { + "epoch": 0.9636982968369829, + "grad_norm": 1.7657819019920982, + "learning_rate": 3.452353344032045e-08, + "loss": 0.4627, + "step": 9902 + }, + { + "epoch": 0.9637956204379562, + "grad_norm": 1.7345375465998352, + "learning_rate": 3.433888069443059e-08, + "loss": 0.2779, + "step": 9903 + }, + { + "epoch": 0.9638929440389294, + "grad_norm": 1.7657146599884272, + "learning_rate": 3.4154721389338394e-08, + "loss": 0.2393, + "step": 9904 + }, + { + "epoch": 0.9639902676399027, + "grad_norm": 1.584046838848143, + "learning_rate": 3.3971055543344234e-08, + "loss": 0.3594, + "step": 9905 + }, + { + "epoch": 0.964087591240876, + "grad_norm": 1.7964954804709472, + "learning_rate": 3.37878831746985e-08, + "loss": 0.4295, + "step": 9906 + }, + { + "epoch": 0.9641849148418491, + "grad_norm": 1.486362772674352, + "learning_rate": 3.3605204301602745e-08, + "loss": 0.2543, + "step": 9907 + }, + { + "epoch": 0.9642822384428223, + "grad_norm": 1.660046885758981, + "learning_rate": 3.3423018942210784e-08, + "loss": 0.49, + "step": 9908 + }, + { + "epoch": 0.9643795620437956, + "grad_norm": 1.658484193594878, + "learning_rate": 3.324132711462646e-08, + "loss": 0.4607, + "step": 9909 + }, + { + "epoch": 0.9644768856447689, + "grad_norm": 1.6110362395431892, + "learning_rate": 3.3060128836903685e-08, + "loss": 0.2894, + "step": 9910 + }, + { + "epoch": 0.9645742092457421, + "grad_norm": 1.5432748225403545, + "learning_rate": 3.287942412704914e-08, + "loss": 0.3871, + "step": 9911 + }, + { + "epoch": 0.9646715328467154, + "grad_norm": 1.7856457996859385, + "learning_rate": 3.269921300301959e-08, + "loss": 0.3756, + "step": 9912 + }, + { + "epoch": 0.9647688564476886, + "grad_norm": 1.8264974538480951, + "learning_rate": 3.251949548272182e-08, + "loss": 0.3931, + "step": 9913 + }, + { + "epoch": 0.9648661800486618, + "grad_norm": 1.6917628381511742, + "learning_rate": 3.234027158401543e-08, + "loss": 0.3752, + "step": 9914 + }, + { + "epoch": 0.964963503649635, + "grad_norm": 1.5773470632472013, + "learning_rate": 3.21615413247095e-08, + "loss": 0.2844, + "step": 9915 + }, + { + "epoch": 0.9650608272506083, + "grad_norm": 1.6762565170036234, + "learning_rate": 3.198330472256428e-08, + "loss": 0.5624, + "step": 9916 + }, + { + "epoch": 0.9651581508515815, + "grad_norm": 2.405710729635851, + "learning_rate": 3.180556179529226e-08, + "loss": 0.3577, + "step": 9917 + }, + { + "epoch": 0.9652554744525548, + "grad_norm": 1.6648471755265968, + "learning_rate": 3.162831256055543e-08, + "loss": 0.5358, + "step": 9918 + }, + { + "epoch": 0.965352798053528, + "grad_norm": 1.9490035249767261, + "learning_rate": 3.145155703596636e-08, + "loss": 0.3124, + "step": 9919 + }, + { + "epoch": 0.9654501216545012, + "grad_norm": 1.396886807590223, + "learning_rate": 3.12752952390899e-08, + "loss": 0.3546, + "step": 9920 + }, + { + "epoch": 0.9655474452554744, + "grad_norm": 1.672555303756909, + "learning_rate": 3.1099527187442024e-08, + "loss": 0.4216, + "step": 9921 + }, + { + "epoch": 0.9656447688564477, + "grad_norm": 1.6514266695378392, + "learning_rate": 3.092425289848766e-08, + "loss": 0.4191, + "step": 9922 + }, + { + "epoch": 0.9657420924574209, + "grad_norm": 1.919164467447434, + "learning_rate": 3.0749472389644544e-08, + "loss": 0.2602, + "step": 9923 + }, + { + "epoch": 0.9658394160583942, + "grad_norm": 1.7333900472948351, + "learning_rate": 3.057518567828155e-08, + "loss": 0.3903, + "step": 9924 + }, + { + "epoch": 0.9659367396593674, + "grad_norm": 1.6707671585994246, + "learning_rate": 3.040139278171594e-08, + "loss": 0.3928, + "step": 9925 + }, + { + "epoch": 0.9660340632603406, + "grad_norm": 1.5325461371043665, + "learning_rate": 3.022809371721891e-08, + "loss": 0.4426, + "step": 9926 + }, + { + "epoch": 0.9661313868613138, + "grad_norm": 1.5901351024401829, + "learning_rate": 3.005528850201056e-08, + "loss": 0.4088, + "step": 9927 + }, + { + "epoch": 0.9662287104622871, + "grad_norm": 1.7558846307450886, + "learning_rate": 2.988297715326327e-08, + "loss": 0.2942, + "step": 9928 + }, + { + "epoch": 0.9663260340632603, + "grad_norm": 1.4695576821058698, + "learning_rate": 2.9711159688099455e-08, + "loss": 0.2718, + "step": 9929 + }, + { + "epoch": 0.9664233576642336, + "grad_norm": 1.774905500448534, + "learning_rate": 2.9539836123592125e-08, + "loss": 0.4286, + "step": 9930 + }, + { + "epoch": 0.9665206812652068, + "grad_norm": 1.444980975095119, + "learning_rate": 2.9369006476766548e-08, + "loss": 0.2678, + "step": 9931 + }, + { + "epoch": 0.9666180048661801, + "grad_norm": 1.9529069702524104, + "learning_rate": 2.9198670764598036e-08, + "loss": 0.3273, + "step": 9932 + }, + { + "epoch": 0.9667153284671532, + "grad_norm": 1.552716870479163, + "learning_rate": 2.9028829004013047e-08, + "loss": 0.2489, + "step": 9933 + }, + { + "epoch": 0.9668126520681265, + "grad_norm": 1.6299659703218803, + "learning_rate": 2.8859481211888642e-08, + "loss": 0.3771, + "step": 9934 + }, + { + "epoch": 0.9669099756690998, + "grad_norm": 2.6198865950407133, + "learning_rate": 2.8690627405053573e-08, + "loss": 0.3106, + "step": 9935 + }, + { + "epoch": 0.967007299270073, + "grad_norm": 1.8186750127795688, + "learning_rate": 2.852226760028609e-08, + "loss": 0.4026, + "step": 9936 + }, + { + "epoch": 0.9671046228710463, + "grad_norm": 1.7731913257089829, + "learning_rate": 2.8354401814316702e-08, + "loss": 0.5187, + "step": 9937 + }, + { + "epoch": 0.9672019464720195, + "grad_norm": 1.5957576597646366, + "learning_rate": 2.818703006382595e-08, + "loss": 0.4586, + "step": 9938 + }, + { + "epoch": 0.9672992700729927, + "grad_norm": 1.8551227447740757, + "learning_rate": 2.8020152365446086e-08, + "loss": 0.4946, + "step": 9939 + }, + { + "epoch": 0.9673965936739659, + "grad_norm": 1.7712583218892546, + "learning_rate": 2.7853768735759957e-08, + "loss": 0.3586, + "step": 9940 + }, + { + "epoch": 0.9674939172749392, + "grad_norm": 1.7282402896881461, + "learning_rate": 2.768787919130045e-08, + "loss": 0.3963, + "step": 9941 + }, + { + "epoch": 0.9675912408759124, + "grad_norm": 1.8674962255396355, + "learning_rate": 2.752248374855271e-08, + "loss": 0.7935, + "step": 9942 + }, + { + "epoch": 0.9676885644768857, + "grad_norm": 2.111131177215691, + "learning_rate": 2.735758242395248e-08, + "loss": 0.2824, + "step": 9943 + }, + { + "epoch": 0.9677858880778589, + "grad_norm": 1.7061938713259677, + "learning_rate": 2.7193175233884984e-08, + "loss": 0.2771, + "step": 9944 + }, + { + "epoch": 0.9678832116788321, + "grad_norm": 1.9167351572266338, + "learning_rate": 2.702926219468882e-08, + "loss": 0.213, + "step": 9945 + }, + { + "epoch": 0.9679805352798053, + "grad_norm": 1.6091481761900583, + "learning_rate": 2.686584332265152e-08, + "loss": 0.534, + "step": 9946 + }, + { + "epoch": 0.9680778588807786, + "grad_norm": 1.7254827929478622, + "learning_rate": 2.6702918634011753e-08, + "loss": 0.4774, + "step": 9947 + }, + { + "epoch": 0.9681751824817518, + "grad_norm": 2.1888844060233352, + "learning_rate": 2.654048814495991e-08, + "loss": 0.5834, + "step": 9948 + }, + { + "epoch": 0.9682725060827251, + "grad_norm": 1.6189000741328867, + "learning_rate": 2.6378551871636958e-08, + "loss": 0.3642, + "step": 9949 + }, + { + "epoch": 0.9683698296836983, + "grad_norm": 1.5868916014035435, + "learning_rate": 2.621710983013448e-08, + "loss": 0.438, + "step": 9950 + }, + { + "epoch": 0.9684671532846715, + "grad_norm": 1.7988042011803174, + "learning_rate": 2.605616203649408e-08, + "loss": 0.4758, + "step": 9951 + }, + { + "epoch": 0.9685644768856447, + "grad_norm": 1.4619066420413294, + "learning_rate": 2.5895708506710748e-08, + "loss": 0.3929, + "step": 9952 + }, + { + "epoch": 0.968661800486618, + "grad_norm": 2.2085527155126923, + "learning_rate": 2.5735749256727837e-08, + "loss": 0.3535, + "step": 9953 + }, + { + "epoch": 0.9687591240875912, + "grad_norm": 1.557934461466324, + "learning_rate": 2.5576284302441523e-08, + "loss": 0.3284, + "step": 9954 + }, + { + "epoch": 0.9688564476885645, + "grad_norm": 1.7223017667359877, + "learning_rate": 2.5417313659696906e-08, + "loss": 0.2646, + "step": 9955 + }, + { + "epoch": 0.9689537712895377, + "grad_norm": 1.8681473439610643, + "learning_rate": 2.5258837344291354e-08, + "loss": 0.2738, + "step": 9956 + }, + { + "epoch": 0.969051094890511, + "grad_norm": 1.7076254362666061, + "learning_rate": 2.5100855371973375e-08, + "loss": 0.4329, + "step": 9957 + }, + { + "epoch": 0.9691484184914841, + "grad_norm": 2.11838680518988, + "learning_rate": 2.4943367758440416e-08, + "loss": 0.3442, + "step": 9958 + }, + { + "epoch": 0.9692457420924574, + "grad_norm": 1.3884322634402237, + "learning_rate": 2.4786374519343847e-08, + "loss": 0.2282, + "step": 9959 + }, + { + "epoch": 0.9693430656934306, + "grad_norm": 1.8915263147704275, + "learning_rate": 2.4629875670282856e-08, + "loss": 0.2545, + "step": 9960 + }, + { + "epoch": 0.9694403892944039, + "grad_norm": 1.59748940191201, + "learning_rate": 2.4473871226808887e-08, + "loss": 0.4663, + "step": 9961 + }, + { + "epoch": 0.9695377128953772, + "grad_norm": 1.395642561710221, + "learning_rate": 2.4318361204424545e-08, + "loss": 0.163, + "step": 9962 + }, + { + "epoch": 0.9696350364963504, + "grad_norm": 1.6200335241310506, + "learning_rate": 2.416334561858358e-08, + "loss": 0.2688, + "step": 9963 + }, + { + "epoch": 0.9697323600973236, + "grad_norm": 1.4701760045258008, + "learning_rate": 2.4008824484688664e-08, + "loss": 0.4615, + "step": 9964 + }, + { + "epoch": 0.9698296836982968, + "grad_norm": 1.3143597097772706, + "learning_rate": 2.3854797818095852e-08, + "loss": 0.2689, + "step": 9965 + }, + { + "epoch": 0.9699270072992701, + "grad_norm": 1.7555102382047365, + "learning_rate": 2.3701265634110126e-08, + "loss": 0.3909, + "step": 9966 + }, + { + "epoch": 0.9700243309002433, + "grad_norm": 1.8183477055127026, + "learning_rate": 2.3548227947988167e-08, + "loss": 0.3412, + "step": 9967 + }, + { + "epoch": 0.9701216545012166, + "grad_norm": 1.9289186621722776, + "learning_rate": 2.339568477493781e-08, + "loss": 0.3447, + "step": 9968 + }, + { + "epoch": 0.9702189781021898, + "grad_norm": 1.5832849879998572, + "learning_rate": 2.3243636130116932e-08, + "loss": 0.2796, + "step": 9969 + }, + { + "epoch": 0.970316301703163, + "grad_norm": 1.657505495226282, + "learning_rate": 2.309208202863511e-08, + "loss": 0.3293, + "step": 9970 + }, + { + "epoch": 0.9704136253041362, + "grad_norm": 1.588926908546496, + "learning_rate": 2.2941022485552522e-08, + "loss": 0.3609, + "step": 9971 + }, + { + "epoch": 0.9705109489051095, + "grad_norm": 1.734602354802007, + "learning_rate": 2.2790457515878827e-08, + "loss": 0.4765, + "step": 9972 + }, + { + "epoch": 0.9706082725060827, + "grad_norm": 1.974811426648667, + "learning_rate": 2.264038713457706e-08, + "loss": 0.3638, + "step": 9973 + }, + { + "epoch": 0.970705596107056, + "grad_norm": 1.6603481765208166, + "learning_rate": 2.2490811356559173e-08, + "loss": 0.4572, + "step": 9974 + }, + { + "epoch": 0.9708029197080292, + "grad_norm": 2.2064975329501366, + "learning_rate": 2.2341730196688838e-08, + "loss": 0.2651, + "step": 9975 + }, + { + "epoch": 0.9709002433090025, + "grad_norm": 1.575700313549152, + "learning_rate": 2.2193143669780316e-08, + "loss": 0.2347, + "step": 9976 + }, + { + "epoch": 0.9709975669099756, + "grad_norm": 1.6229119457066843, + "learning_rate": 2.2045051790599016e-08, + "loss": 0.2619, + "step": 9977 + }, + { + "epoch": 0.9710948905109489, + "grad_norm": 2.6560887904894797, + "learning_rate": 2.1897454573860388e-08, + "loss": 0.286, + "step": 9978 + }, + { + "epoch": 0.9711922141119221, + "grad_norm": 1.6837846333567486, + "learning_rate": 2.1750352034231036e-08, + "loss": 0.4373, + "step": 9979 + }, + { + "epoch": 0.9712895377128954, + "grad_norm": 1.7834900941608418, + "learning_rate": 2.1603744186329822e-08, + "loss": 0.4112, + "step": 9980 + }, + { + "epoch": 0.9713868613138686, + "grad_norm": 2.284323516140304, + "learning_rate": 2.1457631044723982e-08, + "loss": 0.1951, + "step": 9981 + }, + { + "epoch": 0.9714841849148419, + "grad_norm": 1.7625877791385884, + "learning_rate": 2.1312012623933564e-08, + "loss": 0.5466, + "step": 9982 + }, + { + "epoch": 0.971581508515815, + "grad_norm": 1.3337518313394432, + "learning_rate": 2.1166888938428663e-08, + "loss": 0.2672, + "step": 9983 + }, + { + "epoch": 0.9716788321167883, + "grad_norm": 2.1591536280087302, + "learning_rate": 2.102226000262997e-08, + "loss": 0.3142, + "step": 9984 + }, + { + "epoch": 0.9717761557177615, + "grad_norm": 1.5973380595048667, + "learning_rate": 2.0878125830909867e-08, + "loss": 0.4019, + "step": 9985 + }, + { + "epoch": 0.9718734793187348, + "grad_norm": 1.9098882506668815, + "learning_rate": 2.0734486437590795e-08, + "loss": 0.5553, + "step": 9986 + }, + { + "epoch": 0.971970802919708, + "grad_norm": 1.6445139976919432, + "learning_rate": 2.0591341836946332e-08, + "loss": 0.3015, + "step": 9987 + }, + { + "epoch": 0.9720681265206813, + "grad_norm": 1.4980526840714048, + "learning_rate": 2.0448692043200657e-08, + "loss": 0.2454, + "step": 9988 + }, + { + "epoch": 0.9721654501216545, + "grad_norm": 1.3990347668334029, + "learning_rate": 2.0306537070529653e-08, + "loss": 0.1445, + "step": 9989 + }, + { + "epoch": 0.9722627737226277, + "grad_norm": 1.7133218533444272, + "learning_rate": 2.0164876933058132e-08, + "loss": 0.3283, + "step": 9990 + }, + { + "epoch": 0.972360097323601, + "grad_norm": 1.7395548408900798, + "learning_rate": 2.0023711644864273e-08, + "loss": 0.4704, + "step": 9991 + }, + { + "epoch": 0.9724574209245742, + "grad_norm": 1.8656247035142928, + "learning_rate": 1.988304121997464e-08, + "loss": 0.3265, + "step": 9992 + }, + { + "epoch": 0.9725547445255475, + "grad_norm": 1.5711667598261545, + "learning_rate": 1.9742865672368606e-08, + "loss": 0.4138, + "step": 9993 + }, + { + "epoch": 0.9726520681265207, + "grad_norm": 1.363321691749382, + "learning_rate": 1.960318501597447e-08, + "loss": 0.2264, + "step": 9994 + }, + { + "epoch": 0.972749391727494, + "grad_norm": 2.147293951698845, + "learning_rate": 1.9463999264673905e-08, + "loss": 0.5105, + "step": 9995 + }, + { + "epoch": 0.9728467153284671, + "grad_norm": 1.7157980798573838, + "learning_rate": 1.932530843229641e-08, + "loss": 0.3378, + "step": 9996 + }, + { + "epoch": 0.9729440389294404, + "grad_norm": 3.026387512788717, + "learning_rate": 1.9187112532624287e-08, + "loss": 0.392, + "step": 9997 + }, + { + "epoch": 0.9730413625304136, + "grad_norm": 1.7694533711994564, + "learning_rate": 1.904941157939044e-08, + "loss": 0.2501, + "step": 9998 + }, + { + "epoch": 0.9731386861313869, + "grad_norm": 2.338191447268506, + "learning_rate": 1.8912205586278378e-08, + "loss": 0.365, + "step": 9999 + }, + { + "epoch": 0.9732360097323601, + "grad_norm": 1.6033911708992676, + "learning_rate": 1.8775494566921626e-08, + "loss": 0.2585, + "step": 10000 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 1.2838880780779296, + "learning_rate": 1.863927853490599e-08, + "loss": 0.2771, + "step": 10001 + }, + { + "epoch": 0.9734306569343065, + "grad_norm": 1.567424034455987, + "learning_rate": 1.8503557503766757e-08, + "loss": 0.3598, + "step": 10002 + }, + { + "epoch": 0.9735279805352798, + "grad_norm": 1.6340692237749228, + "learning_rate": 1.8368331486991464e-08, + "loss": 0.4718, + "step": 10003 + }, + { + "epoch": 0.973625304136253, + "grad_norm": 1.5887556922024277, + "learning_rate": 1.823360049801659e-08, + "loss": 0.4204, + "step": 10004 + }, + { + "epoch": 0.9737226277372263, + "grad_norm": 1.731833787122798, + "learning_rate": 1.8099364550230868e-08, + "loss": 0.3548, + "step": 10005 + }, + { + "epoch": 0.9738199513381995, + "grad_norm": 1.8544678889847714, + "learning_rate": 1.796562365697363e-08, + "loss": 0.3422, + "step": 10006 + }, + { + "epoch": 0.9739172749391728, + "grad_norm": 1.356943608837636, + "learning_rate": 1.783237783153424e-08, + "loss": 0.1893, + "step": 10007 + }, + { + "epoch": 0.9740145985401459, + "grad_norm": 1.8911405758866506, + "learning_rate": 1.769962708715378e-08, + "loss": 0.4185, + "step": 10008 + }, + { + "epoch": 0.9741119221411192, + "grad_norm": 1.4742780502675648, + "learning_rate": 1.7567371437023916e-08, + "loss": 0.4146, + "step": 10009 + }, + { + "epoch": 0.9742092457420924, + "grad_norm": 1.838742455026796, + "learning_rate": 1.7435610894286913e-08, + "loss": 0.4112, + "step": 10010 + }, + { + "epoch": 0.9743065693430657, + "grad_norm": 1.3114631281071467, + "learning_rate": 1.7304345472035634e-08, + "loss": 0.2728, + "step": 10011 + }, + { + "epoch": 0.974403892944039, + "grad_norm": 1.5643894215923786, + "learning_rate": 1.7173575183314085e-08, + "loss": 0.3253, + "step": 10012 + }, + { + "epoch": 0.9745012165450122, + "grad_norm": 1.6454471022276127, + "learning_rate": 1.7043300041116874e-08, + "loss": 0.3778, + "step": 10013 + }, + { + "epoch": 0.9745985401459853, + "grad_norm": 1.8505769190908223, + "learning_rate": 1.691352005839031e-08, + "loss": 0.4914, + "step": 10014 + }, + { + "epoch": 0.9746958637469586, + "grad_norm": 1.6100938543365535, + "learning_rate": 1.6784235248029636e-08, + "loss": 0.333, + "step": 10015 + }, + { + "epoch": 0.9747931873479319, + "grad_norm": 1.3664267030802446, + "learning_rate": 1.665544562288235e-08, + "loss": 0.3395, + "step": 10016 + }, + { + "epoch": 0.9748905109489051, + "grad_norm": 1.4012271611771088, + "learning_rate": 1.652715119574655e-08, + "loss": 0.3516, + "step": 10017 + }, + { + "epoch": 0.9749878345498784, + "grad_norm": 1.4493970735554336, + "learning_rate": 1.6399351979370926e-08, + "loss": 0.4051, + "step": 10018 + }, + { + "epoch": 0.9750851581508516, + "grad_norm": 1.898295204889378, + "learning_rate": 1.6272047986454766e-08, + "loss": 0.3523, + "step": 10019 + }, + { + "epoch": 0.9751824817518249, + "grad_norm": 1.8168010323673138, + "learning_rate": 1.6145239229648502e-08, + "loss": 0.3301, + "step": 10020 + }, + { + "epoch": 0.975279805352798, + "grad_norm": 1.5914472511938684, + "learning_rate": 1.6018925721553167e-08, + "loss": 0.3725, + "step": 10021 + }, + { + "epoch": 0.9753771289537713, + "grad_norm": 1.5828120059344923, + "learning_rate": 1.589310747472095e-08, + "loss": 0.1797, + "step": 10022 + }, + { + "epoch": 0.9754744525547445, + "grad_norm": 1.9316825900712, + "learning_rate": 1.5767784501653506e-08, + "loss": 0.1742, + "step": 10023 + }, + { + "epoch": 0.9755717761557178, + "grad_norm": 1.3211546249395896, + "learning_rate": 1.5642956814804765e-08, + "loss": 0.2196, + "step": 10024 + }, + { + "epoch": 0.975669099756691, + "grad_norm": 1.671676053483242, + "learning_rate": 1.551862442657981e-08, + "loss": 0.336, + "step": 10025 + }, + { + "epoch": 0.9757664233576643, + "grad_norm": 1.6052725641898251, + "learning_rate": 1.5394787349332085e-08, + "loss": 0.3288, + "step": 10026 + }, + { + "epoch": 0.9758637469586374, + "grad_norm": 1.9998761377126901, + "learning_rate": 1.527144559536842e-08, + "loss": 0.4417, + "step": 10027 + }, + { + "epoch": 0.9759610705596107, + "grad_norm": 1.6856338637659605, + "learning_rate": 1.514859917694511e-08, + "loss": 0.4096, + "step": 10028 + }, + { + "epoch": 0.9760583941605839, + "grad_norm": 1.7485884014119761, + "learning_rate": 1.5026248106269625e-08, + "loss": 0.2737, + "step": 10029 + }, + { + "epoch": 0.9761557177615572, + "grad_norm": 1.806714657478082, + "learning_rate": 1.4904392395499458e-08, + "loss": 0.4281, + "step": 10030 + }, + { + "epoch": 0.9762530413625304, + "grad_norm": 1.820111046641642, + "learning_rate": 1.4783032056744363e-08, + "loss": 0.5077, + "step": 10031 + }, + { + "epoch": 0.9763503649635037, + "grad_norm": 1.679837384301026, + "learning_rate": 1.466216710206303e-08, + "loss": 0.3831, + "step": 10032 + }, + { + "epoch": 0.9764476885644768, + "grad_norm": 1.653660459278536, + "learning_rate": 1.4541797543466962e-08, + "loss": 0.4609, + "step": 10033 + }, + { + "epoch": 0.9765450121654501, + "grad_norm": 1.8903109707402839, + "learning_rate": 1.4421923392916037e-08, + "loss": 0.3192, + "step": 10034 + }, + { + "epoch": 0.9766423357664233, + "grad_norm": 1.522459058289411, + "learning_rate": 1.4302544662323503e-08, + "loss": 0.3324, + "step": 10035 + }, + { + "epoch": 0.9767396593673966, + "grad_norm": 1.8950286526891722, + "learning_rate": 1.4183661363551538e-08, + "loss": 0.5218, + "step": 10036 + }, + { + "epoch": 0.9768369829683698, + "grad_norm": 2.154373900764878, + "learning_rate": 1.4065273508413469e-08, + "loss": 0.3709, + "step": 10037 + }, + { + "epoch": 0.9769343065693431, + "grad_norm": 1.717978842004379, + "learning_rate": 1.3947381108673774e-08, + "loss": 0.4445, + "step": 10038 + }, + { + "epoch": 0.9770316301703164, + "grad_norm": 1.7550433918079935, + "learning_rate": 1.3829984176047528e-08, + "loss": 0.3505, + "step": 10039 + }, + { + "epoch": 0.9771289537712895, + "grad_norm": 1.9377996151512222, + "learning_rate": 1.3713082722200954e-08, + "loss": 0.2372, + "step": 10040 + }, + { + "epoch": 0.9772262773722628, + "grad_norm": 1.5454624546838012, + "learning_rate": 1.3596676758749205e-08, + "loss": 0.3375, + "step": 10041 + }, + { + "epoch": 0.977323600973236, + "grad_norm": 1.7590709335442138, + "learning_rate": 1.348076629726136e-08, + "loss": 0.3287, + "step": 10042 + }, + { + "epoch": 0.9774209245742093, + "grad_norm": 1.565832170801034, + "learning_rate": 1.3365351349254874e-08, + "loss": 0.4014, + "step": 10043 + }, + { + "epoch": 0.9775182481751825, + "grad_norm": 1.6368765015247475, + "learning_rate": 1.3250431926197793e-08, + "loss": 0.3442, + "step": 10044 + }, + { + "epoch": 0.9776155717761558, + "grad_norm": 1.5215426667668548, + "learning_rate": 1.3136008039510428e-08, + "loss": 0.3641, + "step": 10045 + }, + { + "epoch": 0.9777128953771289, + "grad_norm": 1.2996424366650012, + "learning_rate": 1.3022079700563684e-08, + "loss": 0.3062, + "step": 10046 + }, + { + "epoch": 0.9778102189781022, + "grad_norm": 1.5220470636688037, + "learning_rate": 1.2908646920677947e-08, + "loss": 0.2494, + "step": 10047 + }, + { + "epoch": 0.9779075425790754, + "grad_norm": 1.5362936761419146, + "learning_rate": 1.279570971112476e-08, + "loss": 0.393, + "step": 10048 + }, + { + "epoch": 0.9780048661800487, + "grad_norm": 2.048202782865991, + "learning_rate": 1.2683268083127365e-08, + "loss": 0.275, + "step": 10049 + }, + { + "epoch": 0.9781021897810219, + "grad_norm": 1.991423636203803, + "learning_rate": 1.2571322047859602e-08, + "loss": 0.4997, + "step": 10050 + }, + { + "epoch": 0.9781995133819952, + "grad_norm": 1.4930581952843014, + "learning_rate": 1.2459871616444796e-08, + "loss": 0.3327, + "step": 10051 + }, + { + "epoch": 0.9782968369829683, + "grad_norm": 1.6358326992215706, + "learning_rate": 1.2348916799958532e-08, + "loss": 0.1977, + "step": 10052 + }, + { + "epoch": 0.9783941605839416, + "grad_norm": 2.2137307359319283, + "learning_rate": 1.2238457609425325e-08, + "loss": 0.3645, + "step": 10053 + }, + { + "epoch": 0.9784914841849148, + "grad_norm": 1.8366177211869856, + "learning_rate": 1.212849405582306e-08, + "loss": 0.4417, + "step": 10054 + }, + { + "epoch": 0.9785888077858881, + "grad_norm": 1.5366421174731555, + "learning_rate": 1.201902615007744e-08, + "loss": 0.3805, + "step": 10055 + }, + { + "epoch": 0.9786861313868613, + "grad_norm": 1.6490094245840268, + "learning_rate": 1.1910053903067542e-08, + "loss": 0.3528, + "step": 10056 + }, + { + "epoch": 0.9787834549878346, + "grad_norm": 1.7614536128177325, + "learning_rate": 1.1801577325621372e-08, + "loss": 0.3961, + "step": 10057 + }, + { + "epoch": 0.9788807785888077, + "grad_norm": 1.7992979216952498, + "learning_rate": 1.1693596428518083e-08, + "loss": 0.4902, + "step": 10058 + }, + { + "epoch": 0.978978102189781, + "grad_norm": 1.7489607313577806, + "learning_rate": 1.1586111222488539e-08, + "loss": 0.5566, + "step": 10059 + }, + { + "epoch": 0.9790754257907542, + "grad_norm": 1.6002030720587221, + "learning_rate": 1.1479121718213082e-08, + "loss": 0.2929, + "step": 10060 + }, + { + "epoch": 0.9791727493917275, + "grad_norm": 1.7983629782952275, + "learning_rate": 1.1372627926323765e-08, + "loss": 0.3462, + "step": 10061 + }, + { + "epoch": 0.9792700729927007, + "grad_norm": 1.4720016360221977, + "learning_rate": 1.1266629857402677e-08, + "loss": 0.2878, + "step": 10062 + }, + { + "epoch": 0.979367396593674, + "grad_norm": 1.5533786804445395, + "learning_rate": 1.1161127521982506e-08, + "loss": 0.3347, + "step": 10063 + }, + { + "epoch": 0.9794647201946473, + "grad_norm": 1.4971352878512332, + "learning_rate": 1.1056120930547643e-08, + "loss": 0.273, + "step": 10064 + }, + { + "epoch": 0.9795620437956204, + "grad_norm": 1.563857671261597, + "learning_rate": 1.0951610093533071e-08, + "loss": 0.2004, + "step": 10065 + }, + { + "epoch": 0.9796593673965936, + "grad_norm": 1.5978396434033968, + "learning_rate": 1.084759502132271e-08, + "loss": 0.2583, + "step": 10066 + }, + { + "epoch": 0.9797566909975669, + "grad_norm": 1.6376738390484344, + "learning_rate": 1.0744075724253843e-08, + "loss": 0.2074, + "step": 10067 + }, + { + "epoch": 0.9798540145985402, + "grad_norm": 1.8194626555692168, + "learning_rate": 1.064105221261269e-08, + "loss": 0.5449, + "step": 10068 + }, + { + "epoch": 0.9799513381995134, + "grad_norm": 1.5832377789759222, + "learning_rate": 1.0538524496636616e-08, + "loss": 0.2858, + "step": 10069 + }, + { + "epoch": 0.9800486618004867, + "grad_norm": 1.7814193109723289, + "learning_rate": 1.0436492586514135e-08, + "loss": 0.5048, + "step": 10070 + }, + { + "epoch": 0.9801459854014598, + "grad_norm": 1.3216344321378053, + "learning_rate": 1.0334956492384918e-08, + "loss": 0.2117, + "step": 10071 + }, + { + "epoch": 0.9802433090024331, + "grad_norm": 2.275491736789118, + "learning_rate": 1.0233916224337558e-08, + "loss": 0.5376, + "step": 10072 + }, + { + "epoch": 0.9803406326034063, + "grad_norm": 1.5709558184534391, + "learning_rate": 1.0133371792412916e-08, + "loss": 0.3885, + "step": 10073 + }, + { + "epoch": 0.9804379562043796, + "grad_norm": 1.7768708993902669, + "learning_rate": 1.0033323206601886e-08, + "loss": 0.5512, + "step": 10074 + }, + { + "epoch": 0.9805352798053528, + "grad_norm": 1.7550330673536847, + "learning_rate": 9.933770476847072e-09, + "loss": 0.4581, + "step": 10075 + }, + { + "epoch": 0.9806326034063261, + "grad_norm": 1.7557570583587823, + "learning_rate": 9.834713613040004e-09, + "loss": 0.3429, + "step": 10076 + }, + { + "epoch": 0.9807299270072992, + "grad_norm": 1.909895241346043, + "learning_rate": 9.73615262502503e-09, + "loss": 0.279, + "step": 10077 + }, + { + "epoch": 0.9808272506082725, + "grad_norm": 1.7917413104573001, + "learning_rate": 9.63808752259543e-09, + "loss": 0.2901, + "step": 10078 + }, + { + "epoch": 0.9809245742092457, + "grad_norm": 1.87293364786237, + "learning_rate": 9.540518315496739e-09, + "loss": 0.441, + "step": 10079 + }, + { + "epoch": 0.981021897810219, + "grad_norm": 3.1468044238998063, + "learning_rate": 9.443445013424534e-09, + "loss": 0.2956, + "step": 10080 + }, + { + "epoch": 0.9811192214111922, + "grad_norm": 2.6418241227293255, + "learning_rate": 9.346867626023881e-09, + "loss": 0.6436, + "step": 10081 + }, + { + "epoch": 0.9812165450121655, + "grad_norm": 1.977489992639657, + "learning_rate": 9.250786162893211e-09, + "loss": 0.4341, + "step": 10082 + }, + { + "epoch": 0.9813138686131387, + "grad_norm": 1.6259598464793898, + "learning_rate": 9.155200633578776e-09, + "loss": 0.3071, + "step": 10083 + }, + { + "epoch": 0.9814111922141119, + "grad_norm": 1.8848568276589883, + "learning_rate": 9.0601110475802e-09, + "loss": 0.3769, + "step": 10084 + }, + { + "epoch": 0.9815085158150851, + "grad_norm": 1.5779333470365025, + "learning_rate": 8.965517414346037e-09, + "loss": 0.2043, + "step": 10085 + }, + { + "epoch": 0.9816058394160584, + "grad_norm": 1.6730329834946722, + "learning_rate": 8.871419743275989e-09, + "loss": 0.4106, + "step": 10086 + }, + { + "epoch": 0.9817031630170316, + "grad_norm": 1.625341819611682, + "learning_rate": 8.777818043720354e-09, + "loss": 0.2284, + "step": 10087 + }, + { + "epoch": 0.9818004866180049, + "grad_norm": 1.5166227851360399, + "learning_rate": 8.684712324981136e-09, + "loss": 0.3592, + "step": 10088 + }, + { + "epoch": 0.9818978102189782, + "grad_norm": 1.3003103876039435, + "learning_rate": 8.59210259631038e-09, + "loss": 0.2801, + "step": 10089 + }, + { + "epoch": 0.9819951338199513, + "grad_norm": 1.525229940120262, + "learning_rate": 8.499988866909614e-09, + "loss": 0.3649, + "step": 10090 + }, + { + "epoch": 0.9820924574209245, + "grad_norm": 2.1104008193580595, + "learning_rate": 8.408371145933736e-09, + "loss": 0.3502, + "step": 10091 + }, + { + "epoch": 0.9821897810218978, + "grad_norm": 1.5762296569140708, + "learning_rate": 8.317249442485465e-09, + "loss": 0.3479, + "step": 10092 + }, + { + "epoch": 0.982287104622871, + "grad_norm": 1.5175732547480572, + "learning_rate": 8.226623765620335e-09, + "loss": 0.2838, + "step": 10093 + }, + { + "epoch": 0.9823844282238443, + "grad_norm": 1.8181428872622187, + "learning_rate": 8.136494124343918e-09, + "loss": 0.3583, + "step": 10094 + }, + { + "epoch": 0.9824817518248176, + "grad_norm": 1.754898436343569, + "learning_rate": 8.046860527612388e-09, + "loss": 0.5918, + "step": 10095 + }, + { + "epoch": 0.9825790754257907, + "grad_norm": 1.7430617529500119, + "learning_rate": 7.957722984332506e-09, + "loss": 0.3412, + "step": 10096 + }, + { + "epoch": 0.982676399026764, + "grad_norm": 1.797611848431201, + "learning_rate": 7.869081503362185e-09, + "loss": 0.2414, + "step": 10097 + }, + { + "epoch": 0.9827737226277372, + "grad_norm": 2.066018542314541, + "learning_rate": 7.780936093509939e-09, + "loss": 0.5061, + "step": 10098 + }, + { + "epoch": 0.9828710462287105, + "grad_norm": 1.6656143044992557, + "learning_rate": 7.693286763533758e-09, + "loss": 0.4574, + "step": 10099 + }, + { + "epoch": 0.9829683698296837, + "grad_norm": 2.7679686018480503, + "learning_rate": 7.606133522144454e-09, + "loss": 0.2762, + "step": 10100 + }, + { + "epoch": 0.983065693430657, + "grad_norm": 1.6969222762967053, + "learning_rate": 7.519476378002322e-09, + "loss": 0.3666, + "step": 10101 + }, + { + "epoch": 0.9831630170316301, + "grad_norm": 1.7164779210569392, + "learning_rate": 7.433315339718805e-09, + "loss": 0.2725, + "step": 10102 + }, + { + "epoch": 0.9832603406326034, + "grad_norm": 1.7902742638202855, + "learning_rate": 7.347650415854835e-09, + "loss": 0.3924, + "step": 10103 + }, + { + "epoch": 0.9833576642335766, + "grad_norm": 1.7196824752121416, + "learning_rate": 7.262481614924155e-09, + "loss": 0.4553, + "step": 10104 + }, + { + "epoch": 0.9834549878345499, + "grad_norm": 2.068928497081907, + "learning_rate": 7.177808945388887e-09, + "loss": 0.2983, + "step": 10105 + }, + { + "epoch": 0.9835523114355231, + "grad_norm": 1.5263037689760284, + "learning_rate": 7.093632415663964e-09, + "loss": 0.4884, + "step": 10106 + }, + { + "epoch": 0.9836496350364964, + "grad_norm": 1.8365379487896134, + "learning_rate": 7.009952034113809e-09, + "loss": 0.3415, + "step": 10107 + }, + { + "epoch": 0.9837469586374696, + "grad_norm": 1.4011360429745006, + "learning_rate": 6.9267678090534366e-09, + "loss": 0.4226, + "step": 10108 + }, + { + "epoch": 0.9838442822384428, + "grad_norm": 1.9042937449894148, + "learning_rate": 6.844079748749011e-09, + "loss": 0.3495, + "step": 10109 + }, + { + "epoch": 0.983941605839416, + "grad_norm": 1.4134184071214662, + "learning_rate": 6.761887861417293e-09, + "loss": 0.2184, + "step": 10110 + }, + { + "epoch": 0.9840389294403893, + "grad_norm": 1.7951687880616172, + "learning_rate": 6.680192155226195e-09, + "loss": 0.4372, + "step": 10111 + }, + { + "epoch": 0.9841362530413625, + "grad_norm": 1.5233741160191667, + "learning_rate": 6.5989926382931115e-09, + "loss": 0.2393, + "step": 10112 + }, + { + "epoch": 0.9842335766423358, + "grad_norm": 1.5666187170820838, + "learning_rate": 6.518289318687698e-09, + "loss": 0.3548, + "step": 10113 + }, + { + "epoch": 0.984330900243309, + "grad_norm": 1.6243124020628665, + "learning_rate": 6.438082204429097e-09, + "loss": 0.2459, + "step": 10114 + }, + { + "epoch": 0.9844282238442822, + "grad_norm": 1.7535962890780648, + "learning_rate": 6.3583713034875984e-09, + "loss": 0.4119, + "step": 10115 + }, + { + "epoch": 0.9845255474452554, + "grad_norm": 1.8658724187880504, + "learning_rate": 6.2791566237840886e-09, + "loss": 0.257, + "step": 10116 + }, + { + "epoch": 0.9846228710462287, + "grad_norm": 1.6598020646998646, + "learning_rate": 6.200438173190049e-09, + "loss": 0.3657, + "step": 10117 + }, + { + "epoch": 0.984720194647202, + "grad_norm": 1.5896436860645908, + "learning_rate": 6.122215959527555e-09, + "loss": 0.2788, + "step": 10118 + }, + { + "epoch": 0.9848175182481752, + "grad_norm": 1.7178958504385828, + "learning_rate": 6.04448999057039e-09, + "loss": 0.2666, + "step": 10119 + }, + { + "epoch": 0.9849148418491485, + "grad_norm": 1.530317528630525, + "learning_rate": 5.967260274041819e-09, + "loss": 0.3351, + "step": 10120 + }, + { + "epoch": 0.9850121654501216, + "grad_norm": 1.3948292133423756, + "learning_rate": 5.89052681761626e-09, + "loss": 0.3067, + "step": 10121 + }, + { + "epoch": 0.9851094890510949, + "grad_norm": 1.8940753191520583, + "learning_rate": 5.8142896289181685e-09, + "loss": 0.4455, + "step": 10122 + }, + { + "epoch": 0.9852068126520681, + "grad_norm": 1.7146179467651506, + "learning_rate": 5.738548715524261e-09, + "loss": 0.2119, + "step": 10123 + }, + { + "epoch": 0.9853041362530414, + "grad_norm": 1.9503446396690298, + "learning_rate": 5.6633040849601865e-09, + "loss": 0.3428, + "step": 10124 + }, + { + "epoch": 0.9854014598540146, + "grad_norm": 1.73035508034364, + "learning_rate": 5.588555744703295e-09, + "loss": 0.3412, + "step": 10125 + }, + { + "epoch": 0.9854987834549879, + "grad_norm": 1.6397158328866068, + "learning_rate": 5.514303702181534e-09, + "loss": 0.3121, + "step": 10126 + }, + { + "epoch": 0.9855961070559611, + "grad_norm": 2.0331156452572356, + "learning_rate": 5.440547964773446e-09, + "loss": 0.2979, + "step": 10127 + }, + { + "epoch": 0.9856934306569343, + "grad_norm": 1.7551051334940906, + "learning_rate": 5.367288539808169e-09, + "loss": 0.4846, + "step": 10128 + }, + { + "epoch": 0.9857907542579075, + "grad_norm": 1.5525338289319262, + "learning_rate": 5.294525434564879e-09, + "loss": 0.3292, + "step": 10129 + }, + { + "epoch": 0.9858880778588808, + "grad_norm": 1.4011796034894104, + "learning_rate": 5.2222586562750146e-09, + "loss": 0.3753, + "step": 10130 + }, + { + "epoch": 0.985985401459854, + "grad_norm": 1.5434798927146662, + "learning_rate": 5.150488212118943e-09, + "loss": 0.387, + "step": 10131 + }, + { + "epoch": 0.9860827250608273, + "grad_norm": 1.9406876951646739, + "learning_rate": 5.079214109229292e-09, + "loss": 0.3544, + "step": 10132 + }, + { + "epoch": 0.9861800486618005, + "grad_norm": 1.446878770201284, + "learning_rate": 5.008436354688173e-09, + "loss": 0.3112, + "step": 10133 + }, + { + "epoch": 0.9862773722627737, + "grad_norm": 1.4662114383345521, + "learning_rate": 4.938154955528296e-09, + "loss": 0.2477, + "step": 10134 + }, + { + "epoch": 0.9863746958637469, + "grad_norm": 2.085504608383337, + "learning_rate": 4.868369918735183e-09, + "loss": 0.5032, + "step": 10135 + }, + { + "epoch": 0.9864720194647202, + "grad_norm": 1.8533849318386093, + "learning_rate": 4.799081251241622e-09, + "loss": 0.4189, + "step": 10136 + }, + { + "epoch": 0.9865693430656934, + "grad_norm": 2.271349705306333, + "learning_rate": 4.730288959934326e-09, + "loss": 0.3549, + "step": 10137 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 1.3869106536036873, + "learning_rate": 4.661993051647828e-09, + "loss": 0.1986, + "step": 10138 + }, + { + "epoch": 0.98676399026764, + "grad_norm": 1.581438570501715, + "learning_rate": 4.594193533170033e-09, + "loss": 0.2918, + "step": 10139 + }, + { + "epoch": 0.9868613138686131, + "grad_norm": 2.066513433723341, + "learning_rate": 4.526890411237217e-09, + "loss": 0.4232, + "step": 10140 + }, + { + "epoch": 0.9869586374695863, + "grad_norm": 1.6907852125143448, + "learning_rate": 4.4600836925379194e-09, + "loss": 0.4307, + "step": 10141 + }, + { + "epoch": 0.9870559610705596, + "grad_norm": 1.6510798533102014, + "learning_rate": 4.39377338371072e-09, + "loss": 0.4157, + "step": 10142 + }, + { + "epoch": 0.9871532846715328, + "grad_norm": 1.5492639301308437, + "learning_rate": 4.327959491344791e-09, + "loss": 0.3448, + "step": 10143 + }, + { + "epoch": 0.9872506082725061, + "grad_norm": 1.9646074824165363, + "learning_rate": 4.262642021979901e-09, + "loss": 0.202, + "step": 10144 + }, + { + "epoch": 0.9873479318734794, + "grad_norm": 1.4203524979434883, + "learning_rate": 4.1978209821069706e-09, + "loss": 0.3508, + "step": 10145 + }, + { + "epoch": 0.9874452554744526, + "grad_norm": 1.617625449031163, + "learning_rate": 4.133496378167512e-09, + "loss": 0.405, + "step": 10146 + }, + { + "epoch": 0.9875425790754258, + "grad_norm": 1.8877064269913355, + "learning_rate": 4.06966821655308e-09, + "loss": 0.5345, + "step": 10147 + }, + { + "epoch": 0.987639902676399, + "grad_norm": 2.1206088978464965, + "learning_rate": 4.006336503606379e-09, + "loss": 0.2152, + "step": 10148 + }, + { + "epoch": 0.9877372262773723, + "grad_norm": 2.249025663705205, + "learning_rate": 3.943501245620707e-09, + "loss": 0.2688, + "step": 10149 + }, + { + "epoch": 0.9878345498783455, + "grad_norm": 2.117078039288698, + "learning_rate": 3.881162448840514e-09, + "loss": 0.5, + "step": 10150 + }, + { + "epoch": 0.9879318734793188, + "grad_norm": 1.7959764970800607, + "learning_rate": 3.819320119459735e-09, + "loss": 0.267, + "step": 10151 + }, + { + "epoch": 0.988029197080292, + "grad_norm": 2.0277872941767843, + "learning_rate": 3.757974263624009e-09, + "loss": 0.3672, + "step": 10152 + }, + { + "epoch": 0.9881265206812652, + "grad_norm": 1.6163597752936243, + "learning_rate": 3.697124887429571e-09, + "loss": 0.373, + "step": 10153 + }, + { + "epoch": 0.9882238442822384, + "grad_norm": 2.7997111001155295, + "learning_rate": 3.6367719969226944e-09, + "loss": 0.2195, + "step": 10154 + }, + { + "epoch": 0.9883211678832117, + "grad_norm": 1.5759305142851643, + "learning_rate": 3.57691559810025e-09, + "loss": 0.2842, + "step": 10155 + }, + { + "epoch": 0.9884184914841849, + "grad_norm": 1.537303403365062, + "learning_rate": 3.517555696911368e-09, + "loss": 0.3595, + "step": 10156 + }, + { + "epoch": 0.9885158150851582, + "grad_norm": 1.4396385373665412, + "learning_rate": 3.4586922992541073e-09, + "loss": 0.2264, + "step": 10157 + }, + { + "epoch": 0.9886131386861314, + "grad_norm": 1.4108164501963798, + "learning_rate": 3.4003254109776783e-09, + "loss": 0.3209, + "step": 10158 + }, + { + "epoch": 0.9887104622871046, + "grad_norm": 1.614618149248701, + "learning_rate": 3.3424550378818865e-09, + "loss": 0.3904, + "step": 10159 + }, + { + "epoch": 0.9888077858880778, + "grad_norm": 1.7542109914995272, + "learning_rate": 3.2850811857171315e-09, + "loss": 0.3686, + "step": 10160 + }, + { + "epoch": 0.9889051094890511, + "grad_norm": 2.789556293959523, + "learning_rate": 3.228203860185519e-09, + "loss": 0.276, + "step": 10161 + }, + { + "epoch": 0.9890024330900243, + "grad_norm": 1.8259046063193372, + "learning_rate": 3.1718230669386394e-09, + "loss": 0.2938, + "step": 10162 + }, + { + "epoch": 0.9890997566909976, + "grad_norm": 1.659234363178178, + "learning_rate": 3.115938811579233e-09, + "loss": 0.4896, + "step": 10163 + }, + { + "epoch": 0.9891970802919708, + "grad_norm": 1.9277549054599503, + "learning_rate": 3.0605510996595255e-09, + "loss": 0.3109, + "step": 10164 + }, + { + "epoch": 0.989294403892944, + "grad_norm": 2.053644532033876, + "learning_rate": 3.0056599366851124e-09, + "loss": 0.3448, + "step": 10165 + }, + { + "epoch": 0.9893917274939172, + "grad_norm": 1.676355081403625, + "learning_rate": 2.951265328108854e-09, + "loss": 0.3718, + "step": 10166 + }, + { + "epoch": 0.9894890510948905, + "grad_norm": 1.4805967733543572, + "learning_rate": 2.8973672793375373e-09, + "loss": 0.2268, + "step": 10167 + }, + { + "epoch": 0.9895863746958637, + "grad_norm": 1.719493843722048, + "learning_rate": 2.843965795725767e-09, + "loss": 0.3647, + "step": 10168 + }, + { + "epoch": 0.989683698296837, + "grad_norm": 1.3915983072878595, + "learning_rate": 2.7910608825809647e-09, + "loss": 0.2695, + "step": 10169 + }, + { + "epoch": 0.9897810218978103, + "grad_norm": 1.627630305779688, + "learning_rate": 2.7386525451594816e-09, + "loss": 0.4397, + "step": 10170 + }, + { + "epoch": 0.9898783454987835, + "grad_norm": 1.6617084741386776, + "learning_rate": 2.686740788669928e-09, + "loss": 0.2979, + "step": 10171 + }, + { + "epoch": 0.9899756690997566, + "grad_norm": 1.7639435720436196, + "learning_rate": 2.635325618270401e-09, + "loss": 0.6115, + "step": 10172 + }, + { + "epoch": 0.9900729927007299, + "grad_norm": 1.704455844373239, + "learning_rate": 2.584407039070702e-09, + "loss": 0.3975, + "step": 10173 + }, + { + "epoch": 0.9901703163017032, + "grad_norm": 1.747665210167157, + "learning_rate": 2.533985056129562e-09, + "loss": 0.4637, + "step": 10174 + }, + { + "epoch": 0.9902676399026764, + "grad_norm": 1.7201196934948249, + "learning_rate": 2.4840596744585277e-09, + "loss": 0.335, + "step": 10175 + }, + { + "epoch": 0.9903649635036497, + "grad_norm": 2.1914342106686027, + "learning_rate": 2.4346308990175205e-09, + "loss": 0.3058, + "step": 10176 + }, + { + "epoch": 0.9904622871046229, + "grad_norm": 1.5808494392235486, + "learning_rate": 2.3856987347192772e-09, + "loss": 0.372, + "step": 10177 + }, + { + "epoch": 0.9905596107055961, + "grad_norm": 1.7245530392527575, + "learning_rate": 2.33726318642602e-09, + "loss": 0.5439, + "step": 10178 + }, + { + "epoch": 0.9906569343065693, + "grad_norm": 1.7369906660058665, + "learning_rate": 2.28932425895112e-09, + "loss": 0.4372, + "step": 10179 + }, + { + "epoch": 0.9907542579075426, + "grad_norm": 1.8690727644094616, + "learning_rate": 2.2418819570574346e-09, + "loss": 0.3435, + "step": 10180 + }, + { + "epoch": 0.9908515815085158, + "grad_norm": 1.4492678569689201, + "learning_rate": 2.194936285460081e-09, + "loss": 0.2937, + "step": 10181 + }, + { + "epoch": 0.9909489051094891, + "grad_norm": 1.60514958311479, + "learning_rate": 2.148487248823661e-09, + "loss": 0.4702, + "step": 10182 + }, + { + "epoch": 0.9910462287104623, + "grad_norm": 1.5752298217701577, + "learning_rate": 2.1025348517639267e-09, + "loss": 0.4531, + "step": 10183 + }, + { + "epoch": 0.9911435523114355, + "grad_norm": 1.777263530555741, + "learning_rate": 2.0570790988472256e-09, + "loss": 0.5723, + "step": 10184 + }, + { + "epoch": 0.9912408759124087, + "grad_norm": 1.8886389270894859, + "learning_rate": 2.0121199945905e-09, + "loss": 0.3539, + "step": 10185 + }, + { + "epoch": 0.991338199513382, + "grad_norm": 1.8457090024501326, + "learning_rate": 1.9676575434612877e-09, + "loss": 0.4289, + "step": 10186 + }, + { + "epoch": 0.9914355231143552, + "grad_norm": 1.865774985056033, + "learning_rate": 1.923691749878276e-09, + "loss": 0.3129, + "step": 10187 + }, + { + "epoch": 0.9915328467153285, + "grad_norm": 2.0114446064345493, + "learning_rate": 1.8802226182101925e-09, + "loss": 0.45, + "step": 10188 + }, + { + "epoch": 0.9916301703163017, + "grad_norm": 1.8467448206429093, + "learning_rate": 1.8372501527763598e-09, + "loss": 0.5029, + "step": 10189 + }, + { + "epoch": 0.991727493917275, + "grad_norm": 1.6211326502533367, + "learning_rate": 1.7947743578466958e-09, + "loss": 0.4954, + "step": 10190 + }, + { + "epoch": 0.9918248175182481, + "grad_norm": 1.687919988290718, + "learning_rate": 1.7527952376428236e-09, + "loss": 0.4389, + "step": 10191 + }, + { + "epoch": 0.9919221411192214, + "grad_norm": 1.7946042515574991, + "learning_rate": 1.711312796335851e-09, + "loss": 0.285, + "step": 10192 + }, + { + "epoch": 0.9920194647201946, + "grad_norm": 1.5967734586379025, + "learning_rate": 1.6703270380480364e-09, + "loss": 0.3104, + "step": 10193 + }, + { + "epoch": 0.9921167883211679, + "grad_norm": 1.6708262353811334, + "learning_rate": 1.6298379668516773e-09, + "loss": 0.5358, + "step": 10194 + }, + { + "epoch": 0.9922141119221412, + "grad_norm": 1.7161013221660495, + "learning_rate": 1.5898455867707773e-09, + "loss": 0.3185, + "step": 10195 + }, + { + "epoch": 0.9923114355231144, + "grad_norm": 1.5903500962043906, + "learning_rate": 1.55034990177938e-09, + "loss": 0.4521, + "step": 10196 + }, + { + "epoch": 0.9924087591240875, + "grad_norm": 1.8581204068575898, + "learning_rate": 1.5113509158015682e-09, + "loss": 0.4294, + "step": 10197 + }, + { + "epoch": 0.9925060827250608, + "grad_norm": 1.7019343982968196, + "learning_rate": 1.472848632713686e-09, + "loss": 0.4834, + "step": 10198 + }, + { + "epoch": 0.992603406326034, + "grad_norm": 1.5420309959393632, + "learning_rate": 1.4348430563404514e-09, + "loss": 0.3684, + "step": 10199 + }, + { + "epoch": 0.9927007299270073, + "grad_norm": 1.7447434224153548, + "learning_rate": 1.3973341904599536e-09, + "loss": 0.4463, + "step": 10200 + }, + { + "epoch": 0.9927980535279806, + "grad_norm": 1.6872920155394338, + "learning_rate": 1.3603220387981009e-09, + "loss": 0.4509, + "step": 10201 + }, + { + "epoch": 0.9928953771289538, + "grad_norm": 1.6787863509729684, + "learning_rate": 1.3238066050341726e-09, + "loss": 0.4165, + "step": 10202 + }, + { + "epoch": 0.992992700729927, + "grad_norm": 2.005666573088554, + "learning_rate": 1.287787892795267e-09, + "loss": 0.4316, + "step": 10203 + }, + { + "epoch": 0.9930900243309002, + "grad_norm": 1.684136040038031, + "learning_rate": 1.2522659056618536e-09, + "loss": 0.2634, + "step": 10204 + }, + { + "epoch": 0.9931873479318735, + "grad_norm": 1.7484990904240891, + "learning_rate": 1.2172406471633314e-09, + "loss": 0.5472, + "step": 10205 + }, + { + "epoch": 0.9932846715328467, + "grad_norm": 1.4640338744272896, + "learning_rate": 1.1827121207796943e-09, + "loss": 0.3913, + "step": 10206 + }, + { + "epoch": 0.99338199513382, + "grad_norm": 1.889903790078848, + "learning_rate": 1.1486803299426419e-09, + "loss": 0.487, + "step": 10207 + }, + { + "epoch": 0.9934793187347932, + "grad_norm": 1.685847285475849, + "learning_rate": 1.1151452780333583e-09, + "loss": 0.3143, + "step": 10208 + }, + { + "epoch": 0.9935766423357664, + "grad_norm": 1.6087248146202024, + "learning_rate": 1.082106968385288e-09, + "loss": 0.3885, + "step": 10209 + }, + { + "epoch": 0.9936739659367396, + "grad_norm": 1.9165687812467131, + "learning_rate": 1.0495654042808056e-09, + "loss": 0.3859, + "step": 10210 + }, + { + "epoch": 0.9937712895377129, + "grad_norm": 1.6050180512598589, + "learning_rate": 1.0175205889528805e-09, + "loss": 0.4109, + "step": 10211 + }, + { + "epoch": 0.9938686131386861, + "grad_norm": 1.5577338195949784, + "learning_rate": 9.859725255872977e-10, + "loss": 0.2322, + "step": 10212 + }, + { + "epoch": 0.9939659367396594, + "grad_norm": 1.5560794905434274, + "learning_rate": 9.549212173182165e-10, + "loss": 0.2349, + "step": 10213 + }, + { + "epoch": 0.9940632603406326, + "grad_norm": 1.9038707706368625, + "learning_rate": 9.243666672309471e-10, + "loss": 0.4671, + "step": 10214 + }, + { + "epoch": 0.9941605839416059, + "grad_norm": 1.8751275972819175, + "learning_rate": 8.943088783619491e-10, + "loss": 0.3691, + "step": 10215 + }, + { + "epoch": 0.994257907542579, + "grad_norm": 1.893888887656737, + "learning_rate": 8.647478536982779e-10, + "loss": 0.3156, + "step": 10216 + }, + { + "epoch": 0.9943552311435523, + "grad_norm": 1.6704147949867993, + "learning_rate": 8.356835961775834e-10, + "loss": 0.4777, + "step": 10217 + }, + { + "epoch": 0.9944525547445255, + "grad_norm": 1.9337605722307276, + "learning_rate": 8.071161086875557e-10, + "loss": 0.4404, + "step": 10218 + }, + { + "epoch": 0.9945498783454988, + "grad_norm": 1.3859997241858006, + "learning_rate": 7.790453940675902e-10, + "loss": 0.3193, + "step": 10219 + }, + { + "epoch": 0.994647201946472, + "grad_norm": 1.630436532945661, + "learning_rate": 7.51471455106012e-10, + "loss": 0.2224, + "step": 10220 + }, + { + "epoch": 0.9947445255474453, + "grad_norm": 1.743814025578543, + "learning_rate": 7.243942945439619e-10, + "loss": 0.5075, + "step": 10221 + }, + { + "epoch": 0.9948418491484184, + "grad_norm": 1.5703633618065114, + "learning_rate": 6.978139150709551e-10, + "loss": 0.298, + "step": 10222 + }, + { + "epoch": 0.9949391727493917, + "grad_norm": 1.418497706764043, + "learning_rate": 6.717303193298774e-10, + "loss": 0.2721, + "step": 10223 + }, + { + "epoch": 0.995036496350365, + "grad_norm": 1.889396836043909, + "learning_rate": 6.461435099114344e-10, + "loss": 0.1676, + "step": 10224 + }, + { + "epoch": 0.9951338199513382, + "grad_norm": 1.6992218368326848, + "learning_rate": 6.210534893580367e-10, + "loss": 0.5394, + "step": 10225 + }, + { + "epoch": 0.9952311435523115, + "grad_norm": 1.7786460046130461, + "learning_rate": 5.964602601643555e-10, + "loss": 0.3683, + "step": 10226 + }, + { + "epoch": 0.9953284671532847, + "grad_norm": 1.7925175117690704, + "learning_rate": 5.723638247723262e-10, + "loss": 0.4741, + "step": 10227 + }, + { + "epoch": 0.9954257907542579, + "grad_norm": 1.4769561888615816, + "learning_rate": 5.487641855783654e-10, + "loss": 0.2009, + "step": 10228 + }, + { + "epoch": 0.9955231143552311, + "grad_norm": 1.8162520148311394, + "learning_rate": 5.256613449255987e-10, + "loss": 0.3064, + "step": 10229 + }, + { + "epoch": 0.9956204379562044, + "grad_norm": 1.9026471469834, + "learning_rate": 5.030553051116327e-10, + "loss": 0.4989, + "step": 10230 + }, + { + "epoch": 0.9957177615571776, + "grad_norm": 2.138271402288248, + "learning_rate": 4.809460683818934e-10, + "loss": 0.284, + "step": 10231 + }, + { + "epoch": 0.9958150851581509, + "grad_norm": 1.7100717801649878, + "learning_rate": 4.5933363693295707e-10, + "loss": 0.4052, + "step": 10232 + }, + { + "epoch": 0.9959124087591241, + "grad_norm": 1.4879635449220003, + "learning_rate": 4.382180129136604e-10, + "loss": 0.346, + "step": 10233 + }, + { + "epoch": 0.9960097323600974, + "grad_norm": 1.394483677669729, + "learning_rate": 4.1759919842121464e-10, + "loss": 0.3393, + "step": 10234 + }, + { + "epoch": 0.9961070559610705, + "grad_norm": 1.6117837157772448, + "learning_rate": 3.9747719550509155e-10, + "loss": 0.4577, + "step": 10235 + }, + { + "epoch": 0.9962043795620438, + "grad_norm": 1.5905841724724437, + "learning_rate": 3.7785200616424765e-10, + "loss": 0.4144, + "step": 10236 + }, + { + "epoch": 0.996301703163017, + "grad_norm": 1.7056667015538385, + "learning_rate": 3.587236323493448e-10, + "loss": 0.3142, + "step": 10237 + }, + { + "epoch": 0.9963990267639903, + "grad_norm": 1.8491883327366088, + "learning_rate": 3.4009207596163996e-10, + "loss": 0.2966, + "step": 10238 + }, + { + "epoch": 0.9964963503649635, + "grad_norm": 1.640418339383596, + "learning_rate": 3.2195733885187484e-10, + "loss": 0.2852, + "step": 10239 + }, + { + "epoch": 0.9965936739659368, + "grad_norm": 1.7302124660319336, + "learning_rate": 3.043194228219415e-10, + "loss": 0.3631, + "step": 10240 + }, + { + "epoch": 0.9966909975669099, + "grad_norm": 2.275910700355181, + "learning_rate": 2.87178329624882e-10, + "loss": 0.2561, + "step": 10241 + }, + { + "epoch": 0.9967883211678832, + "grad_norm": 1.4887201257842386, + "learning_rate": 2.7053406096433366e-10, + "loss": 0.2345, + "step": 10242 + }, + { + "epoch": 0.9968856447688564, + "grad_norm": 1.4972768542801418, + "learning_rate": 2.543866184934185e-10, + "loss": 0.3368, + "step": 10243 + }, + { + "epoch": 0.9969829683698297, + "grad_norm": 1.7231975924504286, + "learning_rate": 2.3873600381751903e-10, + "loss": 0.4021, + "step": 10244 + }, + { + "epoch": 0.997080291970803, + "grad_norm": 1.6557526471478121, + "learning_rate": 2.2358221849150262e-10, + "loss": 0.4738, + "step": 10245 + }, + { + "epoch": 0.9971776155717762, + "grad_norm": 1.7925703150366477, + "learning_rate": 2.089252640208317e-10, + "loss": 0.3506, + "step": 10246 + }, + { + "epoch": 0.9972749391727493, + "grad_norm": 1.6732777922684796, + "learning_rate": 1.9476514186267392e-10, + "loss": 0.38, + "step": 10247 + }, + { + "epoch": 0.9973722627737226, + "grad_norm": 1.7360436600191822, + "learning_rate": 1.8110185342423703e-10, + "loss": 0.158, + "step": 10248 + }, + { + "epoch": 0.9974695863746958, + "grad_norm": 1.6990345822205684, + "learning_rate": 1.6793540006221354e-10, + "loss": 0.4599, + "step": 10249 + }, + { + "epoch": 0.9975669099756691, + "grad_norm": 1.7945395509746245, + "learning_rate": 1.5526578308611152e-10, + "loss": 0.3518, + "step": 10250 + }, + { + "epoch": 0.9976642335766424, + "grad_norm": 1.7381425780800086, + "learning_rate": 1.4309300375381362e-10, + "loss": 0.3858, + "step": 10251 + }, + { + "epoch": 0.9977615571776156, + "grad_norm": 1.6040398023237967, + "learning_rate": 1.314170632760181e-10, + "loss": 0.2474, + "step": 10252 + }, + { + "epoch": 0.9978588807785888, + "grad_norm": 1.8672830086386438, + "learning_rate": 1.202379628123529e-10, + "loss": 0.3808, + "step": 10253 + }, + { + "epoch": 0.997956204379562, + "grad_norm": 1.7226691213113954, + "learning_rate": 1.0955570347359612e-10, + "loss": 0.286, + "step": 10254 + }, + { + "epoch": 0.9980535279805353, + "grad_norm": 1.695693715914563, + "learning_rate": 9.937028632167612e-11, + "loss": 0.3033, + "step": 10255 + }, + { + "epoch": 0.9981508515815085, + "grad_norm": 1.923333893626554, + "learning_rate": 8.968171236856115e-11, + "loss": 0.2369, + "step": 10256 + }, + { + "epoch": 0.9982481751824818, + "grad_norm": 1.6885749783800197, + "learning_rate": 8.048998257681461e-11, + "loss": 0.4709, + "step": 10257 + }, + { + "epoch": 0.998345498783455, + "grad_norm": 1.6396463290775232, + "learning_rate": 7.179509785959493e-11, + "loss": 0.337, + "step": 10258 + }, + { + "epoch": 0.9984428223844283, + "grad_norm": 1.4478470307581177, + "learning_rate": 6.359705908176583e-11, + "loss": 0.2122, + "step": 10259 + }, + { + "epoch": 0.9985401459854014, + "grad_norm": 1.5580828983939918, + "learning_rate": 5.589586705767591e-11, + "loss": 0.3471, + "step": 10260 + }, + { + "epoch": 0.9986374695863747, + "grad_norm": 1.393987041894788, + "learning_rate": 4.869152255226883e-11, + "loss": 0.3252, + "step": 10261 + }, + { + "epoch": 0.9987347931873479, + "grad_norm": 1.700722215615325, + "learning_rate": 4.1984026281638446e-11, + "loss": 0.6204, + "step": 10262 + }, + { + "epoch": 0.9988321167883212, + "grad_norm": 1.7781347505808283, + "learning_rate": 3.5773378911918566e-11, + "loss": 0.349, + "step": 10263 + }, + { + "epoch": 0.9989294403892944, + "grad_norm": 1.6056660997261587, + "learning_rate": 3.0059581060948304e-11, + "loss": 0.3304, + "step": 10264 + }, + { + "epoch": 0.9990267639902677, + "grad_norm": 1.3395367595986913, + "learning_rate": 2.4842633296051634e-11, + "loss": 0.2333, + "step": 10265 + }, + { + "epoch": 0.9991240875912408, + "grad_norm": 2.0615053957642546, + "learning_rate": 2.0122536135702696e-11, + "loss": 0.3436, + "step": 10266 + }, + { + "epoch": 0.9992214111922141, + "grad_norm": 1.4417640634760027, + "learning_rate": 1.589929004897073e-11, + "loss": 0.3244, + "step": 10267 + }, + { + "epoch": 0.9993187347931873, + "grad_norm": 1.520677619522254, + "learning_rate": 1.217289545607514e-11, + "loss": 0.2417, + "step": 10268 + }, + { + "epoch": 0.9994160583941606, + "grad_norm": 1.7756492501912053, + "learning_rate": 8.943352726165088e-12, + "loss": 0.1968, + "step": 10269 + }, + { + "epoch": 0.9995133819951338, + "grad_norm": 2.1451558521978082, + "learning_rate": 6.210662181205252e-12, + "loss": 0.2943, + "step": 10270 + }, + { + "epoch": 0.9996107055961071, + "grad_norm": 2.1384910472212435, + "learning_rate": 3.974824092090046e-12, + "loss": 0.3544, + "step": 10271 + }, + { + "epoch": 0.9997080291970802, + "grad_norm": 1.692690108763571, + "learning_rate": 2.235838681419189e-12, + "loss": 0.2713, + "step": 10272 + }, + { + "epoch": 0.9998053527980535, + "grad_norm": 1.6981957875121463, + "learning_rate": 9.937061218323607e-13, + "loss": 0.3989, + "step": 10273 + }, + { + "epoch": 0.9999026763990267, + "grad_norm": 1.5230612265690207, + "learning_rate": 2.484265365643168e-13, + "loss": 0.2062, + "step": 10274 + }, + { + "epoch": 1.0, + "grad_norm": 1.5616509228662199, + "learning_rate": 0.0, + "loss": 0.3914, + "step": 10275 + }, + { + "epoch": 1.0, + "step": 10275, + "total_flos": 1314524410126336.0, + "train_loss": 0.3748687234847215, + "train_runtime": 188352.4336, + "train_samples_per_second": 1.746, + "train_steps_per_second": 0.055 + } + ], + "logging_steps": 1.0, + "max_steps": 10275, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 515, + "total_flos": 1314524410126336.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}