diff --git "a/Qwen2-VL-7B-FULL-full/trainer_state.json" "b/Qwen2-VL-7B-FULL-full/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Qwen2-VL-7B-FULL-full/trainer_state.json" @@ -0,0 +1,66598 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999605610843073, + "eval_steps": 500, + "global_step": 9508, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010517044184731881, + "grad_norm": 59.92323260669162, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.6475, + "step": 1 + }, + { + "epoch": 0.00021034088369463762, + "grad_norm": 63.10940340752038, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.7602, + "step": 2 + }, + { + "epoch": 0.00031551132554195644, + "grad_norm": 57.05123636971969, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.6286, + "step": 3 + }, + { + "epoch": 0.00042068176738927523, + "grad_norm": 58.99194594170873, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.6448, + "step": 4 + }, + { + "epoch": 0.000525852209236594, + "grad_norm": 60.12716290697537, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.7222, + "step": 5 + }, + { + "epoch": 0.0006310226510839129, + "grad_norm": 65.69643117171603, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.7782, + "step": 6 + }, + { + "epoch": 0.0007361930929312317, + "grad_norm": 54.90002763855268, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5453, + "step": 7 + }, + { + "epoch": 0.0008413635347785505, + "grad_norm": 61.987907330986985, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6941, + "step": 8 + }, + { + "epoch": 0.0009465339766258693, + "grad_norm": 56.061715666844634, + "learning_rate": 4.5000000000000003e-07, + "loss": 2.6707, + "step": 9 + }, + { + "epoch": 0.001051704418473188, + "grad_norm": 63.95430356271557, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6813, + "step": 10 + }, + { + "epoch": 0.001156874860320507, + "grad_norm": 57.584908322695675, + "learning_rate": 5.5e-07, + "loss": 2.4722, + "step": 11 + }, + { + "epoch": 0.0012620453021678258, + "grad_norm": 54.9509876269393, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4741, + "step": 12 + }, + { + "epoch": 0.0013672157440151446, + "grad_norm": 56.36235985124101, + "learning_rate": 6.5e-07, + "loss": 2.638, + "step": 13 + }, + { + "epoch": 0.0014723861858624634, + "grad_norm": 46.019883313287565, + "learning_rate": 7.000000000000001e-07, + "loss": 2.2947, + "step": 14 + }, + { + "epoch": 0.001577556627709782, + "grad_norm": 39.36043169435088, + "learning_rate": 7.5e-07, + "loss": 1.9651, + "step": 15 + }, + { + "epoch": 0.001682727069557101, + "grad_norm": 44.53064805034052, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1088, + "step": 16 + }, + { + "epoch": 0.0017878975114044198, + "grad_norm": 40.77122432291506, + "learning_rate": 8.500000000000001e-07, + "loss": 1.9506, + "step": 17 + }, + { + "epoch": 0.0018930679532517386, + "grad_norm": 33.413887606907345, + "learning_rate": 9.000000000000001e-07, + "loss": 1.8277, + "step": 18 + }, + { + "epoch": 0.0019982383950990575, + "grad_norm": 35.333835421159975, + "learning_rate": 9.500000000000001e-07, + "loss": 1.7883, + "step": 19 + }, + { + "epoch": 0.002103408836946376, + "grad_norm": 19.922751692123384, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4946, + "step": 20 + }, + { + "epoch": 0.002208579278793695, + "grad_norm": 18.4584352012479, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4215, + 
"step": 21 + }, + { + "epoch": 0.002313749720641014, + "grad_norm": 19.920485594493087, + "learning_rate": 1.1e-06, + "loss": 1.4416, + "step": 22 + }, + { + "epoch": 0.002418920162488333, + "grad_norm": 16.206117061616325, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.408, + "step": 23 + }, + { + "epoch": 0.0025240906043356515, + "grad_norm": 14.345274194736033, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.3782, + "step": 24 + }, + { + "epoch": 0.00262926104618297, + "grad_norm": 14.507180671796032, + "learning_rate": 1.25e-06, + "loss": 1.3618, + "step": 25 + }, + { + "epoch": 0.002734431488030289, + "grad_norm": 9.60203081004738, + "learning_rate": 1.3e-06, + "loss": 1.2395, + "step": 26 + }, + { + "epoch": 0.002839601929877608, + "grad_norm": 8.95961863190965, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.1838, + "step": 27 + }, + { + "epoch": 0.002944772371724927, + "grad_norm": 7.743322908623659, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.149, + "step": 28 + }, + { + "epoch": 0.0030499428135722455, + "grad_norm": 8.000577585626917, + "learning_rate": 1.45e-06, + "loss": 1.1665, + "step": 29 + }, + { + "epoch": 0.003155113255419564, + "grad_norm": 7.385939102175253, + "learning_rate": 1.5e-06, + "loss": 1.1347, + "step": 30 + }, + { + "epoch": 0.0032602836972668832, + "grad_norm": 7.716119057096808, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.1274, + "step": 31 + }, + { + "epoch": 0.003365454139114202, + "grad_norm": 7.434862596192311, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1286, + "step": 32 + }, + { + "epoch": 0.003470624580961521, + "grad_norm": 7.734788123462178, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.1539, + "step": 33 + }, + { + "epoch": 0.0035757950228088396, + "grad_norm": 5.966202361098791, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.1445, + "step": 34 + }, + { + "epoch": 0.003680965464656158, + "grad_norm": 5.487171731422804, + "learning_rate": 1.75e-06, + "loss": 1.1067, + "step": 35 + }, + { + "epoch": 0.0037861359065034773, + "grad_norm": 4.431591345917228, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0843, + "step": 36 + }, + { + "epoch": 0.003891306348350796, + "grad_norm": 6.362482878285757, + "learning_rate": 1.85e-06, + "loss": 1.1035, + "step": 37 + }, + { + "epoch": 0.003996476790198115, + "grad_norm": 5.97952819240149, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.1151, + "step": 38 + }, + { + "epoch": 0.004101647232045434, + "grad_norm": 7.236076440399765, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.111, + "step": 39 + }, + { + "epoch": 0.004206817673892752, + "grad_norm": 7.231000749346946, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0934, + "step": 40 + }, + { + "epoch": 0.004311988115740071, + "grad_norm": 7.458173023837684, + "learning_rate": 2.05e-06, + "loss": 1.1124, + "step": 41 + }, + { + "epoch": 0.00441715855758739, + "grad_norm": 7.048786741399918, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.0972, + "step": 42 + }, + { + "epoch": 0.0045223289994347085, + "grad_norm": 5.904771802109113, + "learning_rate": 2.15e-06, + "loss": 1.1162, + "step": 43 + }, + { + "epoch": 0.004627499441282028, + "grad_norm": 8.892328258421735, + "learning_rate": 2.2e-06, + "loss": 1.1236, + "step": 44 + }, + { + "epoch": 0.004732669883129347, + "grad_norm": 5.984008442771571, + "learning_rate": 2.25e-06, + "loss": 1.1132, + "step": 45 + }, + { + "epoch": 0.004837840324976666, + "grad_norm": 4.433024003668077, + "learning_rate": 
2.3000000000000004e-06, + "loss": 1.0935, + "step": 46 + }, + { + "epoch": 0.004943010766823984, + "grad_norm": 6.204751514592563, + "learning_rate": 2.35e-06, + "loss": 1.0928, + "step": 47 + }, + { + "epoch": 0.005048181208671303, + "grad_norm": 5.759130507078808, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.129, + "step": 48 + }, + { + "epoch": 0.005153351650518622, + "grad_norm": 7.090689618351067, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.1351, + "step": 49 + }, + { + "epoch": 0.00525852209236594, + "grad_norm": 5.9876797839663745, + "learning_rate": 2.5e-06, + "loss": 1.0683, + "step": 50 + }, + { + "epoch": 0.005363692534213259, + "grad_norm": 7.595469488307658, + "learning_rate": 2.55e-06, + "loss": 1.0764, + "step": 51 + }, + { + "epoch": 0.005468862976060578, + "grad_norm": 5.978805506171299, + "learning_rate": 2.6e-06, + "loss": 1.1101, + "step": 52 + }, + { + "epoch": 0.005574033417907897, + "grad_norm": 5.389419934729486, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.1183, + "step": 53 + }, + { + "epoch": 0.005679203859755216, + "grad_norm": 4.867913228404486, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.1145, + "step": 54 + }, + { + "epoch": 0.005784374301602535, + "grad_norm": 8.051057067872096, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0788, + "step": 55 + }, + { + "epoch": 0.005889544743449854, + "grad_norm": 3.529571123644078, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.0933, + "step": 56 + }, + { + "epoch": 0.005994715185297172, + "grad_norm": 5.92833470887133, + "learning_rate": 2.85e-06, + "loss": 1.0814, + "step": 57 + }, + { + "epoch": 0.006099885627144491, + "grad_norm": 4.04957198212851, + "learning_rate": 2.9e-06, + "loss": 1.0894, + "step": 58 + }, + { + "epoch": 0.00620505606899181, + "grad_norm": 5.75800280839568, + "learning_rate": 2.95e-06, + "loss": 1.1081, + "step": 59 + }, + { + "epoch": 0.006310226510839128, + "grad_norm": 7.099778116155824, + "learning_rate": 3e-06, + "loss": 1.0787, + "step": 60 + }, + { + "epoch": 0.006415396952686447, + "grad_norm": 6.963451888461215, + "learning_rate": 3.05e-06, + "loss": 1.0834, + "step": 61 + }, + { + "epoch": 0.0065205673945337665, + "grad_norm": 6.132860287326327, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.1122, + "step": 62 + }, + { + "epoch": 0.0066257378363810855, + "grad_norm": 5.193732285933575, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.0615, + "step": 63 + }, + { + "epoch": 0.006730908278228404, + "grad_norm": 8.634476668713994, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1458, + "step": 64 + }, + { + "epoch": 0.006836078720075723, + "grad_norm": 6.06480211066226, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0445, + "step": 65 + }, + { + "epoch": 0.006941249161923042, + "grad_norm": 6.9138684741530305, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.1032, + "step": 66 + }, + { + "epoch": 0.00704641960377036, + "grad_norm": 5.982376980555477, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0606, + "step": 67 + }, + { + "epoch": 0.007151590045617679, + "grad_norm": 5.730675646561972, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.0767, + "step": 68 + }, + { + "epoch": 0.007256760487464998, + "grad_norm": 4.46760589551449, + "learning_rate": 3.45e-06, + "loss": 1.054, + "step": 69 + }, + { + "epoch": 0.007361930929312316, + "grad_norm": 6.883313991446145, + "learning_rate": 3.5e-06, + "loss": 1.06, + "step": 70 + }, + { + "epoch": 0.0074671013711596354, + "grad_norm": 
6.893517348928994, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.076, + "step": 71 + }, + { + "epoch": 0.0075722718130069545, + "grad_norm": 6.622296647960904, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.0866, + "step": 72 + }, + { + "epoch": 0.007677442254854274, + "grad_norm": 7.375737365019887, + "learning_rate": 3.65e-06, + "loss": 1.1335, + "step": 73 + }, + { + "epoch": 0.007782612696701592, + "grad_norm": 6.842915774489913, + "learning_rate": 3.7e-06, + "loss": 1.0819, + "step": 74 + }, + { + "epoch": 0.00788778313854891, + "grad_norm": 4.643560225418101, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.0627, + "step": 75 + }, + { + "epoch": 0.00799295358039623, + "grad_norm": 6.239260554512991, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.0939, + "step": 76 + }, + { + "epoch": 0.008098124022243549, + "grad_norm": 6.739902596879763, + "learning_rate": 3.85e-06, + "loss": 1.0527, + "step": 77 + }, + { + "epoch": 0.008203294464090868, + "grad_norm": 4.805461911549342, + "learning_rate": 3.900000000000001e-06, + "loss": 1.0995, + "step": 78 + }, + { + "epoch": 0.008308464905938185, + "grad_norm": 4.600713739789344, + "learning_rate": 3.95e-06, + "loss": 1.0967, + "step": 79 + }, + { + "epoch": 0.008413635347785504, + "grad_norm": 5.785549449267131, + "learning_rate": 4.000000000000001e-06, + "loss": 1.079, + "step": 80 + }, + { + "epoch": 0.008518805789632823, + "grad_norm": 5.974468879016202, + "learning_rate": 4.05e-06, + "loss": 1.0991, + "step": 81 + }, + { + "epoch": 0.008623976231480143, + "grad_norm": 6.971544736299156, + "learning_rate": 4.1e-06, + "loss": 1.0889, + "step": 82 + }, + { + "epoch": 0.008729146673327462, + "grad_norm": 5.443939888698424, + "learning_rate": 4.15e-06, + "loss": 1.0758, + "step": 83 + }, + { + "epoch": 0.00883431711517478, + "grad_norm": 4.987130690796422, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.1191, + "step": 84 + }, + { + "epoch": 0.0089394875570221, + "grad_norm": 4.91454726971417, + "learning_rate": 4.25e-06, + "loss": 1.0437, + "step": 85 + }, + { + "epoch": 0.009044657998869417, + "grad_norm": 4.1569197342316375, + "learning_rate": 4.3e-06, + "loss": 1.0681, + "step": 86 + }, + { + "epoch": 0.009149828440716736, + "grad_norm": 6.3276090379354795, + "learning_rate": 4.350000000000001e-06, + "loss": 1.0924, + "step": 87 + }, + { + "epoch": 0.009254998882564055, + "grad_norm": 7.136019535146, + "learning_rate": 4.4e-06, + "loss": 1.062, + "step": 88 + }, + { + "epoch": 0.009360169324411374, + "grad_norm": 7.000501847709018, + "learning_rate": 4.450000000000001e-06, + "loss": 1.0825, + "step": 89 + }, + { + "epoch": 0.009465339766258693, + "grad_norm": 8.348084064157154, + "learning_rate": 4.5e-06, + "loss": 1.098, + "step": 90 + }, + { + "epoch": 0.009570510208106012, + "grad_norm": 6.3365124926671585, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.0722, + "step": 91 + }, + { + "epoch": 0.009675680649953331, + "grad_norm": 6.850521475132954, + "learning_rate": 4.600000000000001e-06, + "loss": 1.0531, + "step": 92 + }, + { + "epoch": 0.009780851091800649, + "grad_norm": 3.534183914997874, + "learning_rate": 4.65e-06, + "loss": 1.0619, + "step": 93 + }, + { + "epoch": 0.009886021533647968, + "grad_norm": 5.513355921851675, + "learning_rate": 4.7e-06, + "loss": 1.0853, + "step": 94 + }, + { + "epoch": 0.009991191975495287, + "grad_norm": 4.3299555896256745, + "learning_rate": 4.75e-06, + "loss": 1.0755, + "step": 95 + }, + { + "epoch": 0.010096362417342606, + "grad_norm": 6.325453913242973, 
+ "learning_rate": 4.800000000000001e-06, + "loss": 1.0889, + "step": 96 + }, + { + "epoch": 0.010201532859189925, + "grad_norm": 4.789718329298915, + "learning_rate": 4.85e-06, + "loss": 1.0862, + "step": 97 + }, + { + "epoch": 0.010306703301037244, + "grad_norm": 3.8761128602402297, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.0612, + "step": 98 + }, + { + "epoch": 0.010411873742884561, + "grad_norm": 5.009647930586521, + "learning_rate": 4.95e-06, + "loss": 1.0595, + "step": 99 + }, + { + "epoch": 0.01051704418473188, + "grad_norm": 4.418331270407165, + "learning_rate": 5e-06, + "loss": 1.0824, + "step": 100 + }, + { + "epoch": 0.0106222146265792, + "grad_norm": 3.9028162186690802, + "learning_rate": 4.99999986061529e-06, + "loss": 1.0651, + "step": 101 + }, + { + "epoch": 0.010727385068426519, + "grad_norm": 6.8929955441897075, + "learning_rate": 4.999999442461175e-06, + "loss": 1.0997, + "step": 102 + }, + { + "epoch": 0.010832555510273838, + "grad_norm": 4.7248893287683185, + "learning_rate": 4.999998745537701e-06, + "loss": 1.0686, + "step": 103 + }, + { + "epoch": 0.010937725952121157, + "grad_norm": 5.812256570251753, + "learning_rate": 4.999997769844946e-06, + "loss": 1.0618, + "step": 104 + }, + { + "epoch": 0.011042896393968476, + "grad_norm": 2.708772175167679, + "learning_rate": 4.999996515383019e-06, + "loss": 1.0804, + "step": 105 + }, + { + "epoch": 0.011148066835815793, + "grad_norm": 6.2527627142277815, + "learning_rate": 4.9999949821520596e-06, + "loss": 1.1207, + "step": 106 + }, + { + "epoch": 0.011253237277663112, + "grad_norm": 5.5457481955779455, + "learning_rate": 4.99999317015224e-06, + "loss": 1.091, + "step": 107 + }, + { + "epoch": 0.011358407719510431, + "grad_norm": 5.761317582160581, + "learning_rate": 4.999991079383761e-06, + "loss": 1.095, + "step": 108 + }, + { + "epoch": 0.01146357816135775, + "grad_norm": 4.220512736500614, + "learning_rate": 4.999988709846856e-06, + "loss": 1.053, + "step": 109 + }, + { + "epoch": 0.01156874860320507, + "grad_norm": 6.66166762678115, + "learning_rate": 4.999986061541789e-06, + "loss": 1.0754, + "step": 110 + }, + { + "epoch": 0.011673919045052389, + "grad_norm": 3.35655019228154, + "learning_rate": 4.999983134468855e-06, + "loss": 1.0594, + "step": 111 + }, + { + "epoch": 0.011779089486899708, + "grad_norm": 4.939800790239611, + "learning_rate": 4.999979928628382e-06, + "loss": 1.0732, + "step": 112 + }, + { + "epoch": 0.011884259928747025, + "grad_norm": 5.566242564632212, + "learning_rate": 4.999976444020727e-06, + "loss": 1.1101, + "step": 113 + }, + { + "epoch": 0.011989430370594344, + "grad_norm": 4.129321853432761, + "learning_rate": 4.999972680646276e-06, + "loss": 1.0468, + "step": 114 + }, + { + "epoch": 0.012094600812441663, + "grad_norm": 5.459895093767263, + "learning_rate": 4.999968638505452e-06, + "loss": 1.1098, + "step": 115 + }, + { + "epoch": 0.012199771254288982, + "grad_norm": 4.268158570515893, + "learning_rate": 4.9999643175987045e-06, + "loss": 1.0813, + "step": 116 + }, + { + "epoch": 0.012304941696136301, + "grad_norm": 5.11466661467585, + "learning_rate": 4.999959717926515e-06, + "loss": 1.0552, + "step": 117 + }, + { + "epoch": 0.01241011213798362, + "grad_norm": 3.4664159857194274, + "learning_rate": 4.999954839489396e-06, + "loss": 1.0541, + "step": 118 + }, + { + "epoch": 0.01251528257983094, + "grad_norm": 3.9511165586614445, + "learning_rate": 4.999949682287893e-06, + "loss": 1.058, + "step": 119 + }, + { + "epoch": 0.012620453021678257, + "grad_norm": 3.614330210857767, + 
"learning_rate": 4.999944246322579e-06, + "loss": 1.0401, + "step": 120 + }, + { + "epoch": 0.012725623463525576, + "grad_norm": 5.105632604786978, + "learning_rate": 4.999938531594062e-06, + "loss": 1.0793, + "step": 121 + }, + { + "epoch": 0.012830793905372895, + "grad_norm": 4.957380407595618, + "learning_rate": 4.999932538102978e-06, + "loss": 1.0787, + "step": 122 + }, + { + "epoch": 0.012935964347220214, + "grad_norm": 4.643940273061057, + "learning_rate": 4.999926265849997e-06, + "loss": 1.0568, + "step": 123 + }, + { + "epoch": 0.013041134789067533, + "grad_norm": 4.627911589515215, + "learning_rate": 4.999919714835816e-06, + "loss": 1.1011, + "step": 124 + }, + { + "epoch": 0.013146305230914852, + "grad_norm": 3.102924045859167, + "learning_rate": 4.999912885061167e-06, + "loss": 1.0854, + "step": 125 + }, + { + "epoch": 0.013251475672762171, + "grad_norm": 4.92620716306597, + "learning_rate": 4.999905776526811e-06, + "loss": 1.106, + "step": 126 + }, + { + "epoch": 0.013356646114609488, + "grad_norm": 3.527926170990561, + "learning_rate": 4.9998983892335416e-06, + "loss": 1.0882, + "step": 127 + }, + { + "epoch": 0.013461816556456807, + "grad_norm": 4.005511364388478, + "learning_rate": 4.999890723182183e-06, + "loss": 1.0608, + "step": 128 + }, + { + "epoch": 0.013566986998304126, + "grad_norm": 3.6947799424472887, + "learning_rate": 4.9998827783735875e-06, + "loss": 1.082, + "step": 129 + }, + { + "epoch": 0.013672157440151446, + "grad_norm": 4.8537336025866225, + "learning_rate": 4.999874554808643e-06, + "loss": 1.0154, + "step": 130 + }, + { + "epoch": 0.013777327881998765, + "grad_norm": 5.224073594225298, + "learning_rate": 4.999866052488265e-06, + "loss": 1.0672, + "step": 131 + }, + { + "epoch": 0.013882498323846084, + "grad_norm": 6.699186460471001, + "learning_rate": 4.999857271413403e-06, + "loss": 1.1189, + "step": 132 + }, + { + "epoch": 0.013987668765693401, + "grad_norm": 3.974289494553639, + "learning_rate": 4.999848211585035e-06, + "loss": 1.0751, + "step": 133 + }, + { + "epoch": 0.01409283920754072, + "grad_norm": 4.528291252084844, + "learning_rate": 4.999838873004173e-06, + "loss": 1.0478, + "step": 134 + }, + { + "epoch": 0.01419800964938804, + "grad_norm": 4.971558341946056, + "learning_rate": 4.999829255671857e-06, + "loss": 1.0358, + "step": 135 + }, + { + "epoch": 0.014303180091235358, + "grad_norm": 4.085942947769176, + "learning_rate": 4.99981935958916e-06, + "loss": 1.0944, + "step": 136 + }, + { + "epoch": 0.014408350533082677, + "grad_norm": 6.664417732538393, + "learning_rate": 4.999809184757185e-06, + "loss": 1.0917, + "step": 137 + }, + { + "epoch": 0.014513520974929996, + "grad_norm": 4.806426966250999, + "learning_rate": 4.999798731177066e-06, + "loss": 1.1051, + "step": 138 + }, + { + "epoch": 0.014618691416777315, + "grad_norm": 4.561326503607282, + "learning_rate": 4.9997879988499695e-06, + "loss": 1.0322, + "step": 139 + }, + { + "epoch": 0.014723861858624633, + "grad_norm": 4.264177348921955, + "learning_rate": 4.999776987777093e-06, + "loss": 1.0489, + "step": 140 + }, + { + "epoch": 0.014829032300471952, + "grad_norm": 4.72227071149687, + "learning_rate": 4.9997656979596624e-06, + "loss": 1.0356, + "step": 141 + }, + { + "epoch": 0.014934202742319271, + "grad_norm": 5.240830574426804, + "learning_rate": 4.999754129398938e-06, + "loss": 1.0517, + "step": 142 + }, + { + "epoch": 0.01503937318416659, + "grad_norm": 3.7824663189008674, + "learning_rate": 4.999742282096209e-06, + "loss": 1.0765, + "step": 143 + }, + { + "epoch": 
0.015144543626013909, + "grad_norm": 4.031279478713754, + "learning_rate": 4.9997301560527976e-06, + "loss": 1.043, + "step": 144 + }, + { + "epoch": 0.015249714067861228, + "grad_norm": 3.4761815646490244, + "learning_rate": 4.999717751270055e-06, + "loss": 1.0808, + "step": 145 + }, + { + "epoch": 0.015354884509708547, + "grad_norm": 3.5505878417744525, + "learning_rate": 4.9997050677493646e-06, + "loss": 1.0576, + "step": 146 + }, + { + "epoch": 0.015460054951555864, + "grad_norm": 5.655354096949263, + "learning_rate": 4.999692105492141e-06, + "loss": 1.1257, + "step": 147 + }, + { + "epoch": 0.015565225393403184, + "grad_norm": 4.080899903125995, + "learning_rate": 4.999678864499828e-06, + "loss": 1.1199, + "step": 148 + }, + { + "epoch": 0.015670395835250504, + "grad_norm": 4.634309433383539, + "learning_rate": 4.9996653447739054e-06, + "loss": 1.105, + "step": 149 + }, + { + "epoch": 0.01577556627709782, + "grad_norm": 4.261945588917611, + "learning_rate": 4.999651546315877e-06, + "loss": 1.0543, + "step": 150 + }, + { + "epoch": 0.01588073671894514, + "grad_norm": 4.213617826403962, + "learning_rate": 4.999637469127284e-06, + "loss": 1.0604, + "step": 151 + }, + { + "epoch": 0.01598590716079246, + "grad_norm": 5.107781163495558, + "learning_rate": 4.9996231132096955e-06, + "loss": 1.075, + "step": 152 + }, + { + "epoch": 0.016091077602639777, + "grad_norm": 4.210150800964649, + "learning_rate": 4.999608478564713e-06, + "loss": 1.0672, + "step": 153 + }, + { + "epoch": 0.016196248044487098, + "grad_norm": 3.1431426378379084, + "learning_rate": 4.999593565193965e-06, + "loss": 1.0347, + "step": 154 + }, + { + "epoch": 0.016301418486334415, + "grad_norm": 4.579222492136831, + "learning_rate": 4.999578373099119e-06, + "loss": 1.0858, + "step": 155 + }, + { + "epoch": 0.016406588928181736, + "grad_norm": 5.066019223738902, + "learning_rate": 4.999562902281866e-06, + "loss": 1.0952, + "step": 156 + }, + { + "epoch": 0.016511759370029053, + "grad_norm": 2.6553215468038216, + "learning_rate": 4.999547152743933e-06, + "loss": 1.062, + "step": 157 + }, + { + "epoch": 0.01661692981187637, + "grad_norm": 2.465224547489308, + "learning_rate": 4.999531124487074e-06, + "loss": 1.0518, + "step": 158 + }, + { + "epoch": 0.01672210025372369, + "grad_norm": 3.446424772289735, + "learning_rate": 4.999514817513079e-06, + "loss": 1.0765, + "step": 159 + }, + { + "epoch": 0.01682727069557101, + "grad_norm": 5.154039712884113, + "learning_rate": 4.999498231823765e-06, + "loss": 1.0515, + "step": 160 + }, + { + "epoch": 0.01693244113741833, + "grad_norm": 3.327200405368395, + "learning_rate": 4.9994813674209805e-06, + "loss": 1.0734, + "step": 161 + }, + { + "epoch": 0.017037611579265647, + "grad_norm": 6.642307412221283, + "learning_rate": 4.999464224306607e-06, + "loss": 1.1043, + "step": 162 + }, + { + "epoch": 0.017142782021112968, + "grad_norm": 3.530091197441022, + "learning_rate": 4.999446802482556e-06, + "loss": 1.0833, + "step": 163 + }, + { + "epoch": 0.017247952462960285, + "grad_norm": 5.657095039085531, + "learning_rate": 4.99942910195077e-06, + "loss": 1.0632, + "step": 164 + }, + { + "epoch": 0.017353122904807602, + "grad_norm": 5.574109052251118, + "learning_rate": 4.999411122713223e-06, + "loss": 1.095, + "step": 165 + }, + { + "epoch": 0.017458293346654923, + "grad_norm": 5.919741121398457, + "learning_rate": 4.99939286477192e-06, + "loss": 1.0714, + "step": 166 + }, + { + "epoch": 0.01756346378850224, + "grad_norm": 5.72438831771799, + "learning_rate": 4.999374328128896e-06, + "loss": 
1.1101, + "step": 167 + }, + { + "epoch": 0.01766863423034956, + "grad_norm": 3.6834749885484404, + "learning_rate": 4.99935551278622e-06, + "loss": 1.0671, + "step": 168 + }, + { + "epoch": 0.01777380467219688, + "grad_norm": 3.573832404245989, + "learning_rate": 4.999336418745987e-06, + "loss": 1.0852, + "step": 169 + }, + { + "epoch": 0.0178789751140442, + "grad_norm": 4.735678508134887, + "learning_rate": 4.9993170460103295e-06, + "loss": 1.0536, + "step": 170 + }, + { + "epoch": 0.017984145555891517, + "grad_norm": 4.056452634779431, + "learning_rate": 4.9992973945814045e-06, + "loss": 1.0438, + "step": 171 + }, + { + "epoch": 0.018089315997738834, + "grad_norm": 4.957195740828829, + "learning_rate": 4.999277464461405e-06, + "loss": 1.0839, + "step": 172 + }, + { + "epoch": 0.018194486439586155, + "grad_norm": 3.926000211879302, + "learning_rate": 4.9992572556525535e-06, + "loss": 1.0788, + "step": 173 + }, + { + "epoch": 0.018299656881433472, + "grad_norm": 4.037823469074097, + "learning_rate": 4.999236768157103e-06, + "loss": 1.0648, + "step": 174 + }, + { + "epoch": 0.018404827323280793, + "grad_norm": 3.5763594650594706, + "learning_rate": 4.999216001977338e-06, + "loss": 1.0776, + "step": 175 + }, + { + "epoch": 0.01850999776512811, + "grad_norm": 4.216315800530571, + "learning_rate": 4.999194957115574e-06, + "loss": 1.0727, + "step": 176 + }, + { + "epoch": 0.01861516820697543, + "grad_norm": 3.7534868688617897, + "learning_rate": 4.999173633574158e-06, + "loss": 1.0114, + "step": 177 + }, + { + "epoch": 0.01872033864882275, + "grad_norm": 3.2854824452012257, + "learning_rate": 4.999152031355468e-06, + "loss": 1.0757, + "step": 178 + }, + { + "epoch": 0.018825509090670066, + "grad_norm": 5.452335174224296, + "learning_rate": 4.999130150461912e-06, + "loss": 1.1222, + "step": 179 + }, + { + "epoch": 0.018930679532517387, + "grad_norm": 4.533901928234252, + "learning_rate": 4.999107990895931e-06, + "loss": 1.0774, + "step": 180 + }, + { + "epoch": 0.019035849974364704, + "grad_norm": 3.9049991918717053, + "learning_rate": 4.999085552659995e-06, + "loss": 1.0649, + "step": 181 + }, + { + "epoch": 0.019141020416212025, + "grad_norm": 3.813729543734002, + "learning_rate": 4.9990628357566055e-06, + "loss": 1.0448, + "step": 182 + }, + { + "epoch": 0.019246190858059342, + "grad_norm": 3.5500653660890524, + "learning_rate": 4.999039840188297e-06, + "loss": 1.0776, + "step": 183 + }, + { + "epoch": 0.019351361299906663, + "grad_norm": 4.714466443116458, + "learning_rate": 4.999016565957633e-06, + "loss": 1.1458, + "step": 184 + }, + { + "epoch": 0.01945653174175398, + "grad_norm": 5.858915587801188, + "learning_rate": 4.9989930130672085e-06, + "loss": 1.067, + "step": 185 + }, + { + "epoch": 0.019561702183601298, + "grad_norm": 6.034299083940081, + "learning_rate": 4.99896918151965e-06, + "loss": 1.0984, + "step": 186 + }, + { + "epoch": 0.01966687262544862, + "grad_norm": 4.074883949501879, + "learning_rate": 4.9989450713176156e-06, + "loss": 1.0182, + "step": 187 + }, + { + "epoch": 0.019772043067295936, + "grad_norm": 3.8546892311347984, + "learning_rate": 4.998920682463794e-06, + "loss": 1.1006, + "step": 188 + }, + { + "epoch": 0.019877213509143257, + "grad_norm": 2.8323107910480316, + "learning_rate": 4.998896014960904e-06, + "loss": 1.1025, + "step": 189 + }, + { + "epoch": 0.019982383950990574, + "grad_norm": 3.2647909530548818, + "learning_rate": 4.998871068811695e-06, + "loss": 1.0553, + "step": 190 + }, + { + "epoch": 0.02008755439283789, + "grad_norm": 4.8772055016374924, + 
"learning_rate": 4.99884584401895e-06, + "loss": 1.0435, + "step": 191 + }, + { + "epoch": 0.020192724834685212, + "grad_norm": 4.446998405870374, + "learning_rate": 4.998820340585482e-06, + "loss": 1.0841, + "step": 192 + }, + { + "epoch": 0.02029789527653253, + "grad_norm": 6.789642965482605, + "learning_rate": 4.998794558514135e-06, + "loss": 1.0501, + "step": 193 + }, + { + "epoch": 0.02040306571837985, + "grad_norm": 5.368972583293381, + "learning_rate": 4.998768497807783e-06, + "loss": 1.0759, + "step": 194 + }, + { + "epoch": 0.020508236160227167, + "grad_norm": 4.6463740756090175, + "learning_rate": 4.9987421584693316e-06, + "loss": 1.0359, + "step": 195 + }, + { + "epoch": 0.02061340660207449, + "grad_norm": 3.638853744840631, + "learning_rate": 4.99871554050172e-06, + "loss": 1.043, + "step": 196 + }, + { + "epoch": 0.020718577043921806, + "grad_norm": 5.963532277665872, + "learning_rate": 4.998688643907914e-06, + "loss": 1.0821, + "step": 197 + }, + { + "epoch": 0.020823747485769123, + "grad_norm": 4.211503968938633, + "learning_rate": 4.998661468690914e-06, + "loss": 1.0341, + "step": 198 + }, + { + "epoch": 0.020928917927616444, + "grad_norm": 5.9295459087612095, + "learning_rate": 4.9986340148537506e-06, + "loss": 1.0743, + "step": 199 + }, + { + "epoch": 0.02103408836946376, + "grad_norm": 3.4504029111986774, + "learning_rate": 4.998606282399484e-06, + "loss": 1.0975, + "step": 200 + }, + { + "epoch": 0.021139258811311082, + "grad_norm": 2.7556837965327654, + "learning_rate": 4.998578271331207e-06, + "loss": 1.1087, + "step": 201 + }, + { + "epoch": 0.0212444292531584, + "grad_norm": 4.467263377291867, + "learning_rate": 4.998549981652043e-06, + "loss": 1.0805, + "step": 202 + }, + { + "epoch": 0.02134959969500572, + "grad_norm": 3.12366809969675, + "learning_rate": 4.998521413365147e-06, + "loss": 1.1032, + "step": 203 + }, + { + "epoch": 0.021454770136853037, + "grad_norm": 3.678013910323886, + "learning_rate": 4.998492566473704e-06, + "loss": 1.0831, + "step": 204 + }, + { + "epoch": 0.021559940578700355, + "grad_norm": 4.534344934215397, + "learning_rate": 4.998463440980931e-06, + "loss": 1.0911, + "step": 205 + }, + { + "epoch": 0.021665111020547675, + "grad_norm": 5.977433768607247, + "learning_rate": 4.998434036890075e-06, + "loss": 1.0532, + "step": 206 + }, + { + "epoch": 0.021770281462394993, + "grad_norm": 3.097171042849419, + "learning_rate": 4.998404354204416e-06, + "loss": 1.0624, + "step": 207 + }, + { + "epoch": 0.021875451904242314, + "grad_norm": 4.741569481149735, + "learning_rate": 4.998374392927262e-06, + "loss": 1.0814, + "step": 208 + }, + { + "epoch": 0.02198062234608963, + "grad_norm": 5.387514870731206, + "learning_rate": 4.998344153061957e-06, + "loss": 1.0667, + "step": 209 + }, + { + "epoch": 0.02208579278793695, + "grad_norm": 3.107349100997203, + "learning_rate": 4.99831363461187e-06, + "loss": 1.0283, + "step": 210 + }, + { + "epoch": 0.02219096322978427, + "grad_norm": 4.446119580642245, + "learning_rate": 4.998282837580405e-06, + "loss": 1.0642, + "step": 211 + }, + { + "epoch": 0.022296133671631586, + "grad_norm": 3.6643593798924377, + "learning_rate": 4.998251761970997e-06, + "loss": 1.0556, + "step": 212 + }, + { + "epoch": 0.022401304113478907, + "grad_norm": 3.2516463208152917, + "learning_rate": 4.99822040778711e-06, + "loss": 1.0386, + "step": 213 + }, + { + "epoch": 0.022506474555326225, + "grad_norm": 5.733316290126696, + "learning_rate": 4.998188775032241e-06, + "loss": 1.0906, + "step": 214 + }, + { + "epoch": 
0.022611644997173545, + "grad_norm": 4.91709171945248, + "learning_rate": 4.998156863709917e-06, + "loss": 1.0903, + "step": 215 + }, + { + "epoch": 0.022716815439020863, + "grad_norm": 4.657782710443831, + "learning_rate": 4.998124673823695e-06, + "loss": 1.1172, + "step": 216 + }, + { + "epoch": 0.022821985880868183, + "grad_norm": 6.169459224100382, + "learning_rate": 4.998092205377168e-06, + "loss": 1.0814, + "step": 217 + }, + { + "epoch": 0.0229271563227155, + "grad_norm": 4.948076959697061, + "learning_rate": 4.998059458373952e-06, + "loss": 1.1041, + "step": 218 + }, + { + "epoch": 0.023032326764562818, + "grad_norm": 5.515487044434082, + "learning_rate": 4.998026432817702e-06, + "loss": 1.0771, + "step": 219 + }, + { + "epoch": 0.02313749720641014, + "grad_norm": 3.6929446615249364, + "learning_rate": 4.9979931287121e-06, + "loss": 1.0612, + "step": 220 + }, + { + "epoch": 0.023242667648257456, + "grad_norm": 3.8867790075290976, + "learning_rate": 4.9979595460608575e-06, + "loss": 1.0381, + "step": 221 + }, + { + "epoch": 0.023347838090104777, + "grad_norm": 4.9882563091630825, + "learning_rate": 4.997925684867721e-06, + "loss": 1.096, + "step": 222 + }, + { + "epoch": 0.023453008531952094, + "grad_norm": 3.8165955011698727, + "learning_rate": 4.997891545136467e-06, + "loss": 1.04, + "step": 223 + }, + { + "epoch": 0.023558178973799415, + "grad_norm": 4.840235092033454, + "learning_rate": 4.997857126870902e-06, + "loss": 1.0896, + "step": 224 + }, + { + "epoch": 0.023663349415646732, + "grad_norm": 5.334008497621717, + "learning_rate": 4.9978224300748625e-06, + "loss": 1.1243, + "step": 225 + }, + { + "epoch": 0.02376851985749405, + "grad_norm": 3.6424315914693337, + "learning_rate": 4.997787454752217e-06, + "loss": 1.0253, + "step": 226 + }, + { + "epoch": 0.02387369029934137, + "grad_norm": 3.7986132911207617, + "learning_rate": 4.997752200906868e-06, + "loss": 1.0196, + "step": 227 + }, + { + "epoch": 0.023978860741188688, + "grad_norm": 3.5858200402131777, + "learning_rate": 4.997716668542746e-06, + "loss": 1.0516, + "step": 228 + }, + { + "epoch": 0.02408403118303601, + "grad_norm": 4.090414058227785, + "learning_rate": 4.997680857663812e-06, + "loss": 1.0889, + "step": 229 + }, + { + "epoch": 0.024189201624883326, + "grad_norm": 6.149411432586733, + "learning_rate": 4.997644768274059e-06, + "loss": 1.0915, + "step": 230 + }, + { + "epoch": 0.024294372066730647, + "grad_norm": 4.990447662747164, + "learning_rate": 4.997608400377513e-06, + "loss": 1.1035, + "step": 231 + }, + { + "epoch": 0.024399542508577964, + "grad_norm": 3.6379678181805626, + "learning_rate": 4.997571753978228e-06, + "loss": 1.0523, + "step": 232 + }, + { + "epoch": 0.02450471295042528, + "grad_norm": 4.063194976272717, + "learning_rate": 4.99753482908029e-06, + "loss": 1.0584, + "step": 233 + }, + { + "epoch": 0.024609883392272602, + "grad_norm": 4.805240814647609, + "learning_rate": 4.997497625687818e-06, + "loss": 1.1217, + "step": 234 + }, + { + "epoch": 0.02471505383411992, + "grad_norm": 4.213633680202407, + "learning_rate": 4.997460143804958e-06, + "loss": 1.0653, + "step": 235 + }, + { + "epoch": 0.02482022427596724, + "grad_norm": 4.605436966627413, + "learning_rate": 4.997422383435893e-06, + "loss": 1.0525, + "step": 236 + }, + { + "epoch": 0.024925394717814558, + "grad_norm": 3.403523309179731, + "learning_rate": 4.99738434458483e-06, + "loss": 1.0587, + "step": 237 + }, + { + "epoch": 0.02503056515966188, + "grad_norm": 3.2122262832368973, + "learning_rate": 4.997346027256013e-06, + "loss": 
1.0795, + "step": 238 + }, + { + "epoch": 0.025135735601509196, + "grad_norm": 4.571309867532744, + "learning_rate": 4.997307431453713e-06, + "loss": 1.0673, + "step": 239 + }, + { + "epoch": 0.025240906043356513, + "grad_norm": 3.5447243507334396, + "learning_rate": 4.9972685571822355e-06, + "loss": 1.0261, + "step": 240 + }, + { + "epoch": 0.025346076485203834, + "grad_norm": 3.2377036811282336, + "learning_rate": 4.997229404445914e-06, + "loss": 1.0424, + "step": 241 + }, + { + "epoch": 0.02545124692705115, + "grad_norm": 3.6188960442806017, + "learning_rate": 4.997189973249115e-06, + "loss": 1.0553, + "step": 242 + }, + { + "epoch": 0.025556417368898472, + "grad_norm": 3.3956125214606336, + "learning_rate": 4.997150263596236e-06, + "loss": 1.0437, + "step": 243 + }, + { + "epoch": 0.02566158781074579, + "grad_norm": 4.3265152622652, + "learning_rate": 4.997110275491702e-06, + "loss": 1.0729, + "step": 244 + }, + { + "epoch": 0.02576675825259311, + "grad_norm": 4.350664171417144, + "learning_rate": 4.997070008939976e-06, + "loss": 1.0522, + "step": 245 + }, + { + "epoch": 0.025871928694440428, + "grad_norm": 3.8316671691883872, + "learning_rate": 4.997029463945545e-06, + "loss": 1.0685, + "step": 246 + }, + { + "epoch": 0.025977099136287745, + "grad_norm": 3.5671479642145076, + "learning_rate": 4.996988640512931e-06, + "loss": 1.066, + "step": 247 + }, + { + "epoch": 0.026082269578135066, + "grad_norm": 3.947898665647903, + "learning_rate": 4.996947538646687e-06, + "loss": 1.0524, + "step": 248 + }, + { + "epoch": 0.026187440019982383, + "grad_norm": 3.499962475992957, + "learning_rate": 4.996906158351396e-06, + "loss": 1.0395, + "step": 249 + }, + { + "epoch": 0.026292610461829704, + "grad_norm": 3.653224022870733, + "learning_rate": 4.99686449963167e-06, + "loss": 1.0245, + "step": 250 + }, + { + "epoch": 0.02639778090367702, + "grad_norm": 3.2905872706720634, + "learning_rate": 4.996822562492157e-06, + "loss": 1.1054, + "step": 251 + }, + { + "epoch": 0.026502951345524342, + "grad_norm": 4.813210566349707, + "learning_rate": 4.996780346937532e-06, + "loss": 1.1126, + "step": 252 + }, + { + "epoch": 0.02660812178737166, + "grad_norm": 3.8457085777908926, + "learning_rate": 4.996737852972503e-06, + "loss": 1.0783, + "step": 253 + }, + { + "epoch": 0.026713292229218977, + "grad_norm": 3.7993606994618316, + "learning_rate": 4.9966950806018075e-06, + "loss": 1.0461, + "step": 254 + }, + { + "epoch": 0.026818462671066298, + "grad_norm": 4.8200166677976375, + "learning_rate": 4.996652029830216e-06, + "loss": 1.0875, + "step": 255 + }, + { + "epoch": 0.026923633112913615, + "grad_norm": 4.328119154393376, + "learning_rate": 4.996608700662528e-06, + "loss": 1.0739, + "step": 256 + }, + { + "epoch": 0.027028803554760936, + "grad_norm": 3.1342866705412686, + "learning_rate": 4.996565093103576e-06, + "loss": 1.0948, + "step": 257 + }, + { + "epoch": 0.027133973996608253, + "grad_norm": 4.961920624574888, + "learning_rate": 4.996521207158222e-06, + "loss": 1.0635, + "step": 258 + }, + { + "epoch": 0.027239144438455574, + "grad_norm": 3.360160077037182, + "learning_rate": 4.99647704283136e-06, + "loss": 1.0652, + "step": 259 + }, + { + "epoch": 0.02734431488030289, + "grad_norm": 3.5370384271061077, + "learning_rate": 4.9964326001279145e-06, + "loss": 1.0602, + "step": 260 + }, + { + "epoch": 0.02744948532215021, + "grad_norm": 3.991312523764047, + "learning_rate": 4.996387879052841e-06, + "loss": 1.0563, + "step": 261 + }, + { + "epoch": 0.02755465576399753, + "grad_norm": 3.80460958358854, + 
"learning_rate": 4.996342879611126e-06, + "loss": 1.0713, + "step": 262 + }, + { + "epoch": 0.027659826205844847, + "grad_norm": 4.454127422134502, + "learning_rate": 4.996297601807788e-06, + "loss": 1.0725, + "step": 263 + }, + { + "epoch": 0.027764996647692167, + "grad_norm": 4.743749897023404, + "learning_rate": 4.996252045647875e-06, + "loss": 1.0079, + "step": 264 + }, + { + "epoch": 0.027870167089539485, + "grad_norm": 4.979818582007352, + "learning_rate": 4.9962062111364675e-06, + "loss": 1.0908, + "step": 265 + }, + { + "epoch": 0.027975337531386802, + "grad_norm": 3.8751951820571597, + "learning_rate": 4.996160098278676e-06, + "loss": 1.0515, + "step": 266 + }, + { + "epoch": 0.028080507973234123, + "grad_norm": 3.5344802127222783, + "learning_rate": 4.996113707079644e-06, + "loss": 1.0124, + "step": 267 + }, + { + "epoch": 0.02818567841508144, + "grad_norm": 4.089090640577849, + "learning_rate": 4.996067037544542e-06, + "loss": 1.0692, + "step": 268 + }, + { + "epoch": 0.02829084885692876, + "grad_norm": 3.2448736578587423, + "learning_rate": 4.996020089678575e-06, + "loss": 1.039, + "step": 269 + }, + { + "epoch": 0.02839601929877608, + "grad_norm": 4.037941021774882, + "learning_rate": 4.995972863486978e-06, + "loss": 1.08, + "step": 270 + }, + { + "epoch": 0.0285011897406234, + "grad_norm": 4.186559650612413, + "learning_rate": 4.9959253589750185e-06, + "loss": 1.0745, + "step": 271 + }, + { + "epoch": 0.028606360182470716, + "grad_norm": 2.7388820923632777, + "learning_rate": 4.995877576147993e-06, + "loss": 1.0651, + "step": 272 + }, + { + "epoch": 0.028711530624318034, + "grad_norm": 3.5856157401608666, + "learning_rate": 4.9958295150112275e-06, + "loss": 1.0874, + "step": 273 + }, + { + "epoch": 0.028816701066165355, + "grad_norm": 3.380461362281521, + "learning_rate": 4.995781175570083e-06, + "loss": 1.0709, + "step": 274 + }, + { + "epoch": 0.028921871508012672, + "grad_norm": 3.5335871406927724, + "learning_rate": 4.99573255782995e-06, + "loss": 1.0515, + "step": 275 + }, + { + "epoch": 0.029027041949859993, + "grad_norm": 4.463613237000542, + "learning_rate": 4.995683661796249e-06, + "loss": 1.0656, + "step": 276 + }, + { + "epoch": 0.02913221239170731, + "grad_norm": 3.4536408983322886, + "learning_rate": 4.995634487474433e-06, + "loss": 1.0725, + "step": 277 + }, + { + "epoch": 0.02923738283355463, + "grad_norm": 4.333343256417978, + "learning_rate": 4.995585034869984e-06, + "loss": 1.0408, + "step": 278 + }, + { + "epoch": 0.029342553275401948, + "grad_norm": 3.9790917572612847, + "learning_rate": 4.995535303988418e-06, + "loss": 1.0569, + "step": 279 + }, + { + "epoch": 0.029447723717249265, + "grad_norm": 3.5455400588602717, + "learning_rate": 4.99548529483528e-06, + "loss": 1.049, + "step": 280 + }, + { + "epoch": 0.029552894159096586, + "grad_norm": 2.336572942885481, + "learning_rate": 4.995435007416145e-06, + "loss": 1.0106, + "step": 281 + }, + { + "epoch": 0.029658064600943904, + "grad_norm": 4.019746471320661, + "learning_rate": 4.995384441736622e-06, + "loss": 1.0501, + "step": 282 + }, + { + "epoch": 0.029763235042791224, + "grad_norm": 5.0468126375485065, + "learning_rate": 4.995333597802349e-06, + "loss": 1.1031, + "step": 283 + }, + { + "epoch": 0.029868405484638542, + "grad_norm": 4.142494686730891, + "learning_rate": 4.995282475618995e-06, + "loss": 1.0887, + "step": 284 + }, + { + "epoch": 0.029973575926485863, + "grad_norm": 5.271308613066555, + "learning_rate": 4.9952310751922615e-06, + "loss": 1.0625, + "step": 285 + }, + { + "epoch": 
0.03007874636833318, + "grad_norm": 4.001834907924138, + "learning_rate": 4.995179396527878e-06, + "loss": 1.0622, + "step": 286 + }, + { + "epoch": 0.030183916810180497, + "grad_norm": 3.9801706203568856, + "learning_rate": 4.99512743963161e-06, + "loss": 1.0791, + "step": 287 + }, + { + "epoch": 0.030289087252027818, + "grad_norm": 5.1051985033669425, + "learning_rate": 4.99507520450925e-06, + "loss": 1.0684, + "step": 288 + }, + { + "epoch": 0.030394257693875135, + "grad_norm": 3.936963708694301, + "learning_rate": 4.995022691166621e-06, + "loss": 1.047, + "step": 289 + }, + { + "epoch": 0.030499428135722456, + "grad_norm": 3.714404487265949, + "learning_rate": 4.994969899609581e-06, + "loss": 1.0303, + "step": 290 + }, + { + "epoch": 0.030604598577569773, + "grad_norm": 4.912241616953762, + "learning_rate": 4.994916829844015e-06, + "loss": 1.0836, + "step": 291 + }, + { + "epoch": 0.030709769019417094, + "grad_norm": 4.067161813822889, + "learning_rate": 4.994863481875842e-06, + "loss": 1.0335, + "step": 292 + }, + { + "epoch": 0.03081493946126441, + "grad_norm": 4.6662969482103644, + "learning_rate": 4.994809855711009e-06, + "loss": 1.0666, + "step": 293 + }, + { + "epoch": 0.03092010990311173, + "grad_norm": 4.517226678040155, + "learning_rate": 4.994755951355496e-06, + "loss": 1.0358, + "step": 294 + }, + { + "epoch": 0.03102528034495905, + "grad_norm": 3.7309332333912724, + "learning_rate": 4.994701768815317e-06, + "loss": 1.0611, + "step": 295 + }, + { + "epoch": 0.031130450786806367, + "grad_norm": 4.014104758735321, + "learning_rate": 4.994647308096509e-06, + "loss": 1.0727, + "step": 296 + }, + { + "epoch": 0.031235621228653688, + "grad_norm": 3.29886953781129, + "learning_rate": 4.994592569205148e-06, + "loss": 1.0919, + "step": 297 + }, + { + "epoch": 0.03134079167050101, + "grad_norm": 2.7078199003469257, + "learning_rate": 4.994537552147337e-06, + "loss": 1.0173, + "step": 298 + }, + { + "epoch": 0.031445962112348326, + "grad_norm": 2.8266073729377332, + "learning_rate": 4.994482256929211e-06, + "loss": 1.0709, + "step": 299 + }, + { + "epoch": 0.03155113255419564, + "grad_norm": 3.268397966576739, + "learning_rate": 4.994426683556935e-06, + "loss": 1.0217, + "step": 300 + }, + { + "epoch": 0.03165630299604296, + "grad_norm": 2.6632983909674994, + "learning_rate": 4.994370832036705e-06, + "loss": 1.0279, + "step": 301 + }, + { + "epoch": 0.03176147343789028, + "grad_norm": 4.0187186523919705, + "learning_rate": 4.994314702374753e-06, + "loss": 1.0646, + "step": 302 + }, + { + "epoch": 0.0318666438797376, + "grad_norm": 4.204278869511361, + "learning_rate": 4.994258294577333e-06, + "loss": 1.0762, + "step": 303 + }, + { + "epoch": 0.03197181432158492, + "grad_norm": 4.383109964538248, + "learning_rate": 4.9942016086507384e-06, + "loss": 1.0602, + "step": 304 + }, + { + "epoch": 0.03207698476343224, + "grad_norm": 5.459005697486776, + "learning_rate": 4.9941446446012874e-06, + "loss": 1.0854, + "step": 305 + }, + { + "epoch": 0.032182155205279554, + "grad_norm": 3.251384242833329, + "learning_rate": 4.994087402435335e-06, + "loss": 1.0001, + "step": 306 + }, + { + "epoch": 0.03228732564712687, + "grad_norm": 4.629466051075931, + "learning_rate": 4.994029882159261e-06, + "loss": 1.0552, + "step": 307 + }, + { + "epoch": 0.032392496088974196, + "grad_norm": 2.716529865958913, + "learning_rate": 4.993972083779482e-06, + "loss": 1.0108, + "step": 308 + }, + { + "epoch": 0.03249766653082151, + "grad_norm": 3.575967885671494, + "learning_rate": 4.993914007302441e-06, + "loss": 
1.0709, + "step": 309 + }, + { + "epoch": 0.03260283697266883, + "grad_norm": 3.5669292643875186, + "learning_rate": 4.993855652734616e-06, + "loss": 1.0912, + "step": 310 + }, + { + "epoch": 0.03270800741451615, + "grad_norm": 4.010833762984559, + "learning_rate": 4.993797020082511e-06, + "loss": 1.0583, + "step": 311 + }, + { + "epoch": 0.03281317785636347, + "grad_norm": 4.104510377305269, + "learning_rate": 4.9937381093526675e-06, + "loss": 1.0087, + "step": 312 + }, + { + "epoch": 0.03291834829821079, + "grad_norm": 4.784979263043128, + "learning_rate": 4.993678920551651e-06, + "loss": 1.0615, + "step": 313 + }, + { + "epoch": 0.03302351874005811, + "grad_norm": 4.164191063473395, + "learning_rate": 4.993619453686065e-06, + "loss": 1.0409, + "step": 314 + }, + { + "epoch": 0.033128689181905424, + "grad_norm": 3.1649556775301733, + "learning_rate": 4.993559708762538e-06, + "loss": 1.0811, + "step": 315 + }, + { + "epoch": 0.03323385962375274, + "grad_norm": 3.6637842760946917, + "learning_rate": 4.993499685787732e-06, + "loss": 1.0539, + "step": 316 + }, + { + "epoch": 0.033339030065600066, + "grad_norm": 3.1821001996556157, + "learning_rate": 4.993439384768342e-06, + "loss": 1.0799, + "step": 317 + }, + { + "epoch": 0.03344420050744738, + "grad_norm": 3.3225742030256535, + "learning_rate": 4.993378805711091e-06, + "loss": 1.079, + "step": 318 + }, + { + "epoch": 0.0335493709492947, + "grad_norm": 3.7196848657078045, + "learning_rate": 4.993317948622733e-06, + "loss": 1.0537, + "step": 319 + }, + { + "epoch": 0.03365454139114202, + "grad_norm": 3.0347036015285385, + "learning_rate": 4.993256813510055e-06, + "loss": 1.0425, + "step": 320 + }, + { + "epoch": 0.033759711832989335, + "grad_norm": 3.4466877790068056, + "learning_rate": 4.993195400379875e-06, + "loss": 1.0275, + "step": 321 + }, + { + "epoch": 0.03386488227483666, + "grad_norm": 3.4937996524768393, + "learning_rate": 4.993133709239039e-06, + "loss": 1.0074, + "step": 322 + }, + { + "epoch": 0.03397005271668398, + "grad_norm": 3.961714485473967, + "learning_rate": 4.993071740094428e-06, + "loss": 1.0152, + "step": 323 + }, + { + "epoch": 0.034075223158531294, + "grad_norm": 3.882633562046403, + "learning_rate": 4.993009492952951e-06, + "loss": 1.0408, + "step": 324 + }, + { + "epoch": 0.03418039360037861, + "grad_norm": 4.777184945580783, + "learning_rate": 4.992946967821549e-06, + "loss": 1.095, + "step": 325 + }, + { + "epoch": 0.034285564042225936, + "grad_norm": 4.8588365098982775, + "learning_rate": 4.9928841647071936e-06, + "loss": 1.0568, + "step": 326 + }, + { + "epoch": 0.03439073448407325, + "grad_norm": 3.6839425202673297, + "learning_rate": 4.992821083616889e-06, + "loss": 1.0496, + "step": 327 + }, + { + "epoch": 0.03449590492592057, + "grad_norm": 3.66693362260413, + "learning_rate": 4.992757724557669e-06, + "loss": 1.0753, + "step": 328 + }, + { + "epoch": 0.03460107536776789, + "grad_norm": 4.978727167916742, + "learning_rate": 4.992694087536597e-06, + "loss": 1.0729, + "step": 329 + }, + { + "epoch": 0.034706245809615205, + "grad_norm": 5.114707679163974, + "learning_rate": 4.992630172560771e-06, + "loss": 1.0943, + "step": 330 + }, + { + "epoch": 0.03481141625146253, + "grad_norm": 3.163523736651089, + "learning_rate": 4.992565979637318e-06, + "loss": 1.0419, + "step": 331 + }, + { + "epoch": 0.034916586693309846, + "grad_norm": 3.636944273388002, + "learning_rate": 4.9925015087733945e-06, + "loss": 1.047, + "step": 332 + }, + { + "epoch": 0.035021757135157164, + "grad_norm": 4.259395764869889, + 
"learning_rate": 4.992436759976191e-06, + "loss": 1.0776, + "step": 333 + }, + { + "epoch": 0.03512692757700448, + "grad_norm": 3.920573922209101, + "learning_rate": 4.992371733252926e-06, + "loss": 1.0535, + "step": 334 + }, + { + "epoch": 0.0352320980188518, + "grad_norm": 4.293490908646825, + "learning_rate": 4.992306428610853e-06, + "loss": 1.0483, + "step": 335 + }, + { + "epoch": 0.03533726846069912, + "grad_norm": 3.6789152142394914, + "learning_rate": 4.9922408460572505e-06, + "loss": 1.0418, + "step": 336 + }, + { + "epoch": 0.03544243890254644, + "grad_norm": 3.3817970301847238, + "learning_rate": 4.992174985599434e-06, + "loss": 1.0514, + "step": 337 + }, + { + "epoch": 0.03554760934439376, + "grad_norm": 3.3702499323179596, + "learning_rate": 4.992108847244746e-06, + "loss": 1.0259, + "step": 338 + }, + { + "epoch": 0.035652779786241075, + "grad_norm": 4.5111317152835815, + "learning_rate": 4.9920424310005625e-06, + "loss": 1.0555, + "step": 339 + }, + { + "epoch": 0.0357579502280884, + "grad_norm": 4.693632339648465, + "learning_rate": 4.9919757368742895e-06, + "loss": 1.0464, + "step": 340 + }, + { + "epoch": 0.035863120669935716, + "grad_norm": 3.7052320429842736, + "learning_rate": 4.9919087648733634e-06, + "loss": 1.0429, + "step": 341 + }, + { + "epoch": 0.035968291111783034, + "grad_norm": 5.043659801808216, + "learning_rate": 4.991841515005253e-06, + "loss": 1.0648, + "step": 342 + }, + { + "epoch": 0.03607346155363035, + "grad_norm": 4.146978032877157, + "learning_rate": 4.991773987277455e-06, + "loss": 1.1074, + "step": 343 + }, + { + "epoch": 0.03617863199547767, + "grad_norm": 6.009958045778625, + "learning_rate": 4.991706181697501e-06, + "loss": 1.0836, + "step": 344 + }, + { + "epoch": 0.03628380243732499, + "grad_norm": 4.730758307272077, + "learning_rate": 4.991638098272951e-06, + "loss": 1.0436, + "step": 345 + }, + { + "epoch": 0.03638897287917231, + "grad_norm": 4.19923954684141, + "learning_rate": 4.991569737011398e-06, + "loss": 1.0276, + "step": 346 + }, + { + "epoch": 0.03649414332101963, + "grad_norm": 3.5491173146821247, + "learning_rate": 4.991501097920464e-06, + "loss": 1.0366, + "step": 347 + }, + { + "epoch": 0.036599313762866945, + "grad_norm": 3.4759309555067213, + "learning_rate": 4.991432181007803e-06, + "loss": 1.0173, + "step": 348 + }, + { + "epoch": 0.03670448420471426, + "grad_norm": 3.817356633667997, + "learning_rate": 4.9913629862811e-06, + "loss": 1.0628, + "step": 349 + }, + { + "epoch": 0.036809654646561586, + "grad_norm": 3.537846598304811, + "learning_rate": 4.99129351374807e-06, + "loss": 1.057, + "step": 350 + }, + { + "epoch": 0.036914825088408904, + "grad_norm": 5.421441758466292, + "learning_rate": 4.99122376341646e-06, + "loss": 1.0986, + "step": 351 + }, + { + "epoch": 0.03701999553025622, + "grad_norm": 4.12752414216299, + "learning_rate": 4.9911537352940485e-06, + "loss": 1.0398, + "step": 352 + }, + { + "epoch": 0.03712516597210354, + "grad_norm": 4.437838613524334, + "learning_rate": 4.991083429388643e-06, + "loss": 1.0359, + "step": 353 + }, + { + "epoch": 0.03723033641395086, + "grad_norm": 3.22408613692183, + "learning_rate": 4.991012845708084e-06, + "loss": 1.0653, + "step": 354 + }, + { + "epoch": 0.03733550685579818, + "grad_norm": 3.3399036692465884, + "learning_rate": 4.990941984260241e-06, + "loss": 1.0765, + "step": 355 + }, + { + "epoch": 0.0374406772976455, + "grad_norm": 3.936416890688034, + "learning_rate": 4.9908708450530174e-06, + "loss": 1.051, + "step": 356 + }, + { + "epoch": 0.037545847739492814, + 
"grad_norm": 3.990025349268498, + "learning_rate": 4.990799428094345e-06, + "loss": 1.0507, + "step": 357 + }, + { + "epoch": 0.03765101818134013, + "grad_norm": 4.239682909525811, + "learning_rate": 4.9907277333921865e-06, + "loss": 1.0568, + "step": 358 + }, + { + "epoch": 0.037756188623187456, + "grad_norm": 3.7354470248901133, + "learning_rate": 4.9906557609545375e-06, + "loss": 1.0847, + "step": 359 + }, + { + "epoch": 0.03786135906503477, + "grad_norm": 5.431410350419551, + "learning_rate": 4.990583510789423e-06, + "loss": 1.088, + "step": 360 + }, + { + "epoch": 0.03796652950688209, + "grad_norm": 4.049632017073229, + "learning_rate": 4.990510982904899e-06, + "loss": 1.0853, + "step": 361 + }, + { + "epoch": 0.03807169994872941, + "grad_norm": 3.8736875944016775, + "learning_rate": 4.990438177309054e-06, + "loss": 1.0628, + "step": 362 + }, + { + "epoch": 0.038176870390576725, + "grad_norm": 2.99774541371039, + "learning_rate": 4.9903650940100055e-06, + "loss": 1.0509, + "step": 363 + }, + { + "epoch": 0.03828204083242405, + "grad_norm": 5.319456457308596, + "learning_rate": 4.990291733015904e-06, + "loss": 1.037, + "step": 364 + }, + { + "epoch": 0.03838721127427137, + "grad_norm": 4.702856139460145, + "learning_rate": 4.9902180943349286e-06, + "loss": 1.0805, + "step": 365 + }, + { + "epoch": 0.038492381716118684, + "grad_norm": 3.8095265476696665, + "learning_rate": 4.99014417797529e-06, + "loss": 1.0669, + "step": 366 + }, + { + "epoch": 0.038597552157966, + "grad_norm": 4.961192575518297, + "learning_rate": 4.990069983945233e-06, + "loss": 1.0844, + "step": 367 + }, + { + "epoch": 0.038702722599813326, + "grad_norm": 4.891981596003543, + "learning_rate": 4.989995512253028e-06, + "loss": 1.0502, + "step": 368 + }, + { + "epoch": 0.03880789304166064, + "grad_norm": 3.6566189119364667, + "learning_rate": 4.989920762906981e-06, + "loss": 1.06, + "step": 369 + }, + { + "epoch": 0.03891306348350796, + "grad_norm": 5.900942628114732, + "learning_rate": 4.989845735915426e-06, + "loss": 1.068, + "step": 370 + }, + { + "epoch": 0.03901823392535528, + "grad_norm": 3.8108566487033477, + "learning_rate": 4.98977043128673e-06, + "loss": 1.0531, + "step": 371 + }, + { + "epoch": 0.039123404367202595, + "grad_norm": 3.9068966220872006, + "learning_rate": 4.9896948490292905e-06, + "loss": 1.064, + "step": 372 + }, + { + "epoch": 0.03922857480904992, + "grad_norm": 4.376179595496623, + "learning_rate": 4.989618989151533e-06, + "loss": 1.0025, + "step": 373 + }, + { + "epoch": 0.03933374525089724, + "grad_norm": 3.6380450371166155, + "learning_rate": 4.989542851661919e-06, + "loss": 1.0173, + "step": 374 + }, + { + "epoch": 0.039438915692744554, + "grad_norm": 4.554841578105061, + "learning_rate": 4.9894664365689385e-06, + "loss": 1.0903, + "step": 375 + }, + { + "epoch": 0.03954408613459187, + "grad_norm": 2.853296420735663, + "learning_rate": 4.9893897438811106e-06, + "loss": 1.0246, + "step": 376 + }, + { + "epoch": 0.03964925657643919, + "grad_norm": 4.257333189026485, + "learning_rate": 4.989312773606988e-06, + "loss": 1.097, + "step": 377 + }, + { + "epoch": 0.03975442701828651, + "grad_norm": 3.678981492068401, + "learning_rate": 4.989235525755154e-06, + "loss": 1.0611, + "step": 378 + }, + { + "epoch": 0.03985959746013383, + "grad_norm": 5.195313256569016, + "learning_rate": 4.989158000334221e-06, + "loss": 1.0454, + "step": 379 + }, + { + "epoch": 0.03996476790198115, + "grad_norm": 1.8104051625470303, + "learning_rate": 4.989080197352834e-06, + "loss": 1.0422, + "step": 380 + }, + { + 
"epoch": 0.040069938343828465, + "grad_norm": 3.538275104798571, + "learning_rate": 4.989002116819671e-06, + "loss": 1.0597, + "step": 381 + }, + { + "epoch": 0.04017510878567578, + "grad_norm": 2.8012444815542716, + "learning_rate": 4.9889237587434355e-06, + "loss": 1.0183, + "step": 382 + }, + { + "epoch": 0.04028027922752311, + "grad_norm": 3.367459724491466, + "learning_rate": 4.988845123132867e-06, + "loss": 1.0388, + "step": 383 + }, + { + "epoch": 0.040385449669370424, + "grad_norm": 4.145106659583553, + "learning_rate": 4.988766209996733e-06, + "loss": 1.0344, + "step": 384 + }, + { + "epoch": 0.04049062011121774, + "grad_norm": 3.40206472561461, + "learning_rate": 4.9886870193438325e-06, + "loss": 1.052, + "step": 385 + }, + { + "epoch": 0.04059579055306506, + "grad_norm": 3.725684215375892, + "learning_rate": 4.988607551182997e-06, + "loss": 1.0809, + "step": 386 + }, + { + "epoch": 0.04070096099491238, + "grad_norm": 3.731707512512842, + "learning_rate": 4.9885278055230875e-06, + "loss": 1.0266, + "step": 387 + }, + { + "epoch": 0.0408061314367597, + "grad_norm": 3.928396974689701, + "learning_rate": 4.988447782372996e-06, + "loss": 1.063, + "step": 388 + }, + { + "epoch": 0.04091130187860702, + "grad_norm": 4.31218060090711, + "learning_rate": 4.9883674817416464e-06, + "loss": 1.0864, + "step": 389 + }, + { + "epoch": 0.041016472320454335, + "grad_norm": 3.0718531395576583, + "learning_rate": 4.988286903637991e-06, + "loss": 1.0459, + "step": 390 + }, + { + "epoch": 0.04112164276230165, + "grad_norm": 2.9157630700754886, + "learning_rate": 4.988206048071017e-06, + "loss": 1.0581, + "step": 391 + }, + { + "epoch": 0.04122681320414898, + "grad_norm": 3.8247937216792, + "learning_rate": 4.988124915049739e-06, + "loss": 1.0286, + "step": 392 + }, + { + "epoch": 0.041331983645996294, + "grad_norm": 2.6874655896655097, + "learning_rate": 4.988043504583206e-06, + "loss": 1.0387, + "step": 393 + }, + { + "epoch": 0.04143715408784361, + "grad_norm": 2.9913671043398677, + "learning_rate": 4.987961816680493e-06, + "loss": 1.0536, + "step": 394 + }, + { + "epoch": 0.04154232452969093, + "grad_norm": 2.8088862124300404, + "learning_rate": 4.987879851350711e-06, + "loss": 1.078, + "step": 395 + }, + { + "epoch": 0.041647494971538246, + "grad_norm": 3.755585245451105, + "learning_rate": 4.987797608602997e-06, + "loss": 1.0792, + "step": 396 + }, + { + "epoch": 0.04175266541338557, + "grad_norm": 2.8770765754045846, + "learning_rate": 4.987715088446526e-06, + "loss": 1.0757, + "step": 397 + }, + { + "epoch": 0.04185783585523289, + "grad_norm": 3.273137502812656, + "learning_rate": 4.987632290890497e-06, + "loss": 1.0471, + "step": 398 + }, + { + "epoch": 0.041963006297080205, + "grad_norm": 3.636240730716032, + "learning_rate": 4.987549215944143e-06, + "loss": 1.0532, + "step": 399 + }, + { + "epoch": 0.04206817673892752, + "grad_norm": 4.208273895144222, + "learning_rate": 4.987465863616727e-06, + "loss": 1.0681, + "step": 400 + }, + { + "epoch": 0.042173347180774846, + "grad_norm": 3.5106533475647748, + "learning_rate": 4.987382233917545e-06, + "loss": 1.0538, + "step": 401 + }, + { + "epoch": 0.042278517622622164, + "grad_norm": 4.886913187088557, + "learning_rate": 4.98729832685592e-06, + "loss": 1.0417, + "step": 402 + }, + { + "epoch": 0.04238368806446948, + "grad_norm": 5.560665522659327, + "learning_rate": 4.98721414244121e-06, + "loss": 1.0699, + "step": 403 + }, + { + "epoch": 0.0424888585063168, + "grad_norm": 4.094323716122683, + "learning_rate": 4.987129680682802e-06, + "loss": 
1.0764, + "step": 404 + }, + { + "epoch": 0.042594028948164116, + "grad_norm": 2.6629248986818075, + "learning_rate": 4.987044941590114e-06, + "loss": 1.0667, + "step": 405 + }, + { + "epoch": 0.04269919939001144, + "grad_norm": 3.4109848570390193, + "learning_rate": 4.986959925172594e-06, + "loss": 1.037, + "step": 406 + }, + { + "epoch": 0.04280436983185876, + "grad_norm": 3.0342413572937317, + "learning_rate": 4.986874631439724e-06, + "loss": 1.0709, + "step": 407 + }, + { + "epoch": 0.042909540273706075, + "grad_norm": 2.8390095815269465, + "learning_rate": 4.9867890604010125e-06, + "loss": 1.0196, + "step": 408 + }, + { + "epoch": 0.04301471071555339, + "grad_norm": 3.0707226746539464, + "learning_rate": 4.9867032120660045e-06, + "loss": 1.0393, + "step": 409 + }, + { + "epoch": 0.04311988115740071, + "grad_norm": 3.8038872687449765, + "learning_rate": 4.98661708644427e-06, + "loss": 1.032, + "step": 410 + }, + { + "epoch": 0.043225051599248034, + "grad_norm": 3.6068516146454193, + "learning_rate": 4.986530683545414e-06, + "loss": 1.0328, + "step": 411 + }, + { + "epoch": 0.04333022204109535, + "grad_norm": 3.8245339487596546, + "learning_rate": 4.98644400337907e-06, + "loss": 1.103, + "step": 412 + }, + { + "epoch": 0.04343539248294267, + "grad_norm": 3.4368283671133746, + "learning_rate": 4.986357045954905e-06, + "loss": 1.0507, + "step": 413 + }, + { + "epoch": 0.043540562924789986, + "grad_norm": 2.74188819425057, + "learning_rate": 4.986269811282615e-06, + "loss": 1.0316, + "step": 414 + }, + { + "epoch": 0.04364573336663731, + "grad_norm": 3.679825151599156, + "learning_rate": 4.9861822993719255e-06, + "loss": 1.0097, + "step": 415 + }, + { + "epoch": 0.04375090380848463, + "grad_norm": 3.297855659802533, + "learning_rate": 4.986094510232597e-06, + "loss": 1.0219, + "step": 416 + }, + { + "epoch": 0.043856074250331945, + "grad_norm": 3.4058245947376173, + "learning_rate": 4.986006443874419e-06, + "loss": 1.0377, + "step": 417 + }, + { + "epoch": 0.04396124469217926, + "grad_norm": 4.363555255178052, + "learning_rate": 4.9859181003072095e-06, + "loss": 1.0637, + "step": 418 + }, + { + "epoch": 0.04406641513402658, + "grad_norm": 3.9928133996636395, + "learning_rate": 4.985829479540821e-06, + "loss": 1.0591, + "step": 419 + }, + { + "epoch": 0.0441715855758739, + "grad_norm": 4.073405278783087, + "learning_rate": 4.985740581585135e-06, + "loss": 1.059, + "step": 420 + }, + { + "epoch": 0.04427675601772122, + "grad_norm": 4.084709791211826, + "learning_rate": 4.985651406450063e-06, + "loss": 1.0313, + "step": 421 + }, + { + "epoch": 0.04438192645956854, + "grad_norm": 3.715460139932838, + "learning_rate": 4.985561954145552e-06, + "loss": 1.0913, + "step": 422 + }, + { + "epoch": 0.044487096901415855, + "grad_norm": 3.7233678758471744, + "learning_rate": 4.985472224681573e-06, + "loss": 1.0577, + "step": 423 + }, + { + "epoch": 0.04459226734326317, + "grad_norm": 5.652763390578416, + "learning_rate": 4.985382218068133e-06, + "loss": 1.0382, + "step": 424 + }, + { + "epoch": 0.0446974377851105, + "grad_norm": 3.468271207036182, + "learning_rate": 4.985291934315269e-06, + "loss": 1.0498, + "step": 425 + }, + { + "epoch": 0.044802608226957814, + "grad_norm": 3.1935939850144233, + "learning_rate": 4.985201373433048e-06, + "loss": 1.0395, + "step": 426 + }, + { + "epoch": 0.04490777866880513, + "grad_norm": 3.714801883999893, + "learning_rate": 4.985110535431569e-06, + "loss": 1.063, + "step": 427 + }, + { + "epoch": 0.04501294911065245, + "grad_norm": 3.441136685461427, + 
"learning_rate": 4.985019420320959e-06, + "loss": 1.0503, + "step": 428 + }, + { + "epoch": 0.04511811955249977, + "grad_norm": 2.8419115986373042, + "learning_rate": 4.98492802811138e-06, + "loss": 1.0479, + "step": 429 + }, + { + "epoch": 0.04522328999434709, + "grad_norm": 3.039904416144775, + "learning_rate": 4.984836358813022e-06, + "loss": 1.0469, + "step": 430 + }, + { + "epoch": 0.04532846043619441, + "grad_norm": 4.691661020519824, + "learning_rate": 4.984744412436107e-06, + "loss": 1.0572, + "step": 431 + }, + { + "epoch": 0.045433630878041725, + "grad_norm": 4.540987997363801, + "learning_rate": 4.9846521889908885e-06, + "loss": 1.0781, + "step": 432 + }, + { + "epoch": 0.04553880131988904, + "grad_norm": 4.296517950327922, + "learning_rate": 4.984559688487649e-06, + "loss": 1.0635, + "step": 433 + }, + { + "epoch": 0.04564397176173637, + "grad_norm": 3.2572651182872585, + "learning_rate": 4.984466910936703e-06, + "loss": 1.0681, + "step": 434 + }, + { + "epoch": 0.045749142203583684, + "grad_norm": 2.9557462605367415, + "learning_rate": 4.9843738563483975e-06, + "loss": 1.0567, + "step": 435 + }, + { + "epoch": 0.045854312645431, + "grad_norm": 3.9469604897643022, + "learning_rate": 4.984280524733107e-06, + "loss": 1.0509, + "step": 436 + }, + { + "epoch": 0.04595948308727832, + "grad_norm": 2.7541457544733188, + "learning_rate": 4.984186916101239e-06, + "loss": 1.0414, + "step": 437 + }, + { + "epoch": 0.046064653529125636, + "grad_norm": 3.622690550239156, + "learning_rate": 4.984093030463233e-06, + "loss": 1.0864, + "step": 438 + }, + { + "epoch": 0.04616982397097296, + "grad_norm": 4.32000715987469, + "learning_rate": 4.983998867829557e-06, + "loss": 1.0398, + "step": 439 + }, + { + "epoch": 0.04627499441282028, + "grad_norm": 3.7777411421192792, + "learning_rate": 4.98390442821071e-06, + "loss": 1.0565, + "step": 440 + }, + { + "epoch": 0.046380164854667595, + "grad_norm": 3.47497239447561, + "learning_rate": 4.983809711617224e-06, + "loss": 1.0516, + "step": 441 + }, + { + "epoch": 0.04648533529651491, + "grad_norm": 3.8957477430994687, + "learning_rate": 4.98371471805966e-06, + "loss": 1.0513, + "step": 442 + }, + { + "epoch": 0.04659050573836224, + "grad_norm": 2.4570852022191603, + "learning_rate": 4.983619447548611e-06, + "loss": 1.0278, + "step": 443 + }, + { + "epoch": 0.046695676180209554, + "grad_norm": 4.079822189897958, + "learning_rate": 4.9835239000947005e-06, + "loss": 1.0827, + "step": 444 + }, + { + "epoch": 0.04680084662205687, + "grad_norm": 3.547742042978678, + "learning_rate": 4.983428075708582e-06, + "loss": 1.1007, + "step": 445 + }, + { + "epoch": 0.04690601706390419, + "grad_norm": 4.0885742408881125, + "learning_rate": 4.983331974400941e-06, + "loss": 1.0316, + "step": 446 + }, + { + "epoch": 0.047011187505751506, + "grad_norm": 3.103253885780263, + "learning_rate": 4.983235596182494e-06, + "loss": 1.0792, + "step": 447 + }, + { + "epoch": 0.04711635794759883, + "grad_norm": 2.6930299837503164, + "learning_rate": 4.983138941063988e-06, + "loss": 1.0769, + "step": 448 + }, + { + "epoch": 0.04722152838944615, + "grad_norm": 2.828513719063435, + "learning_rate": 4.9830420090562e-06, + "loss": 1.0662, + "step": 449 + }, + { + "epoch": 0.047326698831293465, + "grad_norm": 3.6605584013370245, + "learning_rate": 4.982944800169939e-06, + "loss": 1.0513, + "step": 450 + }, + { + "epoch": 0.04743186927314078, + "grad_norm": 2.902862443517808, + "learning_rate": 4.982847314416044e-06, + "loss": 1.0157, + "step": 451 + }, + { + "epoch": 0.0475370397149881, + 
"grad_norm": 2.8680844974252686, + "learning_rate": 4.982749551805387e-06, + "loss": 1.0493, + "step": 452 + }, + { + "epoch": 0.047642210156835424, + "grad_norm": 2.8491457922180197, + "learning_rate": 4.9826515123488675e-06, + "loss": 1.0603, + "step": 453 + }, + { + "epoch": 0.04774738059868274, + "grad_norm": 3.0486623847828334, + "learning_rate": 4.982553196057419e-06, + "loss": 1.0604, + "step": 454 + }, + { + "epoch": 0.04785255104053006, + "grad_norm": 2.2871258606215874, + "learning_rate": 4.982454602942004e-06, + "loss": 1.0367, + "step": 455 + }, + { + "epoch": 0.047957721482377376, + "grad_norm": 3.822855535156918, + "learning_rate": 4.982355733013616e-06, + "loss": 1.0289, + "step": 456 + }, + { + "epoch": 0.04806289192422469, + "grad_norm": 3.4057137793165486, + "learning_rate": 4.98225658628328e-06, + "loss": 1.0331, + "step": 457 + }, + { + "epoch": 0.04816806236607202, + "grad_norm": 4.014699719424169, + "learning_rate": 4.982157162762052e-06, + "loss": 1.0255, + "step": 458 + }, + { + "epoch": 0.048273232807919335, + "grad_norm": 4.028054004330074, + "learning_rate": 4.982057462461018e-06, + "loss": 1.0444, + "step": 459 + }, + { + "epoch": 0.04837840324976665, + "grad_norm": 5.391630404479916, + "learning_rate": 4.981957485391297e-06, + "loss": 1.0536, + "step": 460 + }, + { + "epoch": 0.04848357369161397, + "grad_norm": 2.3994992334050105, + "learning_rate": 4.981857231564034e-06, + "loss": 1.0103, + "step": 461 + }, + { + "epoch": 0.048588744133461294, + "grad_norm": 3.798097639454563, + "learning_rate": 4.9817567009904114e-06, + "loss": 1.0238, + "step": 462 + }, + { + "epoch": 0.04869391457530861, + "grad_norm": 4.714715898953462, + "learning_rate": 4.981655893681637e-06, + "loss": 1.0237, + "step": 463 + }, + { + "epoch": 0.04879908501715593, + "grad_norm": 4.388735872403405, + "learning_rate": 4.981554809648952e-06, + "loss": 1.076, + "step": 464 + }, + { + "epoch": 0.048904255459003246, + "grad_norm": 4.403273076780409, + "learning_rate": 4.981453448903629e-06, + "loss": 1.0737, + "step": 465 + }, + { + "epoch": 0.04900942590085056, + "grad_norm": 2.5521629736975404, + "learning_rate": 4.98135181145697e-06, + "loss": 1.0514, + "step": 466 + }, + { + "epoch": 0.04911459634269789, + "grad_norm": 3.8606552004883534, + "learning_rate": 4.981249897320307e-06, + "loss": 1.0413, + "step": 467 + }, + { + "epoch": 0.049219766784545205, + "grad_norm": 2.8692792806121536, + "learning_rate": 4.981147706505007e-06, + "loss": 1.0463, + "step": 468 + }, + { + "epoch": 0.04932493722639252, + "grad_norm": 2.7409626254110315, + "learning_rate": 4.981045239022462e-06, + "loss": 1.0609, + "step": 469 + }, + { + "epoch": 0.04943010766823984, + "grad_norm": 2.9586823141347707, + "learning_rate": 4.9809424948841e-06, + "loss": 1.0576, + "step": 470 + }, + { + "epoch": 0.04953527811008716, + "grad_norm": 2.6686230862505886, + "learning_rate": 4.980839474101378e-06, + "loss": 1.0157, + "step": 471 + }, + { + "epoch": 0.04964044855193448, + "grad_norm": 3.139097480801446, + "learning_rate": 4.980736176685782e-06, + "loss": 1.0454, + "step": 472 + }, + { + "epoch": 0.0497456189937818, + "grad_norm": 3.100054655774797, + "learning_rate": 4.980632602648831e-06, + "loss": 1.0296, + "step": 473 + }, + { + "epoch": 0.049850789435629116, + "grad_norm": 4.665650390709168, + "learning_rate": 4.980528752002075e-06, + "loss": 1.0421, + "step": 474 + }, + { + "epoch": 0.04995595987747643, + "grad_norm": 3.2001730382677067, + "learning_rate": 4.9804246247570935e-06, + "loss": 1.0599, + "step": 475 + }, 
+ { + "epoch": 0.05006113031932376, + "grad_norm": 3.5026752591463493, + "learning_rate": 4.980320220925498e-06, + "loss": 1.0683, + "step": 476 + }, + { + "epoch": 0.050166300761171075, + "grad_norm": 4.538529420292572, + "learning_rate": 4.98021554051893e-06, + "loss": 1.0342, + "step": 477 + }, + { + "epoch": 0.05027147120301839, + "grad_norm": 2.9057622514006374, + "learning_rate": 4.980110583549062e-06, + "loss": 1.0419, + "step": 478 + }, + { + "epoch": 0.05037664164486571, + "grad_norm": 4.168498583654285, + "learning_rate": 4.980005350027598e-06, + "loss": 1.0235, + "step": 479 + }, + { + "epoch": 0.05048181208671303, + "grad_norm": 3.465721454429658, + "learning_rate": 4.979899839966273e-06, + "loss": 1.0825, + "step": 480 + }, + { + "epoch": 0.05058698252856035, + "grad_norm": 3.9874964500146994, + "learning_rate": 4.97979405337685e-06, + "loss": 1.0683, + "step": 481 + }, + { + "epoch": 0.05069215297040767, + "grad_norm": 3.256605985477986, + "learning_rate": 4.979687990271126e-06, + "loss": 1.0309, + "step": 482 + }, + { + "epoch": 0.050797323412254985, + "grad_norm": 2.822984342445266, + "learning_rate": 4.979581650660929e-06, + "loss": 1.0334, + "step": 483 + }, + { + "epoch": 0.0509024938541023, + "grad_norm": 4.047140521535769, + "learning_rate": 4.979475034558115e-06, + "loss": 1.0336, + "step": 484 + }, + { + "epoch": 0.05100766429594962, + "grad_norm": 3.121659309179012, + "learning_rate": 4.979368141974575e-06, + "loss": 1.0975, + "step": 485 + }, + { + "epoch": 0.051112834737796944, + "grad_norm": 4.436709845774703, + "learning_rate": 4.979260972922226e-06, + "loss": 1.0085, + "step": 486 + }, + { + "epoch": 0.05121800517964426, + "grad_norm": 5.131430337585738, + "learning_rate": 4.9791535274130185e-06, + "loss": 1.0919, + "step": 487 + }, + { + "epoch": 0.05132317562149158, + "grad_norm": 4.09555529702269, + "learning_rate": 4.9790458054589344e-06, + "loss": 1.0877, + "step": 488 + }, + { + "epoch": 0.051428346063338896, + "grad_norm": 3.287984548144607, + "learning_rate": 4.9789378070719854e-06, + "loss": 1.0707, + "step": 489 + }, + { + "epoch": 0.05153351650518622, + "grad_norm": 3.3946741219834977, + "learning_rate": 4.978829532264213e-06, + "loss": 1.055, + "step": 490 + }, + { + "epoch": 0.05163868694703354, + "grad_norm": 3.058713885707903, + "learning_rate": 4.978720981047692e-06, + "loss": 1.0804, + "step": 491 + }, + { + "epoch": 0.051743857388880855, + "grad_norm": 3.6709984313290933, + "learning_rate": 4.978612153434527e-06, + "loss": 1.0564, + "step": 492 + }, + { + "epoch": 0.05184902783072817, + "grad_norm": 5.006510707239013, + "learning_rate": 4.9785030494368515e-06, + "loss": 1.0852, + "step": 493 + }, + { + "epoch": 0.05195419827257549, + "grad_norm": 3.653015612885224, + "learning_rate": 4.978393669066832e-06, + "loss": 1.0751, + "step": 494 + }, + { + "epoch": 0.052059368714422814, + "grad_norm": 3.339203412325409, + "learning_rate": 4.978284012336666e-06, + "loss": 1.0509, + "step": 495 + }, + { + "epoch": 0.05216453915627013, + "grad_norm": 4.106088996601506, + "learning_rate": 4.978174079258581e-06, + "loss": 1.0331, + "step": 496 + }, + { + "epoch": 0.05226970959811745, + "grad_norm": 4.537839424849167, + "learning_rate": 4.978063869844834e-06, + "loss": 1.0399, + "step": 497 + }, + { + "epoch": 0.052374880039964766, + "grad_norm": 3.671123861891514, + "learning_rate": 4.977953384107716e-06, + "loss": 1.0404, + "step": 498 + }, + { + "epoch": 0.052480050481812084, + "grad_norm": 3.835556456175973, + "learning_rate": 4.977842622059546e-06, + 
"loss": 1.042, + "step": 499 + }, + { + "epoch": 0.05258522092365941, + "grad_norm": 3.810143572024863, + "learning_rate": 4.977731583712675e-06, + "loss": 1.0619, + "step": 500 + }, + { + "epoch": 0.052690391365506725, + "grad_norm": 2.977023375505081, + "learning_rate": 4.977620269079485e-06, + "loss": 1.0557, + "step": 501 + }, + { + "epoch": 0.05279556180735404, + "grad_norm": 2.785680113793684, + "learning_rate": 4.977508678172388e-06, + "loss": 1.0228, + "step": 502 + }, + { + "epoch": 0.05290073224920136, + "grad_norm": 3.1735915547497573, + "learning_rate": 4.977396811003828e-06, + "loss": 1.0574, + "step": 503 + }, + { + "epoch": 0.053005902691048684, + "grad_norm": 2.4497703751127227, + "learning_rate": 4.977284667586278e-06, + "loss": 1.0517, + "step": 504 + }, + { + "epoch": 0.053111073132896, + "grad_norm": 5.411811630465418, + "learning_rate": 4.977172247932243e-06, + "loss": 1.087, + "step": 505 + }, + { + "epoch": 0.05321624357474332, + "grad_norm": 3.763750928236047, + "learning_rate": 4.97705955205426e-06, + "loss": 1.0788, + "step": 506 + }, + { + "epoch": 0.053321414016590636, + "grad_norm": 3.322087579873048, + "learning_rate": 4.976946579964893e-06, + "loss": 1.0785, + "step": 507 + }, + { + "epoch": 0.05342658445843795, + "grad_norm": 4.615164294462559, + "learning_rate": 4.976833331676742e-06, + "loss": 1.0071, + "step": 508 + }, + { + "epoch": 0.05353175490028528, + "grad_norm": 4.4250976293850295, + "learning_rate": 4.976719807202434e-06, + "loss": 1.0403, + "step": 509 + }, + { + "epoch": 0.053636925342132595, + "grad_norm": 3.8283999846040877, + "learning_rate": 4.976606006554626e-06, + "loss": 1.0061, + "step": 510 + }, + { + "epoch": 0.05374209578397991, + "grad_norm": 3.441735254441529, + "learning_rate": 4.976491929746011e-06, + "loss": 1.0699, + "step": 511 + }, + { + "epoch": 0.05384726622582723, + "grad_norm": 3.3474483003361835, + "learning_rate": 4.976377576789307e-06, + "loss": 1.0513, + "step": 512 + }, + { + "epoch": 0.05395243666767455, + "grad_norm": 3.2087836619502603, + "learning_rate": 4.976262947697265e-06, + "loss": 1.015, + "step": 513 + }, + { + "epoch": 0.05405760710952187, + "grad_norm": 4.79038201957431, + "learning_rate": 4.97614804248267e-06, + "loss": 1.0619, + "step": 514 + }, + { + "epoch": 0.05416277755136919, + "grad_norm": 5.0334628294143595, + "learning_rate": 4.976032861158332e-06, + "loss": 1.0501, + "step": 515 + }, + { + "epoch": 0.054267947993216506, + "grad_norm": 5.716661298960797, + "learning_rate": 4.9759174037370955e-06, + "loss": 1.051, + "step": 516 + }, + { + "epoch": 0.05437311843506382, + "grad_norm": 3.219972550593159, + "learning_rate": 4.975801670231835e-06, + "loss": 1.0549, + "step": 517 + }, + { + "epoch": 0.05447828887691115, + "grad_norm": 4.042629628393561, + "learning_rate": 4.975685660655456e-06, + "loss": 1.0104, + "step": 518 + }, + { + "epoch": 0.054583459318758465, + "grad_norm": 4.588832225167543, + "learning_rate": 4.975569375020894e-06, + "loss": 1.0681, + "step": 519 + }, + { + "epoch": 0.05468862976060578, + "grad_norm": 3.6574107244998344, + "learning_rate": 4.9754528133411144e-06, + "loss": 1.0306, + "step": 520 + }, + { + "epoch": 0.0547938002024531, + "grad_norm": 3.064757154713279, + "learning_rate": 4.975335975629117e-06, + "loss": 0.9752, + "step": 521 + }, + { + "epoch": 0.05489897064430042, + "grad_norm": 3.7277635260897997, + "learning_rate": 4.975218861897929e-06, + "loss": 1.0457, + "step": 522 + }, + { + "epoch": 0.05500414108614774, + "grad_norm": 3.2935157357520124, + 
"learning_rate": 4.9751014721606104e-06, + "loss": 1.0834, + "step": 523 + }, + { + "epoch": 0.05510931152799506, + "grad_norm": 2.341096648916625, + "learning_rate": 4.97498380643025e-06, + "loss": 1.0159, + "step": 524 + }, + { + "epoch": 0.055214481969842376, + "grad_norm": 2.9750956985730785, + "learning_rate": 4.974865864719969e-06, + "loss": 1.0646, + "step": 525 + }, + { + "epoch": 0.05531965241168969, + "grad_norm": 2.712385516803827, + "learning_rate": 4.974747647042918e-06, + "loss": 1.0122, + "step": 526 + }, + { + "epoch": 0.05542482285353701, + "grad_norm": 3.225400308884659, + "learning_rate": 4.9746291534122805e-06, + "loss": 1.0523, + "step": 527 + }, + { + "epoch": 0.055529993295384335, + "grad_norm": 4.290571209392225, + "learning_rate": 4.974510383841269e-06, + "loss": 1.0738, + "step": 528 + }, + { + "epoch": 0.05563516373723165, + "grad_norm": 4.014053600914674, + "learning_rate": 4.974391338343126e-06, + "loss": 1.0592, + "step": 529 + }, + { + "epoch": 0.05574033417907897, + "grad_norm": 3.3268020956167166, + "learning_rate": 4.974272016931127e-06, + "loss": 1.043, + "step": 530 + }, + { + "epoch": 0.05584550462092629, + "grad_norm": 4.333148338760938, + "learning_rate": 4.974152419618579e-06, + "loss": 1.0473, + "step": 531 + }, + { + "epoch": 0.055950675062773604, + "grad_norm": 2.8778208791693616, + "learning_rate": 4.974032546418816e-06, + "loss": 1.0629, + "step": 532 + }, + { + "epoch": 0.05605584550462093, + "grad_norm": 3.2582876551113142, + "learning_rate": 4.973912397345204e-06, + "loss": 1.0486, + "step": 533 + }, + { + "epoch": 0.056161015946468246, + "grad_norm": 3.287154664003592, + "learning_rate": 4.973791972411142e-06, + "loss": 1.0428, + "step": 534 + }, + { + "epoch": 0.05626618638831556, + "grad_norm": 3.331118043139556, + "learning_rate": 4.973671271630057e-06, + "loss": 1.0282, + "step": 535 + }, + { + "epoch": 0.05637135683016288, + "grad_norm": 3.3363705578676464, + "learning_rate": 4.973550295015411e-06, + "loss": 1.0733, + "step": 536 + }, + { + "epoch": 0.056476527272010205, + "grad_norm": 3.191076118323749, + "learning_rate": 4.973429042580691e-06, + "loss": 1.0458, + "step": 537 + }, + { + "epoch": 0.05658169771385752, + "grad_norm": 3.2347419761171716, + "learning_rate": 4.973307514339418e-06, + "loss": 1.053, + "step": 538 + }, + { + "epoch": 0.05668686815570484, + "grad_norm": 3.4907812740771575, + "learning_rate": 4.973185710305145e-06, + "loss": 1.0548, + "step": 539 + }, + { + "epoch": 0.05679203859755216, + "grad_norm": 4.184646901965426, + "learning_rate": 4.973063630491451e-06, + "loss": 1.0761, + "step": 540 + }, + { + "epoch": 0.056897209039399474, + "grad_norm": 3.0281792628868645, + "learning_rate": 4.972941274911953e-06, + "loss": 1.0389, + "step": 541 + }, + { + "epoch": 0.0570023794812468, + "grad_norm": 3.6955546648249378, + "learning_rate": 4.972818643580291e-06, + "loss": 1.0145, + "step": 542 + }, + { + "epoch": 0.057107549923094116, + "grad_norm": 3.1190860521523813, + "learning_rate": 4.972695736510141e-06, + "loss": 0.9828, + "step": 543 + }, + { + "epoch": 0.05721272036494143, + "grad_norm": 3.100972513801203, + "learning_rate": 4.972572553715208e-06, + "loss": 1.035, + "step": 544 + }, + { + "epoch": 0.05731789080678875, + "grad_norm": 3.0257460963065297, + "learning_rate": 4.972449095209227e-06, + "loss": 1.0376, + "step": 545 + }, + { + "epoch": 0.05742306124863607, + "grad_norm": 3.932692512358207, + "learning_rate": 4.972325361005966e-06, + "loss": 1.0455, + "step": 546 + }, + { + "epoch": 0.05752823169048339, 
+ "grad_norm": 3.196511181526071, + "learning_rate": 4.9722013511192215e-06, + "loss": 0.9909, + "step": 547 + }, + { + "epoch": 0.05763340213233071, + "grad_norm": 3.467930305235877, + "learning_rate": 4.9720770655628216e-06, + "loss": 1.044, + "step": 548 + }, + { + "epoch": 0.057738572574178026, + "grad_norm": 4.5794753754186495, + "learning_rate": 4.971952504350625e-06, + "loss": 1.0603, + "step": 549 + }, + { + "epoch": 0.057843743016025344, + "grad_norm": 3.2755087160087073, + "learning_rate": 4.9718276674965215e-06, + "loss": 1.066, + "step": 550 + }, + { + "epoch": 0.05794891345787267, + "grad_norm": 3.9349343797276197, + "learning_rate": 4.971702555014431e-06, + "loss": 1.0681, + "step": 551 + }, + { + "epoch": 0.058054083899719985, + "grad_norm": 4.361018090944349, + "learning_rate": 4.971577166918305e-06, + "loss": 1.0697, + "step": 552 + }, + { + "epoch": 0.0581592543415673, + "grad_norm": 4.570084487039526, + "learning_rate": 4.971451503222125e-06, + "loss": 1.0249, + "step": 553 + }, + { + "epoch": 0.05826442478341462, + "grad_norm": 4.362303063650197, + "learning_rate": 4.971325563939903e-06, + "loss": 1.0465, + "step": 554 + }, + { + "epoch": 0.05836959522526194, + "grad_norm": 2.2989031239751285, + "learning_rate": 4.971199349085683e-06, + "loss": 1.0107, + "step": 555 + }, + { + "epoch": 0.05847476566710926, + "grad_norm": 3.672401717225256, + "learning_rate": 4.971072858673539e-06, + "loss": 1.0765, + "step": 556 + }, + { + "epoch": 0.05857993610895658, + "grad_norm": 3.633753822132858, + "learning_rate": 4.970946092717574e-06, + "loss": 1.0491, + "step": 557 + }, + { + "epoch": 0.058685106550803896, + "grad_norm": 4.381583646223104, + "learning_rate": 4.970819051231927e-06, + "loss": 1.0819, + "step": 558 + }, + { + "epoch": 0.058790276992651214, + "grad_norm": 3.3145358767742987, + "learning_rate": 4.970691734230759e-06, + "loss": 1.0243, + "step": 559 + }, + { + "epoch": 0.05889544743449853, + "grad_norm": 2.16306091430307, + "learning_rate": 4.970564141728271e-06, + "loss": 1.0251, + "step": 560 + }, + { + "epoch": 0.059000617876345855, + "grad_norm": 4.923343581263921, + "learning_rate": 4.9704362737386894e-06, + "loss": 1.0501, + "step": 561 + }, + { + "epoch": 0.05910578831819317, + "grad_norm": 3.808471042980745, + "learning_rate": 4.970308130276273e-06, + "loss": 1.0398, + "step": 562 + }, + { + "epoch": 0.05921095876004049, + "grad_norm": 3.13104997246359, + "learning_rate": 4.970179711355308e-06, + "loss": 1.0325, + "step": 563 + }, + { + "epoch": 0.05931612920188781, + "grad_norm": 3.0500679981758436, + "learning_rate": 4.970051016990118e-06, + "loss": 1.0579, + "step": 564 + }, + { + "epoch": 0.05942129964373513, + "grad_norm": 2.9497172428170204, + "learning_rate": 4.96992204719505e-06, + "loss": 1.049, + "step": 565 + }, + { + "epoch": 0.05952647008558245, + "grad_norm": 1.9764188767604873, + "learning_rate": 4.9697928019844885e-06, + "loss": 1.0358, + "step": 566 + }, + { + "epoch": 0.059631640527429766, + "grad_norm": 3.9746447469142185, + "learning_rate": 4.969663281372842e-06, + "loss": 1.0577, + "step": 567 + }, + { + "epoch": 0.059736810969277084, + "grad_norm": 3.788137117647662, + "learning_rate": 4.969533485374556e-06, + "loss": 0.9947, + "step": 568 + }, + { + "epoch": 0.0598419814111244, + "grad_norm": 4.405021349521944, + "learning_rate": 4.9694034140041015e-06, + "loss": 1.0581, + "step": 569 + }, + { + "epoch": 0.059947151852971725, + "grad_norm": 2.0847517500095334, + "learning_rate": 4.9692730672759835e-06, + "loss": 1.0142, + "step": 570 + 
}, + { + "epoch": 0.06005232229481904, + "grad_norm": 5.811807304246141, + "learning_rate": 4.969142445204736e-06, + "loss": 1.0579, + "step": 571 + }, + { + "epoch": 0.06015749273666636, + "grad_norm": 3.222186946748591, + "learning_rate": 4.969011547804925e-06, + "loss": 1.0306, + "step": 572 + }, + { + "epoch": 0.06026266317851368, + "grad_norm": 3.7831417030237837, + "learning_rate": 4.968880375091147e-06, + "loss": 1.0391, + "step": 573 + }, + { + "epoch": 0.060367833620360994, + "grad_norm": 3.9362867297970143, + "learning_rate": 4.968748927078028e-06, + "loss": 1.0684, + "step": 574 + }, + { + "epoch": 0.06047300406220832, + "grad_norm": 3.329344719091068, + "learning_rate": 4.968617203780226e-06, + "loss": 1.0679, + "step": 575 + }, + { + "epoch": 0.060578174504055636, + "grad_norm": 2.693926328348047, + "learning_rate": 4.968485205212429e-06, + "loss": 1.0686, + "step": 576 + }, + { + "epoch": 0.06068334494590295, + "grad_norm": 4.752717425247337, + "learning_rate": 4.968352931389355e-06, + "loss": 1.0477, + "step": 577 + }, + { + "epoch": 0.06078851538775027, + "grad_norm": 5.382651519152381, + "learning_rate": 4.968220382325755e-06, + "loss": 1.0522, + "step": 578 + }, + { + "epoch": 0.060893685829597595, + "grad_norm": 5.095229123548204, + "learning_rate": 4.968087558036408e-06, + "loss": 1.0342, + "step": 579 + }, + { + "epoch": 0.06099885627144491, + "grad_norm": 2.9622063334614506, + "learning_rate": 4.967954458536126e-06, + "loss": 1.0545, + "step": 580 + }, + { + "epoch": 0.06110402671329223, + "grad_norm": 4.010704877984255, + "learning_rate": 4.96782108383975e-06, + "loss": 1.0396, + "step": 581 + }, + { + "epoch": 0.06120919715513955, + "grad_norm": 3.679353200958754, + "learning_rate": 4.967687433962152e-06, + "loss": 1.0354, + "step": 582 + }, + { + "epoch": 0.061314367596986864, + "grad_norm": 4.1840739624189345, + "learning_rate": 4.967553508918236e-06, + "loss": 1.0032, + "step": 583 + }, + { + "epoch": 0.06141953803883419, + "grad_norm": 3.8257222018053394, + "learning_rate": 4.967419308722935e-06, + "loss": 1.0402, + "step": 584 + }, + { + "epoch": 0.061524708480681506, + "grad_norm": 4.453654621625787, + "learning_rate": 4.967284833391213e-06, + "loss": 1.0446, + "step": 585 + }, + { + "epoch": 0.06162987892252882, + "grad_norm": 3.37619320579649, + "learning_rate": 4.967150082938066e-06, + "loss": 1.0381, + "step": 586 + }, + { + "epoch": 0.06173504936437614, + "grad_norm": 4.183795371545729, + "learning_rate": 4.9670150573785195e-06, + "loss": 1.0677, + "step": 587 + }, + { + "epoch": 0.06184021980622346, + "grad_norm": 2.901597121091973, + "learning_rate": 4.96687975672763e-06, + "loss": 1.0499, + "step": 588 + }, + { + "epoch": 0.06194539024807078, + "grad_norm": 3.838808993150415, + "learning_rate": 4.966744181000483e-06, + "loss": 1.0541, + "step": 589 + }, + { + "epoch": 0.0620505606899181, + "grad_norm": 3.750578964588987, + "learning_rate": 4.966608330212198e-06, + "loss": 1.0377, + "step": 590 + }, + { + "epoch": 0.06215573113176542, + "grad_norm": 3.6029951214777403, + "learning_rate": 4.9664722043779226e-06, + "loss": 1.0386, + "step": 591 + }, + { + "epoch": 0.062260901573612734, + "grad_norm": 3.430327053415612, + "learning_rate": 4.966335803512837e-06, + "loss": 1.0532, + "step": 592 + }, + { + "epoch": 0.06236607201546006, + "grad_norm": 2.7139325949372775, + "learning_rate": 4.96619912763215e-06, + "loss": 1.0649, + "step": 593 + }, + { + "epoch": 0.062471242457307376, + "grad_norm": 3.1492912949831884, + "learning_rate": 4.966062176751101e-06, 
+ "loss": 1.0493, + "step": 594 + }, + { + "epoch": 0.0625764128991547, + "grad_norm": 4.125956595172075, + "learning_rate": 4.965924950884964e-06, + "loss": 1.0206, + "step": 595 + }, + { + "epoch": 0.06268158334100202, + "grad_norm": 3.7929128777641425, + "learning_rate": 4.965787450049038e-06, + "loss": 1.0299, + "step": 596 + }, + { + "epoch": 0.06278675378284933, + "grad_norm": 4.0810239450371215, + "learning_rate": 4.965649674258657e-06, + "loss": 1.1232, + "step": 597 + }, + { + "epoch": 0.06289192422469665, + "grad_norm": 4.6131544836052, + "learning_rate": 4.965511623529183e-06, + "loss": 1.0422, + "step": 598 + }, + { + "epoch": 0.06299709466654396, + "grad_norm": 4.798402879282435, + "learning_rate": 4.965373297876009e-06, + "loss": 1.0787, + "step": 599 + }, + { + "epoch": 0.06310226510839129, + "grad_norm": 4.335100341025516, + "learning_rate": 4.965234697314563e-06, + "loss": 0.9823, + "step": 600 + }, + { + "epoch": 0.06320743555023861, + "grad_norm": 3.842720236255419, + "learning_rate": 4.965095821860296e-06, + "loss": 1.0744, + "step": 601 + }, + { + "epoch": 0.06331260599208592, + "grad_norm": 5.2410971364749335, + "learning_rate": 4.964956671528696e-06, + "loss": 1.0372, + "step": 602 + }, + { + "epoch": 0.06341777643393325, + "grad_norm": 3.974109563837657, + "learning_rate": 4.964817246335278e-06, + "loss": 1.0889, + "step": 603 + }, + { + "epoch": 0.06352294687578056, + "grad_norm": 3.8457386538934326, + "learning_rate": 4.96467754629559e-06, + "loss": 1.0567, + "step": 604 + }, + { + "epoch": 0.06362811731762788, + "grad_norm": 4.314777488524298, + "learning_rate": 4.964537571425209e-06, + "loss": 1.0197, + "step": 605 + }, + { + "epoch": 0.0637332877594752, + "grad_norm": 3.1075473114740966, + "learning_rate": 4.9643973217397446e-06, + "loss": 1.042, + "step": 606 + }, + { + "epoch": 0.06383845820132251, + "grad_norm": 2.8685624789514637, + "learning_rate": 4.964256797254834e-06, + "loss": 1.0444, + "step": 607 + }, + { + "epoch": 0.06394362864316984, + "grad_norm": 3.0667602949659627, + "learning_rate": 4.964115997986148e-06, + "loss": 1.0434, + "step": 608 + }, + { + "epoch": 0.06404879908501715, + "grad_norm": 2.3511531120455955, + "learning_rate": 4.963974923949386e-06, + "loss": 1.0321, + "step": 609 + }, + { + "epoch": 0.06415396952686447, + "grad_norm": 2.7978514748891055, + "learning_rate": 4.963833575160278e-06, + "loss": 1.0402, + "step": 610 + }, + { + "epoch": 0.0642591399687118, + "grad_norm": 4.049018255338047, + "learning_rate": 4.963691951634588e-06, + "loss": 1.0826, + "step": 611 + }, + { + "epoch": 0.06436431041055911, + "grad_norm": 3.3678790577984072, + "learning_rate": 4.9635500533881065e-06, + "loss": 1.0306, + "step": 612 + }, + { + "epoch": 0.06446948085240643, + "grad_norm": 3.0634850122257786, + "learning_rate": 4.963407880436657e-06, + "loss": 1.0558, + "step": 613 + }, + { + "epoch": 0.06457465129425374, + "grad_norm": 3.0330037801287957, + "learning_rate": 4.963265432796092e-06, + "loss": 1.0743, + "step": 614 + }, + { + "epoch": 0.06467982173610107, + "grad_norm": 3.071000708505292, + "learning_rate": 4.963122710482295e-06, + "loss": 1.0543, + "step": 615 + }, + { + "epoch": 0.06478499217794839, + "grad_norm": 2.8610188182071385, + "learning_rate": 4.962979713511183e-06, + "loss": 1.0579, + "step": 616 + }, + { + "epoch": 0.0648901626197957, + "grad_norm": 4.206292540959885, + "learning_rate": 4.962836441898699e-06, + "loss": 1.0701, + "step": 617 + }, + { + "epoch": 0.06499533306164303, + "grad_norm": 4.356324104061252, + 
"learning_rate": 4.96269289566082e-06, + "loss": 1.0955, + "step": 618 + }, + { + "epoch": 0.06510050350349035, + "grad_norm": 3.319002510122104, + "learning_rate": 4.9625490748135525e-06, + "loss": 1.0207, + "step": 619 + }, + { + "epoch": 0.06520567394533766, + "grad_norm": 3.7127033614212026, + "learning_rate": 4.962404979372933e-06, + "loss": 1.0698, + "step": 620 + }, + { + "epoch": 0.06531084438718499, + "grad_norm": 4.18629805171437, + "learning_rate": 4.962260609355029e-06, + "loss": 1.0392, + "step": 621 + }, + { + "epoch": 0.0654160148290323, + "grad_norm": 4.993374261310267, + "learning_rate": 4.962115964775941e-06, + "loss": 1.008, + "step": 622 + }, + { + "epoch": 0.06552118527087962, + "grad_norm": 3.017269799303926, + "learning_rate": 4.961971045651796e-06, + "loss": 1.0126, + "step": 623 + }, + { + "epoch": 0.06562635571272694, + "grad_norm": 2.700741825605453, + "learning_rate": 4.961825851998754e-06, + "loss": 1.0446, + "step": 624 + }, + { + "epoch": 0.06573152615457425, + "grad_norm": 3.3134417945987567, + "learning_rate": 4.961680383833005e-06, + "loss": 1.0507, + "step": 625 + }, + { + "epoch": 0.06583669659642158, + "grad_norm": 4.294665174497018, + "learning_rate": 4.961534641170771e-06, + "loss": 1.0448, + "step": 626 + }, + { + "epoch": 0.06594186703826889, + "grad_norm": 3.659408792404163, + "learning_rate": 4.961388624028303e-06, + "loss": 1.0181, + "step": 627 + }, + { + "epoch": 0.06604703748011621, + "grad_norm": 3.640846682505268, + "learning_rate": 4.9612423324218816e-06, + "loss": 1.1181, + "step": 628 + }, + { + "epoch": 0.06615220792196354, + "grad_norm": 3.086905697715193, + "learning_rate": 4.961095766367821e-06, + "loss": 1.0573, + "step": 629 + }, + { + "epoch": 0.06625737836381085, + "grad_norm": 3.9597870998951468, + "learning_rate": 4.960948925882464e-06, + "loss": 1.0285, + "step": 630 + }, + { + "epoch": 0.06636254880565817, + "grad_norm": 4.292598646500938, + "learning_rate": 4.960801810982184e-06, + "loss": 1.0601, + "step": 631 + }, + { + "epoch": 0.06646771924750548, + "grad_norm": 3.600297435919102, + "learning_rate": 4.9606544216833865e-06, + "loss": 1.0509, + "step": 632 + }, + { + "epoch": 0.06657288968935281, + "grad_norm": 3.7580399038893755, + "learning_rate": 4.960506758002506e-06, + "loss": 1.0315, + "step": 633 + }, + { + "epoch": 0.06667806013120013, + "grad_norm": 3.2034136066067047, + "learning_rate": 4.960358819956007e-06, + "loss": 1.0486, + "step": 634 + }, + { + "epoch": 0.06678323057304744, + "grad_norm": 2.650453131312526, + "learning_rate": 4.960210607560387e-06, + "loss": 1.004, + "step": 635 + }, + { + "epoch": 0.06688840101489477, + "grad_norm": 3.3291779041483793, + "learning_rate": 4.960062120832174e-06, + "loss": 1.0592, + "step": 636 + }, + { + "epoch": 0.06699357145674208, + "grad_norm": 4.162217220561789, + "learning_rate": 4.959913359787923e-06, + "loss": 1.0178, + "step": 637 + }, + { + "epoch": 0.0670987418985894, + "grad_norm": 2.909767419364276, + "learning_rate": 4.959764324444224e-06, + "loss": 1.0572, + "step": 638 + }, + { + "epoch": 0.06720391234043673, + "grad_norm": 3.4590810268323127, + "learning_rate": 4.959615014817694e-06, + "loss": 0.9815, + "step": 639 + }, + { + "epoch": 0.06730908278228404, + "grad_norm": 5.476510291071485, + "learning_rate": 4.959465430924983e-06, + "loss": 1.0637, + "step": 640 + }, + { + "epoch": 0.06741425322413136, + "grad_norm": 3.499714211214465, + "learning_rate": 4.9593155727827705e-06, + "loss": 1.0513, + "step": 641 + }, + { + "epoch": 0.06751942366597867, + 
"grad_norm": 2.055004505673057, + "learning_rate": 4.9591654404077675e-06, + "loss": 1.043, + "step": 642 + }, + { + "epoch": 0.067624594107826, + "grad_norm": 3.4790021910325915, + "learning_rate": 4.959015033816714e-06, + "loss": 1.0246, + "step": 643 + }, + { + "epoch": 0.06772976454967332, + "grad_norm": 3.278908306612351, + "learning_rate": 4.958864353026382e-06, + "loss": 1.0671, + "step": 644 + }, + { + "epoch": 0.06783493499152063, + "grad_norm": 3.2634424157131585, + "learning_rate": 4.958713398053574e-06, + "loss": 1.052, + "step": 645 + }, + { + "epoch": 0.06794010543336795, + "grad_norm": 3.123492650449054, + "learning_rate": 4.958562168915121e-06, + "loss": 1.0305, + "step": 646 + }, + { + "epoch": 0.06804527587521528, + "grad_norm": 3.237487642730812, + "learning_rate": 4.9584106656278884e-06, + "loss": 1.0463, + "step": 647 + }, + { + "epoch": 0.06815044631706259, + "grad_norm": 4.187978143084541, + "learning_rate": 4.958258888208769e-06, + "loss": 1.0597, + "step": 648 + }, + { + "epoch": 0.06825561675890991, + "grad_norm": 2.9512732989254475, + "learning_rate": 4.958106836674686e-06, + "loss": 1.0864, + "step": 649 + }, + { + "epoch": 0.06836078720075722, + "grad_norm": 2.922076078908869, + "learning_rate": 4.957954511042598e-06, + "loss": 1.0638, + "step": 650 + }, + { + "epoch": 0.06846595764260455, + "grad_norm": 4.07040889167094, + "learning_rate": 4.957801911329485e-06, + "loss": 1.0417, + "step": 651 + }, + { + "epoch": 0.06857112808445187, + "grad_norm": 2.8281332452599495, + "learning_rate": 4.957649037552368e-06, + "loss": 1.0614, + "step": 652 + }, + { + "epoch": 0.06867629852629918, + "grad_norm": 2.6425711970959544, + "learning_rate": 4.957495889728291e-06, + "loss": 1.0922, + "step": 653 + }, + { + "epoch": 0.0687814689681465, + "grad_norm": 2.4951631686592157, + "learning_rate": 4.957342467874331e-06, + "loss": 1.017, + "step": 654 + }, + { + "epoch": 0.06888663940999382, + "grad_norm": 2.738893400348362, + "learning_rate": 4.957188772007597e-06, + "loss": 1.0489, + "step": 655 + }, + { + "epoch": 0.06899180985184114, + "grad_norm": 4.8906186641004785, + "learning_rate": 4.957034802145227e-06, + "loss": 1.0226, + "step": 656 + }, + { + "epoch": 0.06909698029368846, + "grad_norm": 3.9715716237558287, + "learning_rate": 4.9568805583043905e-06, + "loss": 1.0836, + "step": 657 + }, + { + "epoch": 0.06920215073553578, + "grad_norm": 2.7599656326533384, + "learning_rate": 4.956726040502285e-06, + "loss": 1.0429, + "step": 658 + }, + { + "epoch": 0.0693073211773831, + "grad_norm": 3.4427680533746687, + "learning_rate": 4.956571248756142e-06, + "loss": 1.03, + "step": 659 + }, + { + "epoch": 0.06941249161923041, + "grad_norm": 3.5854381263268666, + "learning_rate": 4.9564161830832214e-06, + "loss": 1.0513, + "step": 660 + }, + { + "epoch": 0.06951766206107773, + "grad_norm": 3.7704419908620355, + "learning_rate": 4.956260843500814e-06, + "loss": 1.0662, + "step": 661 + }, + { + "epoch": 0.06962283250292506, + "grad_norm": 4.3580242444386545, + "learning_rate": 4.9561052300262415e-06, + "loss": 1.0661, + "step": 662 + }, + { + "epoch": 0.06972800294477237, + "grad_norm": 3.720266896736387, + "learning_rate": 4.955949342676856e-06, + "loss": 1.0175, + "step": 663 + }, + { + "epoch": 0.06983317338661969, + "grad_norm": 4.7410376644337475, + "learning_rate": 4.955793181470041e-06, + "loss": 1.0149, + "step": 664 + }, + { + "epoch": 0.069938343828467, + "grad_norm": 3.32459207886422, + "learning_rate": 4.955636746423208e-06, + "loss": 1.0271, + "step": 665 + }, + { + 
"epoch": 0.07004351427031433, + "grad_norm": 4.773668981576176, + "learning_rate": 4.9554800375538026e-06, + "loss": 1.1019, + "step": 666 + }, + { + "epoch": 0.07014868471216165, + "grad_norm": 4.563403967070941, + "learning_rate": 4.955323054879298e-06, + "loss": 1.0346, + "step": 667 + }, + { + "epoch": 0.07025385515400896, + "grad_norm": 3.2801934792913885, + "learning_rate": 4.9551657984171995e-06, + "loss": 1.0036, + "step": 668 + }, + { + "epoch": 0.07035902559585629, + "grad_norm": 2.815254256813279, + "learning_rate": 4.955008268185041e-06, + "loss": 1.0693, + "step": 669 + }, + { + "epoch": 0.0704641960377036, + "grad_norm": 3.9847665137382737, + "learning_rate": 4.954850464200391e-06, + "loss": 1.0579, + "step": 670 + }, + { + "epoch": 0.07056936647955092, + "grad_norm": 3.8295707952048708, + "learning_rate": 4.954692386480843e-06, + "loss": 1.0618, + "step": 671 + }, + { + "epoch": 0.07067453692139825, + "grad_norm": 4.523650449147604, + "learning_rate": 4.954534035044025e-06, + "loss": 1.0736, + "step": 672 + }, + { + "epoch": 0.07077970736324556, + "grad_norm": 3.6692146168192235, + "learning_rate": 4.954375409907595e-06, + "loss": 1.0575, + "step": 673 + }, + { + "epoch": 0.07088487780509288, + "grad_norm": 3.325725179037857, + "learning_rate": 4.954216511089242e-06, + "loss": 1.0396, + "step": 674 + }, + { + "epoch": 0.07099004824694019, + "grad_norm": 3.3396872270434512, + "learning_rate": 4.954057338606681e-06, + "loss": 1.0595, + "step": 675 + }, + { + "epoch": 0.07109521868878751, + "grad_norm": 2.472910469422091, + "learning_rate": 4.953897892477664e-06, + "loss": 1.0438, + "step": 676 + }, + { + "epoch": 0.07120038913063484, + "grad_norm": 2.9528197412204227, + "learning_rate": 4.953738172719969e-06, + "loss": 1.0478, + "step": 677 + }, + { + "epoch": 0.07130555957248215, + "grad_norm": 3.1922126178334853, + "learning_rate": 4.953578179351407e-06, + "loss": 1.0706, + "step": 678 + }, + { + "epoch": 0.07141073001432947, + "grad_norm": 2.9060870572179778, + "learning_rate": 4.953417912389817e-06, + "loss": 1.0375, + "step": 679 + }, + { + "epoch": 0.0715159004561768, + "grad_norm": 2.2896504072768393, + "learning_rate": 4.9532573718530716e-06, + "loss": 1.0023, + "step": 680 + }, + { + "epoch": 0.07162107089802411, + "grad_norm": 3.4112113846941288, + "learning_rate": 4.953096557759072e-06, + "loss": 1.0565, + "step": 681 + }, + { + "epoch": 0.07172624133987143, + "grad_norm": 3.7035390337251837, + "learning_rate": 4.9529354701257486e-06, + "loss": 1.0406, + "step": 682 + }, + { + "epoch": 0.07183141178171874, + "grad_norm": 2.8453052460942705, + "learning_rate": 4.9527741089710664e-06, + "loss": 1.0652, + "step": 683 + }, + { + "epoch": 0.07193658222356607, + "grad_norm": 3.7620645383784233, + "learning_rate": 4.952612474313017e-06, + "loss": 1.047, + "step": 684 + }, + { + "epoch": 0.07204175266541339, + "grad_norm": 4.070346821177919, + "learning_rate": 4.9524505661696235e-06, + "loss": 1.0757, + "step": 685 + }, + { + "epoch": 0.0721469231072607, + "grad_norm": 3.5783791981742135, + "learning_rate": 4.952288384558941e-06, + "loss": 1.0512, + "step": 686 + }, + { + "epoch": 0.07225209354910803, + "grad_norm": 3.0673840522581046, + "learning_rate": 4.952125929499054e-06, + "loss": 1.0278, + "step": 687 + }, + { + "epoch": 0.07235726399095534, + "grad_norm": 3.2229108300056653, + "learning_rate": 4.9519632010080765e-06, + "loss": 1.0657, + "step": 688 + }, + { + "epoch": 0.07246243443280266, + "grad_norm": 2.652129171991843, + "learning_rate": 4.951800199104155e-06, + 
"loss": 1.0299, + "step": 689 + }, + { + "epoch": 0.07256760487464999, + "grad_norm": 3.364270267477337, + "learning_rate": 4.951636923805464e-06, + "loss": 1.0527, + "step": 690 + }, + { + "epoch": 0.0726727753164973, + "grad_norm": 3.4812038018158136, + "learning_rate": 4.951473375130212e-06, + "loss": 1.0331, + "step": 691 + }, + { + "epoch": 0.07277794575834462, + "grad_norm": 3.7937124089162704, + "learning_rate": 4.951309553096635e-06, + "loss": 1.0407, + "step": 692 + }, + { + "epoch": 0.07288311620019193, + "grad_norm": 3.1503406868653787, + "learning_rate": 4.951145457723e-06, + "loss": 1.0531, + "step": 693 + }, + { + "epoch": 0.07298828664203925, + "grad_norm": 2.8144705447628526, + "learning_rate": 4.9509810890276065e-06, + "loss": 1.1033, + "step": 694 + }, + { + "epoch": 0.07309345708388658, + "grad_norm": 3.3436807422524053, + "learning_rate": 4.95081644702878e-06, + "loss": 1.0837, + "step": 695 + }, + { + "epoch": 0.07319862752573389, + "grad_norm": 4.010147599775435, + "learning_rate": 4.950651531744882e-06, + "loss": 1.0769, + "step": 696 + }, + { + "epoch": 0.07330379796758121, + "grad_norm": 3.1326979849262195, + "learning_rate": 4.9504863431943004e-06, + "loss": 1.0019, + "step": 697 + }, + { + "epoch": 0.07340896840942852, + "grad_norm": 3.1431506128416995, + "learning_rate": 4.9503208813954565e-06, + "loss": 1.0326, + "step": 698 + }, + { + "epoch": 0.07351413885127585, + "grad_norm": 3.563862494912588, + "learning_rate": 4.9501551463667985e-06, + "loss": 1.0365, + "step": 699 + }, + { + "epoch": 0.07361930929312317, + "grad_norm": 2.9908810962971812, + "learning_rate": 4.949989138126809e-06, + "loss": 1.0655, + "step": 700 + }, + { + "epoch": 0.07372447973497048, + "grad_norm": 2.169128476617307, + "learning_rate": 4.949822856693997e-06, + "loss": 1.0292, + "step": 701 + }, + { + "epoch": 0.07382965017681781, + "grad_norm": 4.359267398693771, + "learning_rate": 4.949656302086907e-06, + "loss": 1.0228, + "step": 702 + }, + { + "epoch": 0.07393482061866512, + "grad_norm": 3.787799254738041, + "learning_rate": 4.949489474324108e-06, + "loss": 1.0041, + "step": 703 + }, + { + "epoch": 0.07403999106051244, + "grad_norm": 2.667883375421812, + "learning_rate": 4.949322373424206e-06, + "loss": 1.0423, + "step": 704 + }, + { + "epoch": 0.07414516150235977, + "grad_norm": 2.9302612190851787, + "learning_rate": 4.949154999405832e-06, + "loss": 1.0259, + "step": 705 + }, + { + "epoch": 0.07425033194420708, + "grad_norm": 4.283595912438445, + "learning_rate": 4.94898735228765e-06, + "loss": 1.0675, + "step": 706 + }, + { + "epoch": 0.0743555023860544, + "grad_norm": 3.2175291514292996, + "learning_rate": 4.948819432088353e-06, + "loss": 1.0215, + "step": 707 + }, + { + "epoch": 0.07446067282790172, + "grad_norm": 3.7422947687070645, + "learning_rate": 4.948651238826667e-06, + "loss": 1.0539, + "step": 708 + }, + { + "epoch": 0.07456584326974904, + "grad_norm": 3.6570635294285383, + "learning_rate": 4.948482772521346e-06, + "loss": 1.0375, + "step": 709 + }, + { + "epoch": 0.07467101371159636, + "grad_norm": 4.267142662139774, + "learning_rate": 4.948314033191175e-06, + "loss": 1.0495, + "step": 710 + }, + { + "epoch": 0.07477618415344367, + "grad_norm": 3.6286208637111277, + "learning_rate": 4.948145020854971e-06, + "loss": 1.0083, + "step": 711 + }, + { + "epoch": 0.074881354595291, + "grad_norm": 3.942094346556032, + "learning_rate": 4.947975735531578e-06, + "loss": 1.0355, + "step": 712 + }, + { + "epoch": 0.07498652503713832, + "grad_norm": 2.0042428048097256, + 
"learning_rate": 4.947806177239875e-06, + "loss": 1.066, + "step": 713 + }, + { + "epoch": 0.07509169547898563, + "grad_norm": 3.4172310190837143, + "learning_rate": 4.947636345998768e-06, + "loss": 1.0153, + "step": 714 + }, + { + "epoch": 0.07519686592083295, + "grad_norm": 3.2523213929196273, + "learning_rate": 4.947466241827194e-06, + "loss": 1.0243, + "step": 715 + }, + { + "epoch": 0.07530203636268026, + "grad_norm": 4.081933448185989, + "learning_rate": 4.947295864744121e-06, + "loss": 0.9925, + "step": 716 + }, + { + "epoch": 0.07540720680452759, + "grad_norm": 3.0031168886878583, + "learning_rate": 4.947125214768549e-06, + "loss": 1.0452, + "step": 717 + }, + { + "epoch": 0.07551237724637491, + "grad_norm": 2.8581225129342287, + "learning_rate": 4.946954291919505e-06, + "loss": 1.0324, + "step": 718 + }, + { + "epoch": 0.07561754768822222, + "grad_norm": 2.663605746652252, + "learning_rate": 4.946783096216049e-06, + "loss": 1.0029, + "step": 719 + }, + { + "epoch": 0.07572271813006955, + "grad_norm": 3.2647089266169984, + "learning_rate": 4.94661162767727e-06, + "loss": 1.0534, + "step": 720 + }, + { + "epoch": 0.07582788857191686, + "grad_norm": 3.6156070223917216, + "learning_rate": 4.94643988632229e-06, + "loss": 0.9935, + "step": 721 + }, + { + "epoch": 0.07593305901376418, + "grad_norm": 3.758495500937088, + "learning_rate": 4.946267872170256e-06, + "loss": 0.9976, + "step": 722 + }, + { + "epoch": 0.0760382294556115, + "grad_norm": 2.8459717369309736, + "learning_rate": 4.946095585240353e-06, + "loss": 1.0202, + "step": 723 + }, + { + "epoch": 0.07614339989745882, + "grad_norm": 4.228677921247087, + "learning_rate": 4.945923025551789e-06, + "loss": 1.0273, + "step": 724 + }, + { + "epoch": 0.07624857033930614, + "grad_norm": 2.354998022999138, + "learning_rate": 4.945750193123808e-06, + "loss": 1.0082, + "step": 725 + }, + { + "epoch": 0.07635374078115345, + "grad_norm": 3.1974238173146965, + "learning_rate": 4.94557708797568e-06, + "loss": 1.0845, + "step": 726 + }, + { + "epoch": 0.07645891122300078, + "grad_norm": 3.2991867123662186, + "learning_rate": 4.94540371012671e-06, + "loss": 1.0363, + "step": 727 + }, + { + "epoch": 0.0765640816648481, + "grad_norm": 4.790231421271602, + "learning_rate": 4.945230059596229e-06, + "loss": 1.0522, + "step": 728 + }, + { + "epoch": 0.07666925210669541, + "grad_norm": 2.985913413424653, + "learning_rate": 4.945056136403601e-06, + "loss": 1.0011, + "step": 729 + }, + { + "epoch": 0.07677442254854273, + "grad_norm": 3.4749105660247452, + "learning_rate": 4.944881940568219e-06, + "loss": 1.0506, + "step": 730 + }, + { + "epoch": 0.07687959299039004, + "grad_norm": 2.331821909393682, + "learning_rate": 4.94470747210951e-06, + "loss": 0.9939, + "step": 731 + }, + { + "epoch": 0.07698476343223737, + "grad_norm": 3.2284649198570965, + "learning_rate": 4.944532731046926e-06, + "loss": 1.0831, + "step": 732 + }, + { + "epoch": 0.07708993387408469, + "grad_norm": 3.902120982308078, + "learning_rate": 4.944357717399952e-06, + "loss": 1.1132, + "step": 733 + }, + { + "epoch": 0.077195104315932, + "grad_norm": 3.4172360346109785, + "learning_rate": 4.944182431188106e-06, + "loss": 1.073, + "step": 734 + }, + { + "epoch": 0.07730027475777933, + "grad_norm": 3.6093384619673254, + "learning_rate": 4.94400687243093e-06, + "loss": 1.0422, + "step": 735 + }, + { + "epoch": 0.07740544519962665, + "grad_norm": 2.938298485477721, + "learning_rate": 4.943831041148003e-06, + "loss": 0.991, + "step": 736 + }, + { + "epoch": 0.07751061564147396, + "grad_norm": 
3.5768669215903386, + "learning_rate": 4.94365493735893e-06, + "loss": 1.0159, + "step": 737 + }, + { + "epoch": 0.07761578608332129, + "grad_norm": 3.236703601168475, + "learning_rate": 4.943478561083349e-06, + "loss": 1.0307, + "step": 738 + }, + { + "epoch": 0.0777209565251686, + "grad_norm": 4.136119049876218, + "learning_rate": 4.9433019123409264e-06, + "loss": 1.0184, + "step": 739 + }, + { + "epoch": 0.07782612696701592, + "grad_norm": 3.485691899686312, + "learning_rate": 4.94312499115136e-06, + "loss": 1.0624, + "step": 740 + }, + { + "epoch": 0.07793129740886325, + "grad_norm": 3.294064033644912, + "learning_rate": 4.942947797534379e-06, + "loss": 1.0091, + "step": 741 + }, + { + "epoch": 0.07803646785071056, + "grad_norm": 3.388312825573561, + "learning_rate": 4.942770331509741e-06, + "loss": 1.048, + "step": 742 + }, + { + "epoch": 0.07814163829255788, + "grad_norm": 3.8425212422184707, + "learning_rate": 4.942592593097235e-06, + "loss": 1.0264, + "step": 743 + }, + { + "epoch": 0.07824680873440519, + "grad_norm": 4.216912890195311, + "learning_rate": 4.942414582316679e-06, + "loss": 1.0031, + "step": 744 + }, + { + "epoch": 0.07835197917625251, + "grad_norm": 2.943110966674974, + "learning_rate": 4.942236299187926e-06, + "loss": 1.0654, + "step": 745 + }, + { + "epoch": 0.07845714961809984, + "grad_norm": 2.7561756500798076, + "learning_rate": 4.942057743730852e-06, + "loss": 1.0762, + "step": 746 + }, + { + "epoch": 0.07856232005994715, + "grad_norm": 2.5060436964593458, + "learning_rate": 4.941878915965369e-06, + "loss": 1.0292, + "step": 747 + }, + { + "epoch": 0.07866749050179447, + "grad_norm": 4.677558119647075, + "learning_rate": 4.941699815911418e-06, + "loss": 1.0099, + "step": 748 + }, + { + "epoch": 0.07877266094364178, + "grad_norm": 2.6303885402682554, + "learning_rate": 4.94152044358897e-06, + "loss": 1.0585, + "step": 749 + }, + { + "epoch": 0.07887783138548911, + "grad_norm": 2.9906552945797777, + "learning_rate": 4.941340799018026e-06, + "loss": 1.0716, + "step": 750 + }, + { + "epoch": 0.07898300182733643, + "grad_norm": 3.1557961688407, + "learning_rate": 4.941160882218617e-06, + "loss": 1.0114, + "step": 751 + }, + { + "epoch": 0.07908817226918374, + "grad_norm": 2.5082121223741614, + "learning_rate": 4.940980693210807e-06, + "loss": 1.0839, + "step": 752 + }, + { + "epoch": 0.07919334271103107, + "grad_norm": 4.115073295467908, + "learning_rate": 4.940800232014688e-06, + "loss": 1.0528, + "step": 753 + }, + { + "epoch": 0.07929851315287838, + "grad_norm": 4.221836693988338, + "learning_rate": 4.940619498650381e-06, + "loss": 1.0458, + "step": 754 + }, + { + "epoch": 0.0794036835947257, + "grad_norm": 3.43936918626118, + "learning_rate": 4.940438493138041e-06, + "loss": 1.1102, + "step": 755 + }, + { + "epoch": 0.07950885403657303, + "grad_norm": 2.534335147364592, + "learning_rate": 4.9402572154978515e-06, + "loss": 1.03, + "step": 756 + }, + { + "epoch": 0.07961402447842034, + "grad_norm": 3.6985951390943397, + "learning_rate": 4.940075665750026e-06, + "loss": 1.0355, + "step": 757 + }, + { + "epoch": 0.07971919492026766, + "grad_norm": 3.654741106636443, + "learning_rate": 4.939893843914808e-06, + "loss": 1.0647, + "step": 758 + }, + { + "epoch": 0.07982436536211497, + "grad_norm": 3.686292494310798, + "learning_rate": 4.9397117500124725e-06, + "loss": 1.0508, + "step": 759 + }, + { + "epoch": 0.0799295358039623, + "grad_norm": 4.219962087918095, + "learning_rate": 4.939529384063325e-06, + "loss": 1.0415, + "step": 760 + }, + { + "epoch": 
0.08003470624580962, + "grad_norm": 3.5870732019483387, + "learning_rate": 4.9393467460876995e-06, + "loss": 1.0291, + "step": 761 + }, + { + "epoch": 0.08013987668765693, + "grad_norm": 3.5610335680344902, + "learning_rate": 4.939163836105964e-06, + "loss": 1.0283, + "step": 762 + }, + { + "epoch": 0.08024504712950425, + "grad_norm": 2.8440793536671802, + "learning_rate": 4.938980654138511e-06, + "loss": 1.0676, + "step": 763 + }, + { + "epoch": 0.08035021757135156, + "grad_norm": 3.5144508685059512, + "learning_rate": 4.93879720020577e-06, + "loss": 0.9927, + "step": 764 + }, + { + "epoch": 0.08045538801319889, + "grad_norm": 3.6070080778872353, + "learning_rate": 4.938613474328195e-06, + "loss": 1.0694, + "step": 765 + }, + { + "epoch": 0.08056055845504621, + "grad_norm": 4.242572556515164, + "learning_rate": 4.938429476526273e-06, + "loss": 1.027, + "step": 766 + }, + { + "epoch": 0.08066572889689352, + "grad_norm": 2.4803307039348605, + "learning_rate": 4.938245206820522e-06, + "loss": 1.0119, + "step": 767 + }, + { + "epoch": 0.08077089933874085, + "grad_norm": 2.9387959106412804, + "learning_rate": 4.938060665231491e-06, + "loss": 1.0404, + "step": 768 + }, + { + "epoch": 0.08087606978058817, + "grad_norm": 4.172139806499708, + "learning_rate": 4.937875851779755e-06, + "loss": 1.0385, + "step": 769 + }, + { + "epoch": 0.08098124022243548, + "grad_norm": 3.0840295037017786, + "learning_rate": 4.937690766485924e-06, + "loss": 1.0129, + "step": 770 + }, + { + "epoch": 0.08108641066428281, + "grad_norm": 3.070735572354231, + "learning_rate": 4.9375054093706356e-06, + "loss": 1.0741, + "step": 771 + }, + { + "epoch": 0.08119158110613012, + "grad_norm": 3.1878677650163247, + "learning_rate": 4.937319780454559e-06, + "loss": 1.0794, + "step": 772 + }, + { + "epoch": 0.08129675154797744, + "grad_norm": 2.863706515463834, + "learning_rate": 4.937133879758394e-06, + "loss": 1.0336, + "step": 773 + }, + { + "epoch": 0.08140192198982477, + "grad_norm": 3.0701280575553334, + "learning_rate": 4.936947707302868e-06, + "loss": 1.0586, + "step": 774 + }, + { + "epoch": 0.08150709243167208, + "grad_norm": 3.7990360279352218, + "learning_rate": 4.936761263108742e-06, + "loss": 1.013, + "step": 775 + }, + { + "epoch": 0.0816122628735194, + "grad_norm": 3.2463147605003493, + "learning_rate": 4.936574547196806e-06, + "loss": 1.0674, + "step": 776 + }, + { + "epoch": 0.08171743331536671, + "grad_norm": 3.346659843217337, + "learning_rate": 4.93638755958788e-06, + "loss": 1.0484, + "step": 777 + }, + { + "epoch": 0.08182260375721404, + "grad_norm": 3.421009993046732, + "learning_rate": 4.9362003003028135e-06, + "loss": 0.9867, + "step": 778 + }, + { + "epoch": 0.08192777419906136, + "grad_norm": 3.5732995197430086, + "learning_rate": 4.93601276936249e-06, + "loss": 1.0472, + "step": 779 + }, + { + "epoch": 0.08203294464090867, + "grad_norm": 2.5915581383584554, + "learning_rate": 4.935824966787818e-06, + "loss": 1.0096, + "step": 780 + }, + { + "epoch": 0.082138115082756, + "grad_norm": 2.6084843786308376, + "learning_rate": 4.935636892599741e-06, + "loss": 1.0011, + "step": 781 + }, + { + "epoch": 0.0822432855246033, + "grad_norm": 2.7979653896224375, + "learning_rate": 4.935448546819229e-06, + "loss": 1.0501, + "step": 782 + }, + { + "epoch": 0.08234845596645063, + "grad_norm": 3.1927150256696253, + "learning_rate": 4.935259929467285e-06, + "loss": 1.0239, + "step": 783 + }, + { + "epoch": 0.08245362640829795, + "grad_norm": 3.5723314450964945, + "learning_rate": 4.935071040564942e-06, + "loss": 1.06, + 
"step": 784 + }, + { + "epoch": 0.08255879685014526, + "grad_norm": 2.951913249700751, + "learning_rate": 4.93488188013326e-06, + "loss": 1.091, + "step": 785 + }, + { + "epoch": 0.08266396729199259, + "grad_norm": 2.897178403548757, + "learning_rate": 4.9346924481933345e-06, + "loss": 0.9997, + "step": 786 + }, + { + "epoch": 0.0827691377338399, + "grad_norm": 2.976619910088866, + "learning_rate": 4.9345027447662876e-06, + "loss": 1.0628, + "step": 787 + }, + { + "epoch": 0.08287430817568722, + "grad_norm": 5.562882028586305, + "learning_rate": 4.934312769873273e-06, + "loss": 1.0514, + "step": 788 + }, + { + "epoch": 0.08297947861753455, + "grad_norm": 3.3122007501483086, + "learning_rate": 4.934122523535474e-06, + "loss": 1.017, + "step": 789 + }, + { + "epoch": 0.08308464905938186, + "grad_norm": 4.484356852207891, + "learning_rate": 4.9339320057741045e-06, + "loss": 1.0628, + "step": 790 + }, + { + "epoch": 0.08318981950122918, + "grad_norm": 2.662851425317841, + "learning_rate": 4.933741216610409e-06, + "loss": 1.068, + "step": 791 + }, + { + "epoch": 0.08329498994307649, + "grad_norm": 3.0734598927453765, + "learning_rate": 4.933550156065662e-06, + "loss": 1.043, + "step": 792 + }, + { + "epoch": 0.08340016038492382, + "grad_norm": 2.692919866577266, + "learning_rate": 4.933358824161167e-06, + "loss": 1.0257, + "step": 793 + }, + { + "epoch": 0.08350533082677114, + "grad_norm": 3.656399597645738, + "learning_rate": 4.933167220918262e-06, + "loss": 1.0345, + "step": 794 + }, + { + "epoch": 0.08361050126861845, + "grad_norm": 3.587236612420999, + "learning_rate": 4.9329753463583095e-06, + "loss": 1.0455, + "step": 795 + }, + { + "epoch": 0.08371567171046577, + "grad_norm": 3.656718452993634, + "learning_rate": 4.932783200502705e-06, + "loss": 1.0298, + "step": 796 + }, + { + "epoch": 0.0838208421523131, + "grad_norm": 3.00036115403809, + "learning_rate": 4.932590783372877e-06, + "loss": 1.022, + "step": 797 + }, + { + "epoch": 0.08392601259416041, + "grad_norm": 2.5105535700401242, + "learning_rate": 4.9323980949902786e-06, + "loss": 1.0868, + "step": 798 + }, + { + "epoch": 0.08403118303600773, + "grad_norm": 3.7118503835471546, + "learning_rate": 4.9322051353763965e-06, + "loss": 1.03, + "step": 799 + }, + { + "epoch": 0.08413635347785504, + "grad_norm": 4.549244874514477, + "learning_rate": 4.932011904552749e-06, + "loss": 1.0312, + "step": 800 + }, + { + "epoch": 0.08424152391970237, + "grad_norm": 2.695319366140478, + "learning_rate": 4.931818402540881e-06, + "loss": 1.0456, + "step": 801 + }, + { + "epoch": 0.08434669436154969, + "grad_norm": 5.172251410102943, + "learning_rate": 4.93162462936237e-06, + "loss": 1.06, + "step": 802 + }, + { + "epoch": 0.084451864803397, + "grad_norm": 3.731288930108174, + "learning_rate": 4.931430585038823e-06, + "loss": 1.0429, + "step": 803 + }, + { + "epoch": 0.08455703524524433, + "grad_norm": 2.688017597159252, + "learning_rate": 4.931236269591878e-06, + "loss": 1.0244, + "step": 804 + }, + { + "epoch": 0.08466220568709164, + "grad_norm": 2.8764757418451796, + "learning_rate": 4.9310416830432025e-06, + "loss": 1.0227, + "step": 805 + }, + { + "epoch": 0.08476737612893896, + "grad_norm": 4.0235649503508455, + "learning_rate": 4.930846825414495e-06, + "loss": 1.0423, + "step": 806 + }, + { + "epoch": 0.08487254657078629, + "grad_norm": 3.536335146675604, + "learning_rate": 4.930651696727482e-06, + "loss": 1.0062, + "step": 807 + }, + { + "epoch": 0.0849777170126336, + "grad_norm": 2.957564708059679, + "learning_rate": 4.930456297003923e-06, + 
"loss": 1.0522, + "step": 808 + }, + { + "epoch": 0.08508288745448092, + "grad_norm": 3.219817117203254, + "learning_rate": 4.930260626265607e-06, + "loss": 1.0915, + "step": 809 + }, + { + "epoch": 0.08518805789632823, + "grad_norm": 2.2575110274834636, + "learning_rate": 4.930064684534352e-06, + "loss": 1.0395, + "step": 810 + }, + { + "epoch": 0.08529322833817556, + "grad_norm": 3.301667345209895, + "learning_rate": 4.929868471832007e-06, + "loss": 1.0394, + "step": 811 + }, + { + "epoch": 0.08539839878002288, + "grad_norm": 4.501375773442117, + "learning_rate": 4.929671988180452e-06, + "loss": 1.0537, + "step": 812 + }, + { + "epoch": 0.08550356922187019, + "grad_norm": 2.9447996895825437, + "learning_rate": 4.929475233601595e-06, + "loss": 1.0163, + "step": 813 + }, + { + "epoch": 0.08560873966371751, + "grad_norm": 3.890755121107722, + "learning_rate": 4.929278208117378e-06, + "loss": 1.0418, + "step": 814 + }, + { + "epoch": 0.08571391010556483, + "grad_norm": 2.55891195688046, + "learning_rate": 4.929080911749769e-06, + "loss": 1.0191, + "step": 815 + }, + { + "epoch": 0.08581908054741215, + "grad_norm": 4.672850050572631, + "learning_rate": 4.928883344520768e-06, + "loss": 1.0363, + "step": 816 + }, + { + "epoch": 0.08592425098925947, + "grad_norm": 3.1282491114827575, + "learning_rate": 4.928685506452407e-06, + "loss": 1.0421, + "step": 817 + }, + { + "epoch": 0.08602942143110678, + "grad_norm": 2.0646227484923143, + "learning_rate": 4.928487397566743e-06, + "loss": 1.0407, + "step": 818 + }, + { + "epoch": 0.08613459187295411, + "grad_norm": 3.2962481364973564, + "learning_rate": 4.928289017885871e-06, + "loss": 1.0724, + "step": 819 + }, + { + "epoch": 0.08623976231480142, + "grad_norm": 3.4645243615129897, + "learning_rate": 4.92809036743191e-06, + "loss": 1.0396, + "step": 820 + }, + { + "epoch": 0.08634493275664874, + "grad_norm": 2.3605676802931286, + "learning_rate": 4.92789144622701e-06, + "loss": 0.9944, + "step": 821 + }, + { + "epoch": 0.08645010319849607, + "grad_norm": 4.514015314071715, + "learning_rate": 4.927692254293354e-06, + "loss": 1.0792, + "step": 822 + }, + { + "epoch": 0.08655527364034338, + "grad_norm": 3.364110041018555, + "learning_rate": 4.927492791653153e-06, + "loss": 1.0558, + "step": 823 + }, + { + "epoch": 0.0866604440821907, + "grad_norm": 2.1273722043275995, + "learning_rate": 4.927293058328647e-06, + "loss": 1.0158, + "step": 824 + }, + { + "epoch": 0.08676561452403801, + "grad_norm": 3.5256816639423327, + "learning_rate": 4.92709305434211e-06, + "loss": 1.0353, + "step": 825 + }, + { + "epoch": 0.08687078496588534, + "grad_norm": 3.6339397393250095, + "learning_rate": 4.926892779715843e-06, + "loss": 1.0626, + "step": 826 + }, + { + "epoch": 0.08697595540773266, + "grad_norm": 2.9798523498933567, + "learning_rate": 4.926692234472178e-06, + "loss": 1.0116, + "step": 827 + }, + { + "epoch": 0.08708112584957997, + "grad_norm": 3.025215020843175, + "learning_rate": 4.9264914186334775e-06, + "loss": 1.0143, + "step": 828 + }, + { + "epoch": 0.0871862962914273, + "grad_norm": 2.931478722868882, + "learning_rate": 4.926290332222134e-06, + "loss": 1.0852, + "step": 829 + }, + { + "epoch": 0.08729146673327462, + "grad_norm": 3.179330588406845, + "learning_rate": 4.9260889752605715e-06, + "loss": 1.0382, + "step": 830 + }, + { + "epoch": 0.08739663717512193, + "grad_norm": 2.7220659147274264, + "learning_rate": 4.925887347771241e-06, + "loss": 1.0312, + "step": 831 + }, + { + "epoch": 0.08750180761696925, + "grad_norm": 4.610622268460886, + 
"learning_rate": 4.925685449776627e-06, + "loss": 1.046, + "step": 832 + }, + { + "epoch": 0.08760697805881656, + "grad_norm": 2.720186510106471, + "learning_rate": 4.925483281299242e-06, + "loss": 1.0246, + "step": 833 + }, + { + "epoch": 0.08771214850066389, + "grad_norm": 2.4258313763166854, + "learning_rate": 4.925280842361628e-06, + "loss": 1.0416, + "step": 834 + }, + { + "epoch": 0.08781731894251121, + "grad_norm": 2.5395043340810464, + "learning_rate": 4.925078132986361e-06, + "loss": 0.9997, + "step": 835 + }, + { + "epoch": 0.08792248938435852, + "grad_norm": 3.0748465405966168, + "learning_rate": 4.924875153196042e-06, + "loss": 1.0486, + "step": 836 + }, + { + "epoch": 0.08802765982620585, + "grad_norm": 3.676854554009859, + "learning_rate": 4.924671903013308e-06, + "loss": 1.0313, + "step": 837 + }, + { + "epoch": 0.08813283026805316, + "grad_norm": 5.11342505319437, + "learning_rate": 4.9244683824608205e-06, + "loss": 1.0866, + "step": 838 + }, + { + "epoch": 0.08823800070990048, + "grad_norm": 3.364235859687841, + "learning_rate": 4.924264591561275e-06, + "loss": 1.0374, + "step": 839 + }, + { + "epoch": 0.0883431711517478, + "grad_norm": 2.543264664118595, + "learning_rate": 4.924060530337394e-06, + "loss": 1.036, + "step": 840 + }, + { + "epoch": 0.08844834159359512, + "grad_norm": 2.2631599527781208, + "learning_rate": 4.9238561988119346e-06, + "loss": 1.0327, + "step": 841 + }, + { + "epoch": 0.08855351203544244, + "grad_norm": 3.3000605867237742, + "learning_rate": 4.923651597007679e-06, + "loss": 1.0459, + "step": 842 + }, + { + "epoch": 0.08865868247728975, + "grad_norm": 2.7652313110005657, + "learning_rate": 4.923446724947443e-06, + "loss": 1.0426, + "step": 843 + }, + { + "epoch": 0.08876385291913708, + "grad_norm": 3.1194029297763133, + "learning_rate": 4.923241582654071e-06, + "loss": 1.0286, + "step": 844 + }, + { + "epoch": 0.0888690233609844, + "grad_norm": 4.097579369777372, + "learning_rate": 4.923036170150438e-06, + "loss": 1.0021, + "step": 845 + }, + { + "epoch": 0.08897419380283171, + "grad_norm": 2.741301220555694, + "learning_rate": 4.922830487459449e-06, + "loss": 1.0333, + "step": 846 + }, + { + "epoch": 0.08907936424467904, + "grad_norm": 3.542962902887874, + "learning_rate": 4.92262453460404e-06, + "loss": 1.0349, + "step": 847 + }, + { + "epoch": 0.08918453468652635, + "grad_norm": 2.855697339700191, + "learning_rate": 4.922418311607176e-06, + "loss": 1.0311, + "step": 848 + }, + { + "epoch": 0.08928970512837367, + "grad_norm": 2.880135557426853, + "learning_rate": 4.922211818491852e-06, + "loss": 1.0054, + "step": 849 + }, + { + "epoch": 0.089394875570221, + "grad_norm": 4.213156822451781, + "learning_rate": 4.922005055281094e-06, + "loss": 1.0374, + "step": 850 + }, + { + "epoch": 0.0895000460120683, + "grad_norm": 3.6779910799103632, + "learning_rate": 4.921798021997957e-06, + "loss": 0.9997, + "step": 851 + }, + { + "epoch": 0.08960521645391563, + "grad_norm": 4.758221935382132, + "learning_rate": 4.921590718665527e-06, + "loss": 1.0371, + "step": 852 + }, + { + "epoch": 0.08971038689576294, + "grad_norm": 3.3975540812884444, + "learning_rate": 4.921383145306922e-06, + "loss": 1.0318, + "step": 853 + }, + { + "epoch": 0.08981555733761026, + "grad_norm": 2.530639515419742, + "learning_rate": 4.921175301945284e-06, + "loss": 1.0652, + "step": 854 + }, + { + "epoch": 0.08992072777945759, + "grad_norm": 2.8662710938283986, + "learning_rate": 4.920967188603794e-06, + "loss": 1.024, + "step": 855 + }, + { + "epoch": 0.0900258982213049, + 
"grad_norm": 3.4254687930282737, + "learning_rate": 4.920758805305654e-06, + "loss": 1.0682, + "step": 856 + }, + { + "epoch": 0.09013106866315222, + "grad_norm": 2.1300497289243707, + "learning_rate": 4.920550152074103e-06, + "loss": 1.0442, + "step": 857 + }, + { + "epoch": 0.09023623910499955, + "grad_norm": 2.7404728800806417, + "learning_rate": 4.920341228932406e-06, + "loss": 1.0344, + "step": 858 + }, + { + "epoch": 0.09034140954684686, + "grad_norm": 3.579891745395384, + "learning_rate": 4.9201320359038595e-06, + "loss": 1.016, + "step": 859 + }, + { + "epoch": 0.09044657998869418, + "grad_norm": 3.5079338774058355, + "learning_rate": 4.919922573011791e-06, + "loss": 1.017, + "step": 860 + }, + { + "epoch": 0.09055175043054149, + "grad_norm": 4.474680092802719, + "learning_rate": 4.919712840279559e-06, + "loss": 1.1165, + "step": 861 + }, + { + "epoch": 0.09065692087238882, + "grad_norm": 2.7950268131476217, + "learning_rate": 4.9195028377305465e-06, + "loss": 1.0025, + "step": 862 + }, + { + "epoch": 0.09076209131423614, + "grad_norm": 3.6168740028611377, + "learning_rate": 4.919292565388172e-06, + "loss": 1.0431, + "step": 863 + }, + { + "epoch": 0.09086726175608345, + "grad_norm": 3.569181810804004, + "learning_rate": 4.919082023275884e-06, + "loss": 0.9941, + "step": 864 + }, + { + "epoch": 0.09097243219793077, + "grad_norm": 2.687691480063549, + "learning_rate": 4.918871211417157e-06, + "loss": 1.0376, + "step": 865 + }, + { + "epoch": 0.09107760263977809, + "grad_norm": 3.481903769104546, + "learning_rate": 4.9186601298355e-06, + "loss": 1.0285, + "step": 866 + }, + { + "epoch": 0.09118277308162541, + "grad_norm": 3.2967751447777798, + "learning_rate": 4.91844877855445e-06, + "loss": 1.0255, + "step": 867 + }, + { + "epoch": 0.09128794352347273, + "grad_norm": 3.258456644130292, + "learning_rate": 4.918237157597574e-06, + "loss": 1.009, + "step": 868 + }, + { + "epoch": 0.09139311396532004, + "grad_norm": 4.155311114794585, + "learning_rate": 4.918025266988469e-06, + "loss": 1.0554, + "step": 869 + }, + { + "epoch": 0.09149828440716737, + "grad_norm": 2.719205408216921, + "learning_rate": 4.917813106750763e-06, + "loss": 1.0526, + "step": 870 + }, + { + "epoch": 0.09160345484901468, + "grad_norm": 3.7086484713956644, + "learning_rate": 4.917600676908114e-06, + "loss": 1.0841, + "step": 871 + }, + { + "epoch": 0.091708625290862, + "grad_norm": 3.6291790131528265, + "learning_rate": 4.9173879774842085e-06, + "loss": 1.0466, + "step": 872 + }, + { + "epoch": 0.09181379573270933, + "grad_norm": 4.901385161683924, + "learning_rate": 4.917175008502763e-06, + "loss": 1.0683, + "step": 873 + }, + { + "epoch": 0.09191896617455664, + "grad_norm": 2.6391942505342145, + "learning_rate": 4.91696176998753e-06, + "loss": 1.0368, + "step": 874 + }, + { + "epoch": 0.09202413661640396, + "grad_norm": 2.92397301023947, + "learning_rate": 4.916748261962282e-06, + "loss": 1.0502, + "step": 875 + }, + { + "epoch": 0.09212930705825127, + "grad_norm": 3.3487651603386643, + "learning_rate": 4.916534484450829e-06, + "loss": 1.0492, + "step": 876 + }, + { + "epoch": 0.0922344775000986, + "grad_norm": 3.4178169663489615, + "learning_rate": 4.9163204374770085e-06, + "loss": 1.0182, + "step": 877 + }, + { + "epoch": 0.09233964794194592, + "grad_norm": 2.660490436372227, + "learning_rate": 4.916106121064689e-06, + "loss": 1.0636, + "step": 878 + }, + { + "epoch": 0.09244481838379323, + "grad_norm": 3.647201920237282, + "learning_rate": 4.915891535237768e-06, + "loss": 1.0018, + "step": 879 + }, + { + 
"epoch": 0.09254998882564056, + "grad_norm": 2.7950649078172547, + "learning_rate": 4.915676680020173e-06, + "loss": 1.0415, + "step": 880 + }, + { + "epoch": 0.09265515926748787, + "grad_norm": 3.308392412587543, + "learning_rate": 4.915461555435863e-06, + "loss": 1.0095, + "step": 881 + }, + { + "epoch": 0.09276032970933519, + "grad_norm": 2.497275944616539, + "learning_rate": 4.915246161508825e-06, + "loss": 1.0251, + "step": 882 + }, + { + "epoch": 0.09286550015118251, + "grad_norm": 2.4296994999785975, + "learning_rate": 4.915030498263079e-06, + "loss": 1.0309, + "step": 883 + }, + { + "epoch": 0.09297067059302982, + "grad_norm": 4.151821039375308, + "learning_rate": 4.914814565722671e-06, + "loss": 1.0479, + "step": 884 + }, + { + "epoch": 0.09307584103487715, + "grad_norm": 2.935706432111893, + "learning_rate": 4.91459836391168e-06, + "loss": 1.0566, + "step": 885 + }, + { + "epoch": 0.09318101147672447, + "grad_norm": 3.4020244169255527, + "learning_rate": 4.914381892854214e-06, + "loss": 1.0392, + "step": 886 + }, + { + "epoch": 0.09328618191857178, + "grad_norm": 3.4742996375581985, + "learning_rate": 4.914165152574412e-06, + "loss": 1.0323, + "step": 887 + }, + { + "epoch": 0.09339135236041911, + "grad_norm": 4.274200371311763, + "learning_rate": 4.913948143096442e-06, + "loss": 1.0627, + "step": 888 + }, + { + "epoch": 0.09349652280226642, + "grad_norm": 5.300639145263014, + "learning_rate": 4.9137308644445e-06, + "loss": 1.015, + "step": 889 + }, + { + "epoch": 0.09360169324411374, + "grad_norm": 3.5689399063136884, + "learning_rate": 4.913513316642818e-06, + "loss": 1.0453, + "step": 890 + }, + { + "epoch": 0.09370686368596107, + "grad_norm": 4.104293284314525, + "learning_rate": 4.913295499715651e-06, + "loss": 1.039, + "step": 891 + }, + { + "epoch": 0.09381203412780838, + "grad_norm": 2.9248882626299046, + "learning_rate": 4.913077413687289e-06, + "loss": 1.0304, + "step": 892 + }, + { + "epoch": 0.0939172045696557, + "grad_norm": 2.565656733778235, + "learning_rate": 4.91285905858205e-06, + "loss": 1.0233, + "step": 893 + }, + { + "epoch": 0.09402237501150301, + "grad_norm": 4.317147708295007, + "learning_rate": 4.912640434424283e-06, + "loss": 1.0293, + "step": 894 + }, + { + "epoch": 0.09412754545335034, + "grad_norm": 4.1886808246014615, + "learning_rate": 4.912421541238365e-06, + "loss": 1.0664, + "step": 895 + }, + { + "epoch": 0.09423271589519766, + "grad_norm": 3.5710378646260996, + "learning_rate": 4.912202379048704e-06, + "loss": 1.0582, + "step": 896 + }, + { + "epoch": 0.09433788633704497, + "grad_norm": 3.8755773448584523, + "learning_rate": 4.91198294787974e-06, + "loss": 1.0265, + "step": 897 + }, + { + "epoch": 0.0944430567788923, + "grad_norm": 3.2106737884292484, + "learning_rate": 4.91176324775594e-06, + "loss": 1.0467, + "step": 898 + }, + { + "epoch": 0.0945482272207396, + "grad_norm": 2.239928932171705, + "learning_rate": 4.911543278701802e-06, + "loss": 1.0499, + "step": 899 + }, + { + "epoch": 0.09465339766258693, + "grad_norm": 4.27992709829204, + "learning_rate": 4.9113230407418565e-06, + "loss": 1.0314, + "step": 900 + }, + { + "epoch": 0.09475856810443425, + "grad_norm": 3.7991157017709134, + "learning_rate": 4.911102533900659e-06, + "loss": 1.019, + "step": 901 + }, + { + "epoch": 0.09486373854628156, + "grad_norm": 3.2843177713241345, + "learning_rate": 4.910881758202799e-06, + "loss": 1.0747, + "step": 902 + }, + { + "epoch": 0.09496890898812889, + "grad_norm": 3.3614094256905336, + "learning_rate": 4.910660713672895e-06, + "loss": 1.047, + 
"step": 903 + }, + { + "epoch": 0.0950740794299762, + "grad_norm": 2.546889529528352, + "learning_rate": 4.910439400335595e-06, + "loss": 1.0134, + "step": 904 + }, + { + "epoch": 0.09517924987182352, + "grad_norm": 3.5988909556040927, + "learning_rate": 4.910217818215576e-06, + "loss": 1.0236, + "step": 905 + }, + { + "epoch": 0.09528442031367085, + "grad_norm": 4.327738766483183, + "learning_rate": 4.909995967337548e-06, + "loss": 1.0353, + "step": 906 + }, + { + "epoch": 0.09538959075551816, + "grad_norm": 4.8588271006072326, + "learning_rate": 4.9097738477262466e-06, + "loss": 1.0939, + "step": 907 + }, + { + "epoch": 0.09549476119736548, + "grad_norm": 4.033803872334996, + "learning_rate": 4.9095514594064434e-06, + "loss": 1.0285, + "step": 908 + }, + { + "epoch": 0.09559993163921279, + "grad_norm": 3.75933079753237, + "learning_rate": 4.9093288024029325e-06, + "loss": 1.0483, + "step": 909 + }, + { + "epoch": 0.09570510208106012, + "grad_norm": 3.0548663786720534, + "learning_rate": 4.9091058767405455e-06, + "loss": 1.051, + "step": 910 + }, + { + "epoch": 0.09581027252290744, + "grad_norm": 2.784053866442591, + "learning_rate": 4.908882682444137e-06, + "loss": 1.059, + "step": 911 + }, + { + "epoch": 0.09591544296475475, + "grad_norm": 2.130590850889847, + "learning_rate": 4.908659219538598e-06, + "loss": 1.0208, + "step": 912 + }, + { + "epoch": 0.09602061340660208, + "grad_norm": 2.3000679174789456, + "learning_rate": 4.908435488048844e-06, + "loss": 1.0253, + "step": 913 + }, + { + "epoch": 0.09612578384844939, + "grad_norm": 3.0043936554319193, + "learning_rate": 4.908211487999825e-06, + "loss": 1.0579, + "step": 914 + }, + { + "epoch": 0.09623095429029671, + "grad_norm": 2.864142159496675, + "learning_rate": 4.9079872194165155e-06, + "loss": 1.0353, + "step": 915 + }, + { + "epoch": 0.09633612473214404, + "grad_norm": 3.8952660611202754, + "learning_rate": 4.907762682323926e-06, + "loss": 1.0701, + "step": 916 + }, + { + "epoch": 0.09644129517399135, + "grad_norm": 3.9160674317154913, + "learning_rate": 4.907537876747094e-06, + "loss": 1.0547, + "step": 917 + }, + { + "epoch": 0.09654646561583867, + "grad_norm": 2.7081904691236613, + "learning_rate": 4.907312802711086e-06, + "loss": 0.9969, + "step": 918 + }, + { + "epoch": 0.096651636057686, + "grad_norm": 4.209552166676453, + "learning_rate": 4.907087460240999e-06, + "loss": 1.0433, + "step": 919 + }, + { + "epoch": 0.0967568064995333, + "grad_norm": 2.3073683643555083, + "learning_rate": 4.906861849361962e-06, + "loss": 1.0019, + "step": 920 + }, + { + "epoch": 0.09686197694138063, + "grad_norm": 3.94540024204696, + "learning_rate": 4.906635970099131e-06, + "loss": 1.0257, + "step": 921 + }, + { + "epoch": 0.09696714738322794, + "grad_norm": 3.9569458882368895, + "learning_rate": 4.906409822477695e-06, + "loss": 1.0383, + "step": 922 + }, + { + "epoch": 0.09707231782507526, + "grad_norm": 3.4725916392841043, + "learning_rate": 4.906183406522869e-06, + "loss": 1.006, + "step": 923 + }, + { + "epoch": 0.09717748826692259, + "grad_norm": 3.5244971631589577, + "learning_rate": 4.9059567222599015e-06, + "loss": 0.992, + "step": 924 + }, + { + "epoch": 0.0972826587087699, + "grad_norm": 3.274079488886004, + "learning_rate": 4.90572976971407e-06, + "loss": 0.992, + "step": 925 + }, + { + "epoch": 0.09738782915061722, + "grad_norm": 2.5329141838670455, + "learning_rate": 4.905502548910681e-06, + "loss": 1.041, + "step": 926 + }, + { + "epoch": 0.09749299959246453, + "grad_norm": 2.8385757720404765, + "learning_rate": 
4.90527505987507e-06, + "loss": 1.0303, + "step": 927 + }, + { + "epoch": 0.09759817003431186, + "grad_norm": 2.930548884609321, + "learning_rate": 4.905047302632606e-06, + "loss": 1.0359, + "step": 928 + }, + { + "epoch": 0.09770334047615918, + "grad_norm": 4.156000403439952, + "learning_rate": 4.904819277208685e-06, + "loss": 1.0979, + "step": 929 + }, + { + "epoch": 0.09780851091800649, + "grad_norm": 2.536688544557416, + "learning_rate": 4.904590983628732e-06, + "loss": 1.0609, + "step": 930 + }, + { + "epoch": 0.09791368135985382, + "grad_norm": 3.9076531761075897, + "learning_rate": 4.904362421918205e-06, + "loss": 1.0202, + "step": 931 + }, + { + "epoch": 0.09801885180170113, + "grad_norm": 3.5518902322982493, + "learning_rate": 4.904133592102591e-06, + "loss": 1.0553, + "step": 932 + }, + { + "epoch": 0.09812402224354845, + "grad_norm": 3.1936214651164696, + "learning_rate": 4.9039044942074055e-06, + "loss": 1.0646, + "step": 933 + }, + { + "epoch": 0.09822919268539577, + "grad_norm": 4.135090085556305, + "learning_rate": 4.903675128258194e-06, + "loss": 1.0229, + "step": 934 + }, + { + "epoch": 0.09833436312724309, + "grad_norm": 2.2494098003681673, + "learning_rate": 4.903445494280534e-06, + "loss": 1.0082, + "step": 935 + }, + { + "epoch": 0.09843953356909041, + "grad_norm": 3.148431264860253, + "learning_rate": 4.90321559230003e-06, + "loss": 1.03, + "step": 936 + }, + { + "epoch": 0.09854470401093772, + "grad_norm": 3.4526369350300383, + "learning_rate": 4.902985422342319e-06, + "loss": 1.0412, + "step": 937 + }, + { + "epoch": 0.09864987445278504, + "grad_norm": 1.9933016033441238, + "learning_rate": 4.902754984433067e-06, + "loss": 1.0032, + "step": 938 + }, + { + "epoch": 0.09875504489463237, + "grad_norm": 2.560711359722772, + "learning_rate": 4.902524278597969e-06, + "loss": 1.0586, + "step": 939 + }, + { + "epoch": 0.09886021533647968, + "grad_norm": 3.6578892637833404, + "learning_rate": 4.9022933048627496e-06, + "loss": 1.0093, + "step": 940 + }, + { + "epoch": 0.098965385778327, + "grad_norm": 3.2769623438585196, + "learning_rate": 4.902062063253165e-06, + "loss": 1.0349, + "step": 941 + }, + { + "epoch": 0.09907055622017431, + "grad_norm": 2.629492634710313, + "learning_rate": 4.901830553795001e-06, + "loss": 1.0607, + "step": 942 + }, + { + "epoch": 0.09917572666202164, + "grad_norm": 3.126871970690739, + "learning_rate": 4.9015987765140715e-06, + "loss": 1.0513, + "step": 943 + }, + { + "epoch": 0.09928089710386896, + "grad_norm": 5.290623571841186, + "learning_rate": 4.901366731436223e-06, + "loss": 1.0321, + "step": 944 + }, + { + "epoch": 0.09938606754571627, + "grad_norm": 2.549896186428418, + "learning_rate": 4.901134418587329e-06, + "loss": 1.0487, + "step": 945 + }, + { + "epoch": 0.0994912379875636, + "grad_norm": 3.3781283890627596, + "learning_rate": 4.900901837993295e-06, + "loss": 1.0193, + "step": 946 + }, + { + "epoch": 0.09959640842941092, + "grad_norm": 3.3355283132202738, + "learning_rate": 4.900668989680055e-06, + "loss": 0.9998, + "step": 947 + }, + { + "epoch": 0.09970157887125823, + "grad_norm": 2.6098798632254003, + "learning_rate": 4.900435873673574e-06, + "loss": 0.9725, + "step": 948 + }, + { + "epoch": 0.09980674931310556, + "grad_norm": 2.802666593583616, + "learning_rate": 4.900202489999845e-06, + "loss": 1.0438, + "step": 949 + }, + { + "epoch": 0.09991191975495287, + "grad_norm": 2.0938671069598858, + "learning_rate": 4.899968838684893e-06, + "loss": 1.0205, + "step": 950 + }, + { + "epoch": 0.10001709019680019, + "grad_norm": 
3.6038776689006586, + "learning_rate": 4.8997349197547724e-06, + "loss": 1.0578, + "step": 951 + }, + { + "epoch": 0.10012226063864751, + "grad_norm": 4.970829658802114, + "learning_rate": 4.899500733235567e-06, + "loss": 1.025, + "step": 952 + }, + { + "epoch": 0.10022743108049482, + "grad_norm": 4.678353320775025, + "learning_rate": 4.899266279153388e-06, + "loss": 1.0622, + "step": 953 + }, + { + "epoch": 0.10033260152234215, + "grad_norm": 5.741743419399476, + "learning_rate": 4.899031557534383e-06, + "loss": 1.0874, + "step": 954 + }, + { + "epoch": 0.10043777196418946, + "grad_norm": 3.104804866637022, + "learning_rate": 4.8987965684047215e-06, + "loss": 1.0013, + "step": 955 + }, + { + "epoch": 0.10054294240603678, + "grad_norm": 3.03354521888864, + "learning_rate": 4.898561311790609e-06, + "loss": 1.004, + "step": 956 + }, + { + "epoch": 0.10064811284788411, + "grad_norm": 3.424553202851718, + "learning_rate": 4.898325787718277e-06, + "loss": 1.0363, + "step": 957 + }, + { + "epoch": 0.10075328328973142, + "grad_norm": 3.000149075657252, + "learning_rate": 4.898089996213988e-06, + "loss": 1.0371, + "step": 958 + }, + { + "epoch": 0.10085845373157874, + "grad_norm": 2.3914498005066194, + "learning_rate": 4.897853937304037e-06, + "loss": 1.0101, + "step": 959 + }, + { + "epoch": 0.10096362417342605, + "grad_norm": 3.7269730617951997, + "learning_rate": 4.897617611014744e-06, + "loss": 1.0741, + "step": 960 + }, + { + "epoch": 0.10106879461527338, + "grad_norm": 3.4685995382297, + "learning_rate": 4.897381017372462e-06, + "loss": 1.0635, + "step": 961 + }, + { + "epoch": 0.1011739650571207, + "grad_norm": 2.414965651072397, + "learning_rate": 4.897144156403573e-06, + "loss": 1.0601, + "step": 962 + }, + { + "epoch": 0.10127913549896801, + "grad_norm": 3.3913472210312117, + "learning_rate": 4.8969070281344895e-06, + "loss": 1.0682, + "step": 963 + }, + { + "epoch": 0.10138430594081534, + "grad_norm": 2.6184438328601387, + "learning_rate": 4.896669632591652e-06, + "loss": 1.0295, + "step": 964 + }, + { + "epoch": 0.10148947638266265, + "grad_norm": 3.3103117280466154, + "learning_rate": 4.8964319698015325e-06, + "loss": 1.0155, + "step": 965 + }, + { + "epoch": 0.10159464682450997, + "grad_norm": 2.7095856980343966, + "learning_rate": 4.896194039790632e-06, + "loss": 1.0172, + "step": 966 + }, + { + "epoch": 0.1016998172663573, + "grad_norm": 3.606502874104067, + "learning_rate": 4.895955842585483e-06, + "loss": 1.0368, + "step": 967 + }, + { + "epoch": 0.1018049877082046, + "grad_norm": 3.7818643892335024, + "learning_rate": 4.895717378212644e-06, + "loss": 1.0736, + "step": 968 + }, + { + "epoch": 0.10191015815005193, + "grad_norm": 3.9660970245154865, + "learning_rate": 4.895478646698707e-06, + "loss": 1.0143, + "step": 969 + }, + { + "epoch": 0.10201532859189924, + "grad_norm": 3.472975484177083, + "learning_rate": 4.895239648070292e-06, + "loss": 1.0414, + "step": 970 + }, + { + "epoch": 0.10212049903374656, + "grad_norm": 3.2923024045337277, + "learning_rate": 4.895000382354049e-06, + "loss": 1.0605, + "step": 971 + }, + { + "epoch": 0.10222566947559389, + "grad_norm": 4.414663168893738, + "learning_rate": 4.89476084957666e-06, + "loss": 1.0273, + "step": 972 + }, + { + "epoch": 0.1023308399174412, + "grad_norm": 5.295626697084187, + "learning_rate": 4.894521049764831e-06, + "loss": 1.0586, + "step": 973 + }, + { + "epoch": 0.10243601035928852, + "grad_norm": 2.9933688401634213, + "learning_rate": 4.8942809829453046e-06, + "loss": 1.0011, + "step": 974 + }, + { + "epoch": 
0.10254118080113583, + "grad_norm": 3.1480184087773253, + "learning_rate": 4.894040649144849e-06, + "loss": 1.0485, + "step": 975 + }, + { + "epoch": 0.10264635124298316, + "grad_norm": 3.0397182716899254, + "learning_rate": 4.893800048390264e-06, + "loss": 1.0304, + "step": 976 + }, + { + "epoch": 0.10275152168483048, + "grad_norm": 3.4747456565622787, + "learning_rate": 4.893559180708378e-06, + "loss": 1.0068, + "step": 977 + }, + { + "epoch": 0.10285669212667779, + "grad_norm": 3.54567802483446, + "learning_rate": 4.8933180461260485e-06, + "loss": 1.0231, + "step": 978 + }, + { + "epoch": 0.10296186256852512, + "grad_norm": 3.4856402750957307, + "learning_rate": 4.893076644670166e-06, + "loss": 1.0518, + "step": 979 + }, + { + "epoch": 0.10306703301037244, + "grad_norm": 2.97684659916138, + "learning_rate": 4.892834976367647e-06, + "loss": 0.9815, + "step": 980 + }, + { + "epoch": 0.10317220345221975, + "grad_norm": 4.321536953737777, + "learning_rate": 4.8925930412454405e-06, + "loss": 1.0374, + "step": 981 + }, + { + "epoch": 0.10327737389406708, + "grad_norm": 3.6585034863508605, + "learning_rate": 4.8923508393305224e-06, + "loss": 0.9504, + "step": 982 + }, + { + "epoch": 0.10338254433591439, + "grad_norm": 3.3098381288492074, + "learning_rate": 4.892108370649902e-06, + "loss": 1.0276, + "step": 983 + }, + { + "epoch": 0.10348771477776171, + "grad_norm": 3.3462769999723383, + "learning_rate": 4.891865635230616e-06, + "loss": 1.0415, + "step": 984 + }, + { + "epoch": 0.10359288521960903, + "grad_norm": 3.1379133999705426, + "learning_rate": 4.891622633099731e-06, + "loss": 1.0503, + "step": 985 + }, + { + "epoch": 0.10369805566145635, + "grad_norm": 2.096943884410061, + "learning_rate": 4.8913793642843434e-06, + "loss": 1.0132, + "step": 986 + }, + { + "epoch": 0.10380322610330367, + "grad_norm": 3.49899699410188, + "learning_rate": 4.89113582881158e-06, + "loss": 1.036, + "step": 987 + }, + { + "epoch": 0.10390839654515098, + "grad_norm": 4.363397554483991, + "learning_rate": 4.890892026708596e-06, + "loss": 1.0641, + "step": 988 + }, + { + "epoch": 0.1040135669869983, + "grad_norm": 2.843253723260328, + "learning_rate": 4.8906479580025774e-06, + "loss": 1.0259, + "step": 989 + }, + { + "epoch": 0.10411873742884563, + "grad_norm": 3.720334378029494, + "learning_rate": 4.890403622720742e-06, + "loss": 1.0301, + "step": 990 + }, + { + "epoch": 0.10422390787069294, + "grad_norm": 3.4282471758253483, + "learning_rate": 4.890159020890333e-06, + "loss": 1.0388, + "step": 991 + }, + { + "epoch": 0.10432907831254026, + "grad_norm": 4.570380050609438, + "learning_rate": 4.889914152538625e-06, + "loss": 1.0445, + "step": 992 + }, + { + "epoch": 0.10443424875438757, + "grad_norm": 4.005408760979363, + "learning_rate": 4.889669017692924e-06, + "loss": 1.0779, + "step": 993 + }, + { + "epoch": 0.1045394191962349, + "grad_norm": 3.317322954004475, + "learning_rate": 4.889423616380564e-06, + "loss": 1.0502, + "step": 994 + }, + { + "epoch": 0.10464458963808222, + "grad_norm": 2.196763228351377, + "learning_rate": 4.889177948628908e-06, + "loss": 1.0265, + "step": 995 + }, + { + "epoch": 0.10474976007992953, + "grad_norm": 2.877333453688417, + "learning_rate": 4.8889320144653525e-06, + "loss": 1.0354, + "step": 996 + }, + { + "epoch": 0.10485493052177686, + "grad_norm": 1.912523311494541, + "learning_rate": 4.8886858139173185e-06, + "loss": 1.0046, + "step": 997 + }, + { + "epoch": 0.10496010096362417, + "grad_norm": 4.312918317607742, + "learning_rate": 4.88843934701226e-06, + "loss": 1.0117, + 
"step": 998 + }, + { + "epoch": 0.10506527140547149, + "grad_norm": 3.0476518763567144, + "learning_rate": 4.888192613777661e-06, + "loss": 1.0288, + "step": 999 + }, + { + "epoch": 0.10517044184731882, + "grad_norm": 2.9241631966551433, + "learning_rate": 4.887945614241034e-06, + "loss": 1.0364, + "step": 1000 + }, + { + "epoch": 0.10527561228916613, + "grad_norm": 2.0938595160121287, + "learning_rate": 4.88769834842992e-06, + "loss": 0.999, + "step": 1001 + }, + { + "epoch": 0.10538078273101345, + "grad_norm": 2.9042647406620397, + "learning_rate": 4.887450816371892e-06, + "loss": 1.0036, + "step": 1002 + }, + { + "epoch": 0.10548595317286076, + "grad_norm": 3.430761146907295, + "learning_rate": 4.887203018094552e-06, + "loss": 1.0187, + "step": 1003 + }, + { + "epoch": 0.10559112361470809, + "grad_norm": 3.564602317410496, + "learning_rate": 4.88695495362553e-06, + "loss": 1.0255, + "step": 1004 + }, + { + "epoch": 0.10569629405655541, + "grad_norm": 2.3078908692690443, + "learning_rate": 4.886706622992489e-06, + "loss": 1.0303, + "step": 1005 + }, + { + "epoch": 0.10580146449840272, + "grad_norm": 2.761094777536776, + "learning_rate": 4.886458026223118e-06, + "loss": 1.0259, + "step": 1006 + }, + { + "epoch": 0.10590663494025004, + "grad_norm": 3.4909322317604388, + "learning_rate": 4.88620916334514e-06, + "loss": 1.0701, + "step": 1007 + }, + { + "epoch": 0.10601180538209737, + "grad_norm": 3.042806287394382, + "learning_rate": 4.885960034386302e-06, + "loss": 1.0188, + "step": 1008 + }, + { + "epoch": 0.10611697582394468, + "grad_norm": 3.4715361900462773, + "learning_rate": 4.885710639374387e-06, + "loss": 1.0059, + "step": 1009 + }, + { + "epoch": 0.106222146265792, + "grad_norm": 2.7625550172439524, + "learning_rate": 4.885460978337201e-06, + "loss": 1.0592, + "step": 1010 + }, + { + "epoch": 0.10632731670763931, + "grad_norm": 2.7235208933430597, + "learning_rate": 4.885211051302586e-06, + "loss": 1.0214, + "step": 1011 + }, + { + "epoch": 0.10643248714948664, + "grad_norm": 2.7923910284329403, + "learning_rate": 4.88496085829841e-06, + "loss": 1.0122, + "step": 1012 + }, + { + "epoch": 0.10653765759133396, + "grad_norm": 4.09723932625378, + "learning_rate": 4.884710399352572e-06, + "loss": 1.0252, + "step": 1013 + }, + { + "epoch": 0.10664282803318127, + "grad_norm": 4.277407726951024, + "learning_rate": 4.884459674492997e-06, + "loss": 1.0445, + "step": 1014 + }, + { + "epoch": 0.1067479984750286, + "grad_norm": 2.741940824883468, + "learning_rate": 4.884208683747647e-06, + "loss": 1.0452, + "step": 1015 + }, + { + "epoch": 0.1068531689168759, + "grad_norm": 2.846019968822997, + "learning_rate": 4.883957427144507e-06, + "loss": 1.0386, + "step": 1016 + }, + { + "epoch": 0.10695833935872323, + "grad_norm": 3.0668518235856776, + "learning_rate": 4.8837059047115955e-06, + "loss": 1.073, + "step": 1017 + }, + { + "epoch": 0.10706350980057056, + "grad_norm": 2.7044021207813835, + "learning_rate": 4.883454116476957e-06, + "loss": 1.025, + "step": 1018 + }, + { + "epoch": 0.10716868024241787, + "grad_norm": 2.9884834738601533, + "learning_rate": 4.88320206246867e-06, + "loss": 1.0362, + "step": 1019 + }, + { + "epoch": 0.10727385068426519, + "grad_norm": 2.6459541792130823, + "learning_rate": 4.88294974271484e-06, + "loss": 1.035, + "step": 1020 + }, + { + "epoch": 0.1073790211261125, + "grad_norm": 3.296712064083779, + "learning_rate": 4.882697157243601e-06, + "loss": 1.0486, + "step": 1021 + }, + { + "epoch": 0.10748419156795982, + "grad_norm": 3.5876433786961885, + "learning_rate": 
4.882444306083121e-06, + "loss": 1.0428, + "step": 1022 + }, + { + "epoch": 0.10758936200980715, + "grad_norm": 2.4991241646439426, + "learning_rate": 4.882191189261592e-06, + "loss": 1.0133, + "step": 1023 + }, + { + "epoch": 0.10769453245165446, + "grad_norm": 3.07812377252626, + "learning_rate": 4.881937806807241e-06, + "loss": 1.0337, + "step": 1024 + }, + { + "epoch": 0.10779970289350178, + "grad_norm": 2.229456998154068, + "learning_rate": 4.881684158748321e-06, + "loss": 1.0208, + "step": 1025 + }, + { + "epoch": 0.1079048733353491, + "grad_norm": 3.7922511732147144, + "learning_rate": 4.881430245113115e-06, + "loss": 1.0326, + "step": 1026 + }, + { + "epoch": 0.10801004377719642, + "grad_norm": 3.1647389772945695, + "learning_rate": 4.881176065929938e-06, + "loss": 1.0131, + "step": 1027 + }, + { + "epoch": 0.10811521421904374, + "grad_norm": 2.8627766739221787, + "learning_rate": 4.880921621227131e-06, + "loss": 1.024, + "step": 1028 + }, + { + "epoch": 0.10822038466089105, + "grad_norm": 2.5687462636056035, + "learning_rate": 4.880666911033068e-06, + "loss": 1.0409, + "step": 1029 + }, + { + "epoch": 0.10832555510273838, + "grad_norm": 2.916734582685727, + "learning_rate": 4.880411935376151e-06, + "loss": 1.0788, + "step": 1030 + }, + { + "epoch": 0.10843072554458569, + "grad_norm": 4.000486986234842, + "learning_rate": 4.880156694284811e-06, + "loss": 1.0599, + "step": 1031 + }, + { + "epoch": 0.10853589598643301, + "grad_norm": 3.5425566453452295, + "learning_rate": 4.87990118778751e-06, + "loss": 1.0306, + "step": 1032 + }, + { + "epoch": 0.10864106642828034, + "grad_norm": 3.259354111862751, + "learning_rate": 4.879645415912739e-06, + "loss": 1.0667, + "step": 1033 + }, + { + "epoch": 0.10874623687012765, + "grad_norm": 3.7603711436052585, + "learning_rate": 4.8793893786890186e-06, + "loss": 1.0272, + "step": 1034 + }, + { + "epoch": 0.10885140731197497, + "grad_norm": 2.650644535850584, + "learning_rate": 4.879133076144898e-06, + "loss": 1.0186, + "step": 1035 + }, + { + "epoch": 0.1089565777538223, + "grad_norm": 3.152266769786039, + "learning_rate": 4.8788765083089586e-06, + "loss": 1.0406, + "step": 1036 + }, + { + "epoch": 0.1090617481956696, + "grad_norm": 3.240897604655947, + "learning_rate": 4.878619675209809e-06, + "loss": 1.0398, + "step": 1037 + }, + { + "epoch": 0.10916691863751693, + "grad_norm": 2.145650055329262, + "learning_rate": 4.8783625768760865e-06, + "loss": 1.0149, + "step": 1038 + }, + { + "epoch": 0.10927208907936424, + "grad_norm": 2.495958954073448, + "learning_rate": 4.878105213336462e-06, + "loss": 1.0143, + "step": 1039 + }, + { + "epoch": 0.10937725952121156, + "grad_norm": 2.669064484615587, + "learning_rate": 4.877847584619632e-06, + "loss": 1.0499, + "step": 1040 + }, + { + "epoch": 0.10948242996305889, + "grad_norm": 2.877634072763941, + "learning_rate": 4.8775896907543245e-06, + "loss": 1.0111, + "step": 1041 + }, + { + "epoch": 0.1095876004049062, + "grad_norm": 2.928402468643089, + "learning_rate": 4.877331531769297e-06, + "loss": 1.0233, + "step": 1042 + }, + { + "epoch": 0.10969277084675352, + "grad_norm": 3.1475271796192756, + "learning_rate": 4.877073107693336e-06, + "loss": 1.0445, + "step": 1043 + }, + { + "epoch": 0.10979794128860083, + "grad_norm": 2.7416601116581, + "learning_rate": 4.876814418555257e-06, + "loss": 1.0513, + "step": 1044 + }, + { + "epoch": 0.10990311173044816, + "grad_norm": 2.468594567435823, + "learning_rate": 4.876555464383908e-06, + "loss": 1.0396, + "step": 1045 + }, + { + "epoch": 0.11000828217229548, + 
"grad_norm": 3.7894959288974306, + "learning_rate": 4.876296245208162e-06, + "loss": 1.0068, + "step": 1046 + }, + { + "epoch": 0.11011345261414279, + "grad_norm": 4.315993647092533, + "learning_rate": 4.876036761056925e-06, + "loss": 1.0244, + "step": 1047 + }, + { + "epoch": 0.11021862305599012, + "grad_norm": 2.7986247997518605, + "learning_rate": 4.875777011959131e-06, + "loss": 1.0252, + "step": 1048 + }, + { + "epoch": 0.11032379349783743, + "grad_norm": 2.872248737665653, + "learning_rate": 4.875516997943745e-06, + "loss": 1.0191, + "step": 1049 + }, + { + "epoch": 0.11042896393968475, + "grad_norm": 3.51973767363257, + "learning_rate": 4.8752567190397605e-06, + "loss": 1.0226, + "step": 1050 + }, + { + "epoch": 0.11053413438153208, + "grad_norm": 2.793611486570461, + "learning_rate": 4.874996175276199e-06, + "loss": 1.0259, + "step": 1051 + }, + { + "epoch": 0.11063930482337939, + "grad_norm": 3.5618775194605243, + "learning_rate": 4.8747353666821155e-06, + "loss": 1.0744, + "step": 1052 + }, + { + "epoch": 0.11074447526522671, + "grad_norm": 3.7992122350452693, + "learning_rate": 4.8744742932865905e-06, + "loss": 1.0727, + "step": 1053 + }, + { + "epoch": 0.11084964570707402, + "grad_norm": 2.1766727817716065, + "learning_rate": 4.874212955118736e-06, + "loss": 1.0345, + "step": 1054 + }, + { + "epoch": 0.11095481614892135, + "grad_norm": 2.988093572670979, + "learning_rate": 4.873951352207694e-06, + "loss": 0.9793, + "step": 1055 + }, + { + "epoch": 0.11105998659076867, + "grad_norm": 3.100468284349954, + "learning_rate": 4.873689484582634e-06, + "loss": 1.1039, + "step": 1056 + }, + { + "epoch": 0.11116515703261598, + "grad_norm": 2.234883911073863, + "learning_rate": 4.873427352272758e-06, + "loss": 1.0484, + "step": 1057 + }, + { + "epoch": 0.1112703274744633, + "grad_norm": 3.034027361644701, + "learning_rate": 4.8731649553072945e-06, + "loss": 1.0477, + "step": 1058 + }, + { + "epoch": 0.11137549791631061, + "grad_norm": 2.7701642330357976, + "learning_rate": 4.872902293715502e-06, + "loss": 0.9995, + "step": 1059 + }, + { + "epoch": 0.11148066835815794, + "grad_norm": 3.5596220939294545, + "learning_rate": 4.872639367526672e-06, + "loss": 1.0595, + "step": 1060 + }, + { + "epoch": 0.11158583880000526, + "grad_norm": 2.4396549311475755, + "learning_rate": 4.872376176770121e-06, + "loss": 1.0638, + "step": 1061 + }, + { + "epoch": 0.11169100924185257, + "grad_norm": 2.7581484936556913, + "learning_rate": 4.872112721475196e-06, + "loss": 1.0511, + "step": 1062 + }, + { + "epoch": 0.1117961796836999, + "grad_norm": 3.59528480669715, + "learning_rate": 4.871849001671276e-06, + "loss": 1.0209, + "step": 1063 + }, + { + "epoch": 0.11190135012554721, + "grad_norm": 3.0507361752299187, + "learning_rate": 4.871585017387767e-06, + "loss": 1.0476, + "step": 1064 + }, + { + "epoch": 0.11200652056739453, + "grad_norm": 3.6738695742416176, + "learning_rate": 4.871320768654105e-06, + "loss": 1.0659, + "step": 1065 + }, + { + "epoch": 0.11211169100924186, + "grad_norm": 3.8253088021331867, + "learning_rate": 4.871056255499758e-06, + "loss": 0.9856, + "step": 1066 + }, + { + "epoch": 0.11221686145108917, + "grad_norm": 3.1580111988558994, + "learning_rate": 4.870791477954218e-06, + "loss": 1.0082, + "step": 1067 + }, + { + "epoch": 0.11232203189293649, + "grad_norm": 3.0599335655636146, + "learning_rate": 4.87052643604701e-06, + "loss": 1.0662, + "step": 1068 + }, + { + "epoch": 0.11242720233478382, + "grad_norm": 2.2702607865189224, + "learning_rate": 4.870261129807692e-06, + "loss": 
1.0522, + "step": 1069 + }, + { + "epoch": 0.11253237277663113, + "grad_norm": 2.6723836637796845, + "learning_rate": 4.869995559265844e-06, + "loss": 1.0286, + "step": 1070 + }, + { + "epoch": 0.11263754321847845, + "grad_norm": 4.21492156394967, + "learning_rate": 4.869729724451081e-06, + "loss": 1.0665, + "step": 1071 + }, + { + "epoch": 0.11274271366032576, + "grad_norm": 2.8588027598894703, + "learning_rate": 4.869463625393044e-06, + "loss": 1.0236, + "step": 1072 + }, + { + "epoch": 0.11284788410217308, + "grad_norm": 3.3016899697968882, + "learning_rate": 4.869197262121406e-06, + "loss": 1.0559, + "step": 1073 + }, + { + "epoch": 0.11295305454402041, + "grad_norm": 3.419706325986653, + "learning_rate": 4.8689306346658704e-06, + "loss": 1.011, + "step": 1074 + }, + { + "epoch": 0.11305822498586772, + "grad_norm": 2.0613879638549, + "learning_rate": 4.868663743056165e-06, + "loss": 0.9756, + "step": 1075 + }, + { + "epoch": 0.11316339542771504, + "grad_norm": 3.333749579337557, + "learning_rate": 4.868396587322053e-06, + "loss": 1.0821, + "step": 1076 + }, + { + "epoch": 0.11326856586956235, + "grad_norm": 3.0234091587963032, + "learning_rate": 4.868129167493322e-06, + "loss": 1.0416, + "step": 1077 + }, + { + "epoch": 0.11337373631140968, + "grad_norm": 3.1722297477760466, + "learning_rate": 4.867861483599793e-06, + "loss": 1.049, + "step": 1078 + }, + { + "epoch": 0.113478906753257, + "grad_norm": 3.123942723713283, + "learning_rate": 4.867593535671315e-06, + "loss": 1.0038, + "step": 1079 + }, + { + "epoch": 0.11358407719510431, + "grad_norm": 3.3849552815537054, + "learning_rate": 4.867325323737765e-06, + "loss": 0.9871, + "step": 1080 + }, + { + "epoch": 0.11368924763695164, + "grad_norm": 3.6000442180720498, + "learning_rate": 4.8670568478290515e-06, + "loss": 1.0303, + "step": 1081 + }, + { + "epoch": 0.11379441807879895, + "grad_norm": 3.0065860923788676, + "learning_rate": 4.866788107975111e-06, + "loss": 1.0284, + "step": 1082 + }, + { + "epoch": 0.11389958852064627, + "grad_norm": 3.307728595187707, + "learning_rate": 4.866519104205911e-06, + "loss": 1.0484, + "step": 1083 + }, + { + "epoch": 0.1140047589624936, + "grad_norm": 2.9561692789631078, + "learning_rate": 4.866249836551447e-06, + "loss": 1.012, + "step": 1084 + }, + { + "epoch": 0.1141099294043409, + "grad_norm": 2.5709168983222503, + "learning_rate": 4.865980305041746e-06, + "loss": 1.024, + "step": 1085 + }, + { + "epoch": 0.11421509984618823, + "grad_norm": 2.7856609101318104, + "learning_rate": 4.865710509706859e-06, + "loss": 1.0148, + "step": 1086 + }, + { + "epoch": 0.11432027028803554, + "grad_norm": 4.12784259860208, + "learning_rate": 4.8654404505768735e-06, + "loss": 1.0658, + "step": 1087 + }, + { + "epoch": 0.11442544072988287, + "grad_norm": 3.9354921027135186, + "learning_rate": 4.865170127681903e-06, + "loss": 1.0093, + "step": 1088 + }, + { + "epoch": 0.11453061117173019, + "grad_norm": 4.361516591990753, + "learning_rate": 4.8648995410520905e-06, + "loss": 1.0849, + "step": 1089 + }, + { + "epoch": 0.1146357816135775, + "grad_norm": 3.874224839358617, + "learning_rate": 4.864628690717607e-06, + "loss": 1.0004, + "step": 1090 + }, + { + "epoch": 0.11474095205542482, + "grad_norm": 2.713904318986021, + "learning_rate": 4.8643575767086555e-06, + "loss": 0.9988, + "step": 1091 + }, + { + "epoch": 0.11484612249727214, + "grad_norm": 4.127820079366728, + "learning_rate": 4.864086199055467e-06, + "loss": 1.0265, + "step": 1092 + }, + { + "epoch": 0.11495129293911946, + "grad_norm": 4.16373977878337, + 
"learning_rate": 4.863814557788303e-06, + "loss": 1.0445, + "step": 1093 + }, + { + "epoch": 0.11505646338096678, + "grad_norm": 4.340273179649079, + "learning_rate": 4.863542652937453e-06, + "loss": 1.0214, + "step": 1094 + }, + { + "epoch": 0.1151616338228141, + "grad_norm": 3.5979531619959593, + "learning_rate": 4.863270484533237e-06, + "loss": 0.9954, + "step": 1095 + }, + { + "epoch": 0.11526680426466142, + "grad_norm": 3.7563254164355624, + "learning_rate": 4.862998052606001e-06, + "loss": 1.0446, + "step": 1096 + }, + { + "epoch": 0.11537197470650874, + "grad_norm": 2.8311030399401904, + "learning_rate": 4.862725357186129e-06, + "loss": 0.9762, + "step": 1097 + }, + { + "epoch": 0.11547714514835605, + "grad_norm": 3.26148888613775, + "learning_rate": 4.862452398304024e-06, + "loss": 1.0667, + "step": 1098 + }, + { + "epoch": 0.11558231559020338, + "grad_norm": 2.2978414859803964, + "learning_rate": 4.862179175990124e-06, + "loss": 0.9832, + "step": 1099 + }, + { + "epoch": 0.11568748603205069, + "grad_norm": 2.8821374960941304, + "learning_rate": 4.861905690274896e-06, + "loss": 1.0225, + "step": 1100 + }, + { + "epoch": 0.11579265647389801, + "grad_norm": 2.657398753259678, + "learning_rate": 4.861631941188836e-06, + "loss": 1.036, + "step": 1101 + }, + { + "epoch": 0.11589782691574534, + "grad_norm": 1.7911673644478512, + "learning_rate": 4.861357928762468e-06, + "loss": 1.0355, + "step": 1102 + }, + { + "epoch": 0.11600299735759265, + "grad_norm": 3.1971189200731316, + "learning_rate": 4.8610836530263485e-06, + "loss": 1.0348, + "step": 1103 + }, + { + "epoch": 0.11610816779943997, + "grad_norm": 3.9670686015745438, + "learning_rate": 4.860809114011059e-06, + "loss": 1.0403, + "step": 1104 + }, + { + "epoch": 0.11621333824128728, + "grad_norm": 3.4009369343225426, + "learning_rate": 4.860534311747215e-06, + "loss": 1.0572, + "step": 1105 + }, + { + "epoch": 0.1163185086831346, + "grad_norm": 4.1414177645044745, + "learning_rate": 4.860259246265456e-06, + "loss": 1.0702, + "step": 1106 + }, + { + "epoch": 0.11642367912498193, + "grad_norm": 4.123107172342563, + "learning_rate": 4.859983917596458e-06, + "loss": 1.0605, + "step": 1107 + }, + { + "epoch": 0.11652884956682924, + "grad_norm": 3.969640922287132, + "learning_rate": 4.859708325770919e-06, + "loss": 1.008, + "step": 1108 + }, + { + "epoch": 0.11663402000867656, + "grad_norm": 2.90888044600641, + "learning_rate": 4.859432470819572e-06, + "loss": 1.0459, + "step": 1109 + }, + { + "epoch": 0.11673919045052387, + "grad_norm": 2.831787199338218, + "learning_rate": 4.859156352773174e-06, + "loss": 1.0261, + "step": 1110 + }, + { + "epoch": 0.1168443608923712, + "grad_norm": 3.987477080469216, + "learning_rate": 4.858879971662518e-06, + "loss": 1.0415, + "step": 1111 + }, + { + "epoch": 0.11694953133421852, + "grad_norm": 3.9586675261870274, + "learning_rate": 4.85860332751842e-06, + "loss": 1.0675, + "step": 1112 + }, + { + "epoch": 0.11705470177606583, + "grad_norm": 2.6528061023001386, + "learning_rate": 4.858326420371728e-06, + "loss": 1.0326, + "step": 1113 + }, + { + "epoch": 0.11715987221791316, + "grad_norm": 3.2831047642667297, + "learning_rate": 4.858049250253321e-06, + "loss": 1.0562, + "step": 1114 + }, + { + "epoch": 0.11726504265976047, + "grad_norm": 3.144259175603822, + "learning_rate": 4.8577718171941036e-06, + "loss": 1.0119, + "step": 1115 + }, + { + "epoch": 0.11737021310160779, + "grad_norm": 3.6872799055696244, + "learning_rate": 4.857494121225014e-06, + "loss": 1.0456, + "step": 1116 + }, + { + "epoch": 
0.11747538354345512, + "grad_norm": 3.1976523712311273, + "learning_rate": 4.857216162377015e-06, + "loss": 1.0582, + "step": 1117 + }, + { + "epoch": 0.11758055398530243, + "grad_norm": 2.9008275495280342, + "learning_rate": 4.8569379406811034e-06, + "loss": 1.0335, + "step": 1118 + }, + { + "epoch": 0.11768572442714975, + "grad_norm": 3.5671660679987265, + "learning_rate": 4.856659456168301e-06, + "loss": 1.0251, + "step": 1119 + }, + { + "epoch": 0.11779089486899706, + "grad_norm": 3.9412822209537426, + "learning_rate": 4.856380708869663e-06, + "loss": 1.0671, + "step": 1120 + }, + { + "epoch": 0.11789606531084439, + "grad_norm": 3.191017359982901, + "learning_rate": 4.85610169881627e-06, + "loss": 1.0434, + "step": 1121 + }, + { + "epoch": 0.11800123575269171, + "grad_norm": 3.195874922814993, + "learning_rate": 4.855822426039236e-06, + "loss": 0.9924, + "step": 1122 + }, + { + "epoch": 0.11810640619453902, + "grad_norm": 2.751515168083024, + "learning_rate": 4.855542890569701e-06, + "loss": 1.0373, + "step": 1123 + }, + { + "epoch": 0.11821157663638635, + "grad_norm": 2.896579359878487, + "learning_rate": 4.855263092438834e-06, + "loss": 1.0463, + "step": 1124 + }, + { + "epoch": 0.11831674707823366, + "grad_norm": 3.367419965470842, + "learning_rate": 4.8549830316778365e-06, + "loss": 1.0156, + "step": 1125 + }, + { + "epoch": 0.11842191752008098, + "grad_norm": 2.2007559598381357, + "learning_rate": 4.854702708317937e-06, + "loss": 1.0342, + "step": 1126 + }, + { + "epoch": 0.1185270879619283, + "grad_norm": 2.729565964737428, + "learning_rate": 4.8544221223903925e-06, + "loss": 1.0192, + "step": 1127 + }, + { + "epoch": 0.11863225840377561, + "grad_norm": 4.234177007738469, + "learning_rate": 4.854141273926492e-06, + "loss": 0.9931, + "step": 1128 + }, + { + "epoch": 0.11873742884562294, + "grad_norm": 4.1159850072468025, + "learning_rate": 4.8538601629575525e-06, + "loss": 1.048, + "step": 1129 + }, + { + "epoch": 0.11884259928747026, + "grad_norm": 3.030352704362315, + "learning_rate": 4.8535787895149186e-06, + "loss": 1.0481, + "step": 1130 + }, + { + "epoch": 0.11894776972931757, + "grad_norm": 2.416745852348571, + "learning_rate": 4.853297153629967e-06, + "loss": 1.0426, + "step": 1131 + }, + { + "epoch": 0.1190529401711649, + "grad_norm": 2.953961483423199, + "learning_rate": 4.853015255334101e-06, + "loss": 1.0233, + "step": 1132 + }, + { + "epoch": 0.11915811061301221, + "grad_norm": 3.1479898039953103, + "learning_rate": 4.852733094658756e-06, + "loss": 1.0074, + "step": 1133 + }, + { + "epoch": 0.11926328105485953, + "grad_norm": 2.0774548323641397, + "learning_rate": 4.852450671635395e-06, + "loss": 1.01, + "step": 1134 + }, + { + "epoch": 0.11936845149670686, + "grad_norm": 2.51715458277697, + "learning_rate": 4.852167986295508e-06, + "loss": 1.0401, + "step": 1135 + }, + { + "epoch": 0.11947362193855417, + "grad_norm": 3.6879308333975587, + "learning_rate": 4.851885038670618e-06, + "loss": 1.0577, + "step": 1136 + }, + { + "epoch": 0.11957879238040149, + "grad_norm": 3.27065440253345, + "learning_rate": 4.851601828792278e-06, + "loss": 1.0163, + "step": 1137 + }, + { + "epoch": 0.1196839628222488, + "grad_norm": 2.2130774677775853, + "learning_rate": 4.8513183566920654e-06, + "loss": 1.0543, + "step": 1138 + }, + { + "epoch": 0.11978913326409613, + "grad_norm": 2.8314329893329093, + "learning_rate": 4.8510346224015896e-06, + "loss": 1.0796, + "step": 1139 + }, + { + "epoch": 0.11989430370594345, + "grad_norm": 3.7243358852096953, + "learning_rate": 
4.85075062595249e-06, + "loss": 1.0379, + "step": 1140 + }, + { + "epoch": 0.11999947414779076, + "grad_norm": 3.7144900747943623, + "learning_rate": 4.850466367376435e-06, + "loss": 1.0259, + "step": 1141 + }, + { + "epoch": 0.12010464458963808, + "grad_norm": 3.231616766464792, + "learning_rate": 4.850181846705121e-06, + "loss": 1.0482, + "step": 1142 + }, + { + "epoch": 0.1202098150314854, + "grad_norm": 3.3082034220519416, + "learning_rate": 4.8498970639702745e-06, + "loss": 1.0284, + "step": 1143 + }, + { + "epoch": 0.12031498547333272, + "grad_norm": 2.9443493452344183, + "learning_rate": 4.84961201920365e-06, + "loss": 1.0511, + "step": 1144 + }, + { + "epoch": 0.12042015591518004, + "grad_norm": 3.196336464928972, + "learning_rate": 4.849326712437033e-06, + "loss": 1.0723, + "step": 1145 + }, + { + "epoch": 0.12052532635702735, + "grad_norm": 2.71928604807591, + "learning_rate": 4.849041143702238e-06, + "loss": 1.0112, + "step": 1146 + }, + { + "epoch": 0.12063049679887468, + "grad_norm": 3.533369997436239, + "learning_rate": 4.8487553130311065e-06, + "loss": 0.9867, + "step": 1147 + }, + { + "epoch": 0.12073566724072199, + "grad_norm": 2.455468182286396, + "learning_rate": 4.848469220455512e-06, + "loss": 0.9962, + "step": 1148 + }, + { + "epoch": 0.12084083768256931, + "grad_norm": 2.3850247504745465, + "learning_rate": 4.848182866007356e-06, + "loss": 1.0686, + "step": 1149 + }, + { + "epoch": 0.12094600812441664, + "grad_norm": 2.5387686707247843, + "learning_rate": 4.84789624971857e-06, + "loss": 1.0425, + "step": 1150 + }, + { + "epoch": 0.12105117856626395, + "grad_norm": 3.482360328905752, + "learning_rate": 4.8476093716211125e-06, + "loss": 1.0271, + "step": 1151 + }, + { + "epoch": 0.12115634900811127, + "grad_norm": 3.6287895041841702, + "learning_rate": 4.847322231746973e-06, + "loss": 1.0491, + "step": 1152 + }, + { + "epoch": 0.12126151944995858, + "grad_norm": 2.7406507151068404, + "learning_rate": 4.84703483012817e-06, + "loss": 1.0579, + "step": 1153 + }, + { + "epoch": 0.1213666898918059, + "grad_norm": 2.093402317749306, + "learning_rate": 4.846747166796751e-06, + "loss": 1.0114, + "step": 1154 + }, + { + "epoch": 0.12147186033365323, + "grad_norm": 2.7833406137397896, + "learning_rate": 4.846459241784793e-06, + "loss": 1.0164, + "step": 1155 + }, + { + "epoch": 0.12157703077550054, + "grad_norm": 3.5971545223060626, + "learning_rate": 4.846171055124401e-06, + "loss": 1.0388, + "step": 1156 + }, + { + "epoch": 0.12168220121734787, + "grad_norm": 4.388318975491837, + "learning_rate": 4.845882606847712e-06, + "loss": 1.0544, + "step": 1157 + }, + { + "epoch": 0.12178737165919519, + "grad_norm": 3.471550657627086, + "learning_rate": 4.845593896986888e-06, + "loss": 1.0936, + "step": 1158 + }, + { + "epoch": 0.1218925421010425, + "grad_norm": 3.0123013262369924, + "learning_rate": 4.845304925574122e-06, + "loss": 0.9964, + "step": 1159 + }, + { + "epoch": 0.12199771254288982, + "grad_norm": 2.2624703252236453, + "learning_rate": 4.84501569264164e-06, + "loss": 1.0276, + "step": 1160 + }, + { + "epoch": 0.12210288298473713, + "grad_norm": 2.675714197722898, + "learning_rate": 4.8447261982216905e-06, + "loss": 0.9954, + "step": 1161 + }, + { + "epoch": 0.12220805342658446, + "grad_norm": 4.13088015952959, + "learning_rate": 4.8444364423465555e-06, + "loss": 0.9864, + "step": 1162 + }, + { + "epoch": 0.12231322386843178, + "grad_norm": 4.229159717585945, + "learning_rate": 4.844146425048545e-06, + "loss": 1.0392, + "step": 1163 + }, + { + "epoch": 0.1224183943102791, + 
"grad_norm": 3.419828767103554, + "learning_rate": 4.843856146359999e-06, + "loss": 1.0395, + "step": 1164 + }, + { + "epoch": 0.12252356475212642, + "grad_norm": 4.317347247396286, + "learning_rate": 4.843565606313283e-06, + "loss": 1.0242, + "step": 1165 + }, + { + "epoch": 0.12262873519397373, + "grad_norm": 3.754270500912085, + "learning_rate": 4.843274804940798e-06, + "loss": 1.052, + "step": 1166 + }, + { + "epoch": 0.12273390563582105, + "grad_norm": 2.8312910701769294, + "learning_rate": 4.8429837422749695e-06, + "loss": 1.0404, + "step": 1167 + }, + { + "epoch": 0.12283907607766838, + "grad_norm": 3.823314861494254, + "learning_rate": 4.842692418348251e-06, + "loss": 1.0094, + "step": 1168 + }, + { + "epoch": 0.12294424651951569, + "grad_norm": 3.9925424351563095, + "learning_rate": 4.842400833193131e-06, + "loss": 1.0336, + "step": 1169 + }, + { + "epoch": 0.12304941696136301, + "grad_norm": 4.5156463343872755, + "learning_rate": 4.84210898684212e-06, + "loss": 1.0294, + "step": 1170 + }, + { + "epoch": 0.12315458740321032, + "grad_norm": 4.134735379560587, + "learning_rate": 4.841816879327764e-06, + "loss": 1.0681, + "step": 1171 + }, + { + "epoch": 0.12325975784505765, + "grad_norm": 3.5718684762008586, + "learning_rate": 4.8415245106826335e-06, + "loss": 1.0457, + "step": 1172 + }, + { + "epoch": 0.12336492828690497, + "grad_norm": 2.8524025783294835, + "learning_rate": 4.841231880939331e-06, + "loss": 1.051, + "step": 1173 + }, + { + "epoch": 0.12347009872875228, + "grad_norm": 2.1112242358528794, + "learning_rate": 4.840938990130486e-06, + "loss": 0.9749, + "step": 1174 + }, + { + "epoch": 0.1235752691705996, + "grad_norm": 3.489044644429071, + "learning_rate": 4.840645838288759e-06, + "loss": 1.0839, + "step": 1175 + }, + { + "epoch": 0.12368043961244692, + "grad_norm": 3.7851298708992007, + "learning_rate": 4.840352425446838e-06, + "loss": 1.0008, + "step": 1176 + }, + { + "epoch": 0.12378561005429424, + "grad_norm": 3.534928614423682, + "learning_rate": 4.840058751637441e-06, + "loss": 0.9969, + "step": 1177 + }, + { + "epoch": 0.12389078049614156, + "grad_norm": 3.866088509887153, + "learning_rate": 4.839764816893315e-06, + "loss": 1.0296, + "step": 1178 + }, + { + "epoch": 0.12399595093798887, + "grad_norm": 3.8493227086097836, + "learning_rate": 4.839470621247235e-06, + "loss": 0.9829, + "step": 1179 + }, + { + "epoch": 0.1241011213798362, + "grad_norm": 3.863296748292079, + "learning_rate": 4.839176164732009e-06, + "loss": 1.0043, + "step": 1180 + }, + { + "epoch": 0.12420629182168351, + "grad_norm": 3.044556390384978, + "learning_rate": 4.838881447380468e-06, + "loss": 1.0635, + "step": 1181 + }, + { + "epoch": 0.12431146226353083, + "grad_norm": 3.036859307808052, + "learning_rate": 4.838586469225477e-06, + "loss": 1.068, + "step": 1182 + }, + { + "epoch": 0.12441663270537816, + "grad_norm": 3.4852695563768936, + "learning_rate": 4.838291230299927e-06, + "loss": 1.0264, + "step": 1183 + }, + { + "epoch": 0.12452180314722547, + "grad_norm": 3.541328436650173, + "learning_rate": 4.837995730636742e-06, + "loss": 1.0692, + "step": 1184 + }, + { + "epoch": 0.12462697358907279, + "grad_norm": 2.403850262061819, + "learning_rate": 4.83769997026887e-06, + "loss": 1.0131, + "step": 1185 + }, + { + "epoch": 0.12473214403092012, + "grad_norm": 2.0179723281866204, + "learning_rate": 4.837403949229291e-06, + "loss": 1.033, + "step": 1186 + }, + { + "epoch": 0.12483731447276743, + "grad_norm": 3.288715529975734, + "learning_rate": 4.837107667551015e-06, + "loss": 1.0143, + 
"step": 1187 + }, + { + "epoch": 0.12494248491461475, + "grad_norm": 2.488652116770584, + "learning_rate": 4.8368111252670776e-06, + "loss": 1.0064, + "step": 1188 + }, + { + "epoch": 0.12504765535646206, + "grad_norm": 2.099768655554014, + "learning_rate": 4.836514322410548e-06, + "loss": 1.0311, + "step": 1189 + }, + { + "epoch": 0.1251528257983094, + "grad_norm": 2.61507115515271, + "learning_rate": 4.83621725901452e-06, + "loss": 1.0037, + "step": 1190 + }, + { + "epoch": 0.1252579962401567, + "grad_norm": 3.5627001839530887, + "learning_rate": 4.83591993511212e-06, + "loss": 1.0104, + "step": 1191 + }, + { + "epoch": 0.12536316668200403, + "grad_norm": 2.7611171047918854, + "learning_rate": 4.835622350736499e-06, + "loss": 1.0015, + "step": 1192 + }, + { + "epoch": 0.12546833712385133, + "grad_norm": 2.6167789911665333, + "learning_rate": 4.835324505920845e-06, + "loss": 1.0208, + "step": 1193 + }, + { + "epoch": 0.12557350756569866, + "grad_norm": 3.3090032658379624, + "learning_rate": 4.835026400698366e-06, + "loss": 1.0339, + "step": 1194 + }, + { + "epoch": 0.12567867800754598, + "grad_norm": 2.7592196627213172, + "learning_rate": 4.8347280351023044e-06, + "loss": 1.021, + "step": 1195 + }, + { + "epoch": 0.1257838484493933, + "grad_norm": 3.529997579629742, + "learning_rate": 4.83442940916593e-06, + "loss": 1.0269, + "step": 1196 + }, + { + "epoch": 0.12588901889124063, + "grad_norm": 3.0771754582486324, + "learning_rate": 4.834130522922541e-06, + "loss": 1.0385, + "step": 1197 + }, + { + "epoch": 0.12599418933308792, + "grad_norm": 3.019098612661189, + "learning_rate": 4.8338313764054676e-06, + "loss": 1.0067, + "step": 1198 + }, + { + "epoch": 0.12609935977493525, + "grad_norm": 3.1455190399414774, + "learning_rate": 4.8335319696480655e-06, + "loss": 0.9901, + "step": 1199 + }, + { + "epoch": 0.12620453021678257, + "grad_norm": 2.815592993177342, + "learning_rate": 4.833232302683721e-06, + "loss": 1.0099, + "step": 1200 + }, + { + "epoch": 0.1263097006586299, + "grad_norm": 2.1662241166960974, + "learning_rate": 4.83293237554585e-06, + "loss": 1.0209, + "step": 1201 + }, + { + "epoch": 0.12641487110047722, + "grad_norm": 2.5510337259609717, + "learning_rate": 4.832632188267896e-06, + "loss": 1.0417, + "step": 1202 + }, + { + "epoch": 0.12652004154232452, + "grad_norm": 2.7666819520333146, + "learning_rate": 4.832331740883333e-06, + "loss": 1.0352, + "step": 1203 + }, + { + "epoch": 0.12662521198417184, + "grad_norm": 3.0449829988980937, + "learning_rate": 4.832031033425663e-06, + "loss": 1.0291, + "step": 1204 + }, + { + "epoch": 0.12673038242601917, + "grad_norm": 2.6400119751853564, + "learning_rate": 4.831730065928416e-06, + "loss": 1.0378, + "step": 1205 + }, + { + "epoch": 0.1268355528678665, + "grad_norm": 2.94538447937034, + "learning_rate": 4.831428838425153e-06, + "loss": 1.0311, + "step": 1206 + }, + { + "epoch": 0.12694072330971382, + "grad_norm": 2.3594393856071614, + "learning_rate": 4.8311273509494635e-06, + "loss": 1.0124, + "step": 1207 + }, + { + "epoch": 0.1270458937515611, + "grad_norm": 2.153915292661592, + "learning_rate": 4.830825603534967e-06, + "loss": 1.0277, + "step": 1208 + }, + { + "epoch": 0.12715106419340844, + "grad_norm": 3.456220143305514, + "learning_rate": 4.8305235962153075e-06, + "loss": 1.0868, + "step": 1209 + }, + { + "epoch": 0.12725623463525576, + "grad_norm": 2.7043626503920337, + "learning_rate": 4.830221329024163e-06, + "loss": 1.0537, + "step": 1210 + }, + { + "epoch": 0.12736140507710308, + "grad_norm": 3.589165033013744, + 
"learning_rate": 4.829918801995239e-06, + "loss": 1.0796, + "step": 1211 + }, + { + "epoch": 0.1274665755189504, + "grad_norm": 3.373260416606364, + "learning_rate": 4.829616015162269e-06, + "loss": 1.0191, + "step": 1212 + }, + { + "epoch": 0.1275717459607977, + "grad_norm": 3.5871990636482796, + "learning_rate": 4.8293129685590164e-06, + "loss": 1.0407, + "step": 1213 + }, + { + "epoch": 0.12767691640264503, + "grad_norm": 4.2127544793207745, + "learning_rate": 4.8290096622192735e-06, + "loss": 1.0372, + "step": 1214 + }, + { + "epoch": 0.12778208684449235, + "grad_norm": 3.511106624322245, + "learning_rate": 4.82870609617686e-06, + "loss": 1.0408, + "step": 1215 + }, + { + "epoch": 0.12788725728633968, + "grad_norm": 2.8132894727294055, + "learning_rate": 4.828402270465628e-06, + "loss": 0.9943, + "step": 1216 + }, + { + "epoch": 0.127992427728187, + "grad_norm": 2.4459083353657807, + "learning_rate": 4.828098185119454e-06, + "loss": 1.0259, + "step": 1217 + }, + { + "epoch": 0.1280975981700343, + "grad_norm": 2.3329129365329337, + "learning_rate": 4.827793840172247e-06, + "loss": 1.0269, + "step": 1218 + }, + { + "epoch": 0.12820276861188162, + "grad_norm": 2.8128127508148015, + "learning_rate": 4.827489235657944e-06, + "loss": 1.0505, + "step": 1219 + }, + { + "epoch": 0.12830793905372895, + "grad_norm": 2.339660968358773, + "learning_rate": 4.827184371610511e-06, + "loss": 1.0047, + "step": 1220 + }, + { + "epoch": 0.12841310949557627, + "grad_norm": 3.3186117188660584, + "learning_rate": 4.826879248063943e-06, + "loss": 1.0416, + "step": 1221 + }, + { + "epoch": 0.1285182799374236, + "grad_norm": 3.0496345780537077, + "learning_rate": 4.826573865052261e-06, + "loss": 1.0285, + "step": 1222 + }, + { + "epoch": 0.1286234503792709, + "grad_norm": 3.0696030887083134, + "learning_rate": 4.8262682226095215e-06, + "loss": 1.0221, + "step": 1223 + }, + { + "epoch": 0.12872862082111822, + "grad_norm": 2.911026501529926, + "learning_rate": 4.825962320769804e-06, + "loss": 0.996, + "step": 1224 + }, + { + "epoch": 0.12883379126296554, + "grad_norm": 2.6415248186701414, + "learning_rate": 4.825656159567218e-06, + "loss": 1.0077, + "step": 1225 + }, + { + "epoch": 0.12893896170481287, + "grad_norm": 3.015861478144796, + "learning_rate": 4.8253497390359035e-06, + "loss": 1.0199, + "step": 1226 + }, + { + "epoch": 0.1290441321466602, + "grad_norm": 2.968448842373254, + "learning_rate": 4.82504305921003e-06, + "loss": 1.0586, + "step": 1227 + }, + { + "epoch": 0.1291493025885075, + "grad_norm": 3.6356255518516805, + "learning_rate": 4.824736120123794e-06, + "loss": 1.0877, + "step": 1228 + }, + { + "epoch": 0.1292544730303548, + "grad_norm": 2.2178443735382616, + "learning_rate": 4.824428921811421e-06, + "loss": 1.0224, + "step": 1229 + }, + { + "epoch": 0.12935964347220213, + "grad_norm": 2.5367079837422497, + "learning_rate": 4.824121464307168e-06, + "loss": 1.0118, + "step": 1230 + }, + { + "epoch": 0.12946481391404946, + "grad_norm": 3.130091176692925, + "learning_rate": 4.823813747645315e-06, + "loss": 1.0146, + "step": 1231 + }, + { + "epoch": 0.12956998435589678, + "grad_norm": 2.4669259065457902, + "learning_rate": 4.823505771860178e-06, + "loss": 1.0262, + "step": 1232 + }, + { + "epoch": 0.1296751547977441, + "grad_norm": 2.3946588572460015, + "learning_rate": 4.823197536986098e-06, + "loss": 1.0202, + "step": 1233 + }, + { + "epoch": 0.1297803252395914, + "grad_norm": 3.28718019202575, + "learning_rate": 4.822889043057446e-06, + "loss": 0.996, + "step": 1234 + }, + { + "epoch": 
0.12988549568143873, + "grad_norm": 3.172840137269599, + "learning_rate": 4.82258029010862e-06, + "loss": 0.9892, + "step": 1235 + }, + { + "epoch": 0.12999066612328605, + "grad_norm": 3.469596633957251, + "learning_rate": 4.82227127817405e-06, + "loss": 1.0587, + "step": 1236 + }, + { + "epoch": 0.13009583656513338, + "grad_norm": 2.555342312316807, + "learning_rate": 4.821962007288191e-06, + "loss": 1.0267, + "step": 1237 + }, + { + "epoch": 0.1302010070069807, + "grad_norm": 2.5256812029218128, + "learning_rate": 4.821652477485531e-06, + "loss": 1.0289, + "step": 1238 + }, + { + "epoch": 0.130306177448828, + "grad_norm": 3.1983565580222875, + "learning_rate": 4.821342688800586e-06, + "loss": 1.0321, + "step": 1239 + }, + { + "epoch": 0.13041134789067532, + "grad_norm": 3.620474013802864, + "learning_rate": 4.821032641267897e-06, + "loss": 0.9785, + "step": 1240 + }, + { + "epoch": 0.13051651833252265, + "grad_norm": 2.661892927357515, + "learning_rate": 4.820722334922039e-06, + "loss": 1.039, + "step": 1241 + }, + { + "epoch": 0.13062168877436997, + "grad_norm": 3.34786582341296, + "learning_rate": 4.820411769797611e-06, + "loss": 1.0484, + "step": 1242 + }, + { + "epoch": 0.1307268592162173, + "grad_norm": 3.9689276628030985, + "learning_rate": 4.820100945929247e-06, + "loss": 1.0423, + "step": 1243 + }, + { + "epoch": 0.1308320296580646, + "grad_norm": 2.883967378632545, + "learning_rate": 4.819789863351603e-06, + "loss": 1.0503, + "step": 1244 + }, + { + "epoch": 0.13093720009991192, + "grad_norm": 3.105797028005016, + "learning_rate": 4.819478522099369e-06, + "loss": 1.0227, + "step": 1245 + }, + { + "epoch": 0.13104237054175924, + "grad_norm": 1.6808339529344005, + "learning_rate": 4.819166922207261e-06, + "loss": 1.0181, + "step": 1246 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 2.7847369838128326, + "learning_rate": 4.818855063710026e-06, + "loss": 1.0094, + "step": 1247 + }, + { + "epoch": 0.1312527114254539, + "grad_norm": 2.8619801152801365, + "learning_rate": 4.8185429466424375e-06, + "loss": 1.0377, + "step": 1248 + }, + { + "epoch": 0.13135788186730118, + "grad_norm": 3.00420826947291, + "learning_rate": 4.8182305710393e-06, + "loss": 1.0415, + "step": 1249 + }, + { + "epoch": 0.1314630523091485, + "grad_norm": 3.106603153157244, + "learning_rate": 4.8179179369354445e-06, + "loss": 1.0087, + "step": 1250 + }, + { + "epoch": 0.13156822275099583, + "grad_norm": 2.9567174545180936, + "learning_rate": 4.817605044365733e-06, + "loss": 1.0042, + "step": 1251 + }, + { + "epoch": 0.13167339319284316, + "grad_norm": 3.6684630626815578, + "learning_rate": 4.817291893365055e-06, + "loss": 1.0208, + "step": 1252 + }, + { + "epoch": 0.13177856363469048, + "grad_norm": 3.8333923409888406, + "learning_rate": 4.8169784839683295e-06, + "loss": 1.032, + "step": 1253 + }, + { + "epoch": 0.13188373407653778, + "grad_norm": 3.076774148237722, + "learning_rate": 4.816664816210505e-06, + "loss": 1.0345, + "step": 1254 + }, + { + "epoch": 0.1319889045183851, + "grad_norm": 2.528183859455226, + "learning_rate": 4.816350890126556e-06, + "loss": 1.0463, + "step": 1255 + }, + { + "epoch": 0.13209407496023243, + "grad_norm": 3.2080429059373765, + "learning_rate": 4.816036705751489e-06, + "loss": 1.0083, + "step": 1256 + }, + { + "epoch": 0.13219924540207975, + "grad_norm": 2.996843724364759, + "learning_rate": 4.8157222631203374e-06, + "loss": 0.9658, + "step": 1257 + }, + { + "epoch": 0.13230441584392708, + "grad_norm": 2.823099339288846, + "learning_rate": 4.815407562268165e-06, + "loss": 
1.0478, + "step": 1258 + }, + { + "epoch": 0.13240958628577437, + "grad_norm": 2.5147629342097635, + "learning_rate": 4.815092603230063e-06, + "loss": 0.9942, + "step": 1259 + }, + { + "epoch": 0.1325147567276217, + "grad_norm": 3.1482493227003316, + "learning_rate": 4.81477738604115e-06, + "loss": 0.9881, + "step": 1260 + }, + { + "epoch": 0.13261992716946902, + "grad_norm": 3.020760053243842, + "learning_rate": 4.814461910736578e-06, + "loss": 1.0364, + "step": 1261 + }, + { + "epoch": 0.13272509761131634, + "grad_norm": 2.221968445422404, + "learning_rate": 4.814146177351523e-06, + "loss": 1.0265, + "step": 1262 + }, + { + "epoch": 0.13283026805316367, + "grad_norm": 3.0176270858205565, + "learning_rate": 4.8138301859211925e-06, + "loss": 1.0519, + "step": 1263 + }, + { + "epoch": 0.13293543849501097, + "grad_norm": 2.2986812981864317, + "learning_rate": 4.813513936480821e-06, + "loss": 1.03, + "step": 1264 + }, + { + "epoch": 0.1330406089368583, + "grad_norm": 2.8952298044973057, + "learning_rate": 4.8131974290656745e-06, + "loss": 1.0342, + "step": 1265 + }, + { + "epoch": 0.13314577937870561, + "grad_norm": 2.8246589734902225, + "learning_rate": 4.812880663711045e-06, + "loss": 1.0491, + "step": 1266 + }, + { + "epoch": 0.13325094982055294, + "grad_norm": 2.5406616352440134, + "learning_rate": 4.812563640452254e-06, + "loss": 1.0069, + "step": 1267 + }, + { + "epoch": 0.13335612026240026, + "grad_norm": 2.3100739684989167, + "learning_rate": 4.812246359324653e-06, + "loss": 1.002, + "step": 1268 + }, + { + "epoch": 0.13346129070424756, + "grad_norm": 2.9322165786317997, + "learning_rate": 4.811928820363621e-06, + "loss": 1.0207, + "step": 1269 + }, + { + "epoch": 0.13356646114609488, + "grad_norm": 2.3628042524685533, + "learning_rate": 4.811611023604564e-06, + "loss": 1.0314, + "step": 1270 + }, + { + "epoch": 0.1336716315879422, + "grad_norm": 3.3682424272817895, + "learning_rate": 4.811292969082922e-06, + "loss": 0.9997, + "step": 1271 + }, + { + "epoch": 0.13377680202978953, + "grad_norm": 2.796544714713144, + "learning_rate": 4.8109746568341595e-06, + "loss": 1.0163, + "step": 1272 + }, + { + "epoch": 0.13388197247163686, + "grad_norm": 3.2927205255203997, + "learning_rate": 4.81065608689377e-06, + "loss": 1.0652, + "step": 1273 + }, + { + "epoch": 0.13398714291348415, + "grad_norm": 4.2986772325983935, + "learning_rate": 4.810337259297277e-06, + "loss": 1.0425, + "step": 1274 + }, + { + "epoch": 0.13409231335533148, + "grad_norm": 2.5363498831942675, + "learning_rate": 4.810018174080233e-06, + "loss": 1.0201, + "step": 1275 + }, + { + "epoch": 0.1341974837971788, + "grad_norm": 3.5836252312590973, + "learning_rate": 4.809698831278217e-06, + "loss": 1.0456, + "step": 1276 + }, + { + "epoch": 0.13430265423902613, + "grad_norm": 3.425235368215492, + "learning_rate": 4.80937923092684e-06, + "loss": 1.0208, + "step": 1277 + }, + { + "epoch": 0.13440782468087345, + "grad_norm": 1.9745893346472898, + "learning_rate": 4.809059373061737e-06, + "loss": 0.9924, + "step": 1278 + }, + { + "epoch": 0.13451299512272075, + "grad_norm": 3.0646315393918835, + "learning_rate": 4.808739257718578e-06, + "loss": 0.9939, + "step": 1279 + }, + { + "epoch": 0.13461816556456807, + "grad_norm": 2.479949101456566, + "learning_rate": 4.808418884933056e-06, + "loss": 1.0327, + "step": 1280 + }, + { + "epoch": 0.1347233360064154, + "grad_norm": 2.922429890182779, + "learning_rate": 4.808098254740897e-06, + "loss": 1.0177, + "step": 1281 + }, + { + "epoch": 0.13482850644826272, + "grad_norm": 
1.924186395677562, + "learning_rate": 4.807777367177851e-06, + "loss": 0.9983, + "step": 1282 + }, + { + "epoch": 0.13493367689011004, + "grad_norm": 3.935674804500333, + "learning_rate": 4.807456222279703e-06, + "loss": 1.0269, + "step": 1283 + }, + { + "epoch": 0.13503884733195734, + "grad_norm": 2.938713767635199, + "learning_rate": 4.80713482008226e-06, + "loss": 1.0239, + "step": 1284 + }, + { + "epoch": 0.13514401777380466, + "grad_norm": 3.1136862313700497, + "learning_rate": 4.806813160621362e-06, + "loss": 0.9802, + "step": 1285 + }, + { + "epoch": 0.135249188215652, + "grad_norm": 3.3108785416004722, + "learning_rate": 4.806491243932877e-06, + "loss": 1.0428, + "step": 1286 + }, + { + "epoch": 0.1353543586574993, + "grad_norm": 2.1210612289323048, + "learning_rate": 4.806169070052701e-06, + "loss": 0.9905, + "step": 1287 + }, + { + "epoch": 0.13545952909934664, + "grad_norm": 3.1193765406401797, + "learning_rate": 4.805846639016759e-06, + "loss": 1.0397, + "step": 1288 + }, + { + "epoch": 0.13556469954119393, + "grad_norm": 3.3591628388988757, + "learning_rate": 4.805523950861004e-06, + "loss": 1.0248, + "step": 1289 + }, + { + "epoch": 0.13566986998304126, + "grad_norm": 2.4188726077223746, + "learning_rate": 4.8052010056214184e-06, + "loss": 1.0516, + "step": 1290 + }, + { + "epoch": 0.13577504042488858, + "grad_norm": 3.4651111802218093, + "learning_rate": 4.804877803334013e-06, + "loss": 1.0804, + "step": 1291 + }, + { + "epoch": 0.1358802108667359, + "grad_norm": 2.919536255232778, + "learning_rate": 4.8045543440348276e-06, + "loss": 1.0286, + "step": 1292 + }, + { + "epoch": 0.13598538130858323, + "grad_norm": 2.2477800261870344, + "learning_rate": 4.804230627759931e-06, + "loss": 1.0185, + "step": 1293 + }, + { + "epoch": 0.13609055175043056, + "grad_norm": 2.571200906414569, + "learning_rate": 4.803906654545419e-06, + "loss": 1.018, + "step": 1294 + }, + { + "epoch": 0.13619572219227785, + "grad_norm": 2.760586584220439, + "learning_rate": 4.803582424427419e-06, + "loss": 1.0139, + "step": 1295 + }, + { + "epoch": 0.13630089263412518, + "grad_norm": 2.566832369387431, + "learning_rate": 4.803257937442082e-06, + "loss": 1.0429, + "step": 1296 + }, + { + "epoch": 0.1364060630759725, + "grad_norm": 4.222863669394409, + "learning_rate": 4.802933193625593e-06, + "loss": 1.0491, + "step": 1297 + }, + { + "epoch": 0.13651123351781982, + "grad_norm": 3.8303058287728664, + "learning_rate": 4.802608193014164e-06, + "loss": 1.0619, + "step": 1298 + }, + { + "epoch": 0.13661640395966715, + "grad_norm": 2.6093520415311167, + "learning_rate": 4.802282935644034e-06, + "loss": 1.0233, + "step": 1299 + }, + { + "epoch": 0.13672157440151445, + "grad_norm": 2.9601576956633706, + "learning_rate": 4.8019574215514705e-06, + "loss": 1.0153, + "step": 1300 + }, + { + "epoch": 0.13682674484336177, + "grad_norm": 2.1944445212698986, + "learning_rate": 4.801631650772774e-06, + "loss": 0.9904, + "step": 1301 + }, + { + "epoch": 0.1369319152852091, + "grad_norm": 2.6684642895304327, + "learning_rate": 4.801305623344268e-06, + "loss": 1.0708, + "step": 1302 + }, + { + "epoch": 0.13703708572705642, + "grad_norm": 2.686121392463823, + "learning_rate": 4.800979339302308e-06, + "loss": 1.0116, + "step": 1303 + }, + { + "epoch": 0.13714225616890374, + "grad_norm": 2.8185256267861836, + "learning_rate": 4.800652798683277e-06, + "loss": 1.055, + "step": 1304 + }, + { + "epoch": 0.13724742661075104, + "grad_norm": 2.793491565244081, + "learning_rate": 4.800326001523586e-06, + "loss": 0.9955, + "step": 1305 + 
}, + { + "epoch": 0.13735259705259836, + "grad_norm": 4.301260101751436, + "learning_rate": 4.799998947859678e-06, + "loss": 1.0288, + "step": 1306 + }, + { + "epoch": 0.1374577674944457, + "grad_norm": 2.1242732201639183, + "learning_rate": 4.799671637728019e-06, + "loss": 1.0248, + "step": 1307 + }, + { + "epoch": 0.137562937936293, + "grad_norm": 3.5698968807461364, + "learning_rate": 4.799344071165108e-06, + "loss": 1.0036, + "step": 1308 + }, + { + "epoch": 0.13766810837814034, + "grad_norm": 2.5835211133401237, + "learning_rate": 4.799016248207472e-06, + "loss": 1.0235, + "step": 1309 + }, + { + "epoch": 0.13777327881998763, + "grad_norm": 3.0261001648609454, + "learning_rate": 4.7986881688916646e-06, + "loss": 1.0384, + "step": 1310 + }, + { + "epoch": 0.13787844926183496, + "grad_norm": 3.474891168157717, + "learning_rate": 4.798359833254269e-06, + "loss": 1.0484, + "step": 1311 + }, + { + "epoch": 0.13798361970368228, + "grad_norm": 3.1716499888572036, + "learning_rate": 4.798031241331898e-06, + "loss": 1.0298, + "step": 1312 + }, + { + "epoch": 0.1380887901455296, + "grad_norm": 2.7269924098282514, + "learning_rate": 4.7977023931611916e-06, + "loss": 1.0619, + "step": 1313 + }, + { + "epoch": 0.13819396058737693, + "grad_norm": 3.0594179433515247, + "learning_rate": 4.797373288778819e-06, + "loss": 1.0194, + "step": 1314 + }, + { + "epoch": 0.13829913102922423, + "grad_norm": 2.6485215434411704, + "learning_rate": 4.797043928221479e-06, + "loss": 1.0012, + "step": 1315 + }, + { + "epoch": 0.13840430147107155, + "grad_norm": 2.387465962136887, + "learning_rate": 4.796714311525897e-06, + "loss": 0.9964, + "step": 1316 + }, + { + "epoch": 0.13850947191291887, + "grad_norm": 3.2239747245175883, + "learning_rate": 4.796384438728827e-06, + "loss": 1.0092, + "step": 1317 + }, + { + "epoch": 0.1386146423547662, + "grad_norm": 3.378894724491444, + "learning_rate": 4.796054309867053e-06, + "loss": 1.063, + "step": 1318 + }, + { + "epoch": 0.13871981279661352, + "grad_norm": 2.9523255723706203, + "learning_rate": 4.7957239249773876e-06, + "loss": 1.0547, + "step": 1319 + }, + { + "epoch": 0.13882498323846082, + "grad_norm": 2.7245192269142366, + "learning_rate": 4.795393284096671e-06, + "loss": 1.0321, + "step": 1320 + }, + { + "epoch": 0.13893015368030814, + "grad_norm": 3.402757468965259, + "learning_rate": 4.7950623872617705e-06, + "loss": 1.0365, + "step": 1321 + }, + { + "epoch": 0.13903532412215547, + "grad_norm": 2.0645918619669907, + "learning_rate": 4.7947312345095865e-06, + "loss": 1.0378, + "step": 1322 + }, + { + "epoch": 0.1391404945640028, + "grad_norm": 3.1649331722389027, + "learning_rate": 4.794399825877044e-06, + "loss": 1.0502, + "step": 1323 + }, + { + "epoch": 0.13924566500585012, + "grad_norm": 3.0438185017979316, + "learning_rate": 4.794068161401097e-06, + "loss": 1.0202, + "step": 1324 + }, + { + "epoch": 0.1393508354476974, + "grad_norm": 2.8567093139911126, + "learning_rate": 4.793736241118729e-06, + "loss": 0.9863, + "step": 1325 + }, + { + "epoch": 0.13945600588954474, + "grad_norm": 3.1861700342675783, + "learning_rate": 4.793404065066951e-06, + "loss": 1.052, + "step": 1326 + }, + { + "epoch": 0.13956117633139206, + "grad_norm": 2.268534254398867, + "learning_rate": 4.7930716332828055e-06, + "loss": 1.0266, + "step": 1327 + }, + { + "epoch": 0.13966634677323939, + "grad_norm": 2.7798949911303215, + "learning_rate": 4.792738945803358e-06, + "loss": 1.0042, + "step": 1328 + }, + { + "epoch": 0.1397715172150867, + "grad_norm": 2.9625802390516673, + 
"learning_rate": 4.792406002665708e-06, + "loss": 1.0692, + "step": 1329 + }, + { + "epoch": 0.139876687656934, + "grad_norm": 3.3097071364207076, + "learning_rate": 4.792072803906982e-06, + "loss": 1.0495, + "step": 1330 + }, + { + "epoch": 0.13998185809878133, + "grad_norm": 3.3911434469914252, + "learning_rate": 4.791739349564332e-06, + "loss": 1.0219, + "step": 1331 + }, + { + "epoch": 0.14008702854062866, + "grad_norm": 2.4415653316509323, + "learning_rate": 4.791405639674941e-06, + "loss": 1.0159, + "step": 1332 + }, + { + "epoch": 0.14019219898247598, + "grad_norm": 2.673194071720424, + "learning_rate": 4.791071674276021e-06, + "loss": 1.0428, + "step": 1333 + }, + { + "epoch": 0.1402973694243233, + "grad_norm": 3.4270530074992207, + "learning_rate": 4.790737453404812e-06, + "loss": 1.0427, + "step": 1334 + }, + { + "epoch": 0.1404025398661706, + "grad_norm": 2.370560923133824, + "learning_rate": 4.790402977098583e-06, + "loss": 1.0541, + "step": 1335 + }, + { + "epoch": 0.14050771030801792, + "grad_norm": 3.178115791238882, + "learning_rate": 4.7900682453946276e-06, + "loss": 1.0314, + "step": 1336 + }, + { + "epoch": 0.14061288074986525, + "grad_norm": 2.6802658373773176, + "learning_rate": 4.7897332583302745e-06, + "loss": 1.0589, + "step": 1337 + }, + { + "epoch": 0.14071805119171257, + "grad_norm": 2.848166188577203, + "learning_rate": 4.789398015942875e-06, + "loss": 1.0272, + "step": 1338 + }, + { + "epoch": 0.1408232216335599, + "grad_norm": 3.1907811105881922, + "learning_rate": 4.789062518269812e-06, + "loss": 1.034, + "step": 1339 + }, + { + "epoch": 0.1409283920754072, + "grad_norm": 2.9382816317350375, + "learning_rate": 4.7887267653484965e-06, + "loss": 1.015, + "step": 1340 + }, + { + "epoch": 0.14103356251725452, + "grad_norm": 4.496979260313411, + "learning_rate": 4.788390757216367e-06, + "loss": 1.0533, + "step": 1341 + }, + { + "epoch": 0.14113873295910184, + "grad_norm": 2.6947544952945037, + "learning_rate": 4.7880544939108915e-06, + "loss": 1.045, + "step": 1342 + }, + { + "epoch": 0.14124390340094917, + "grad_norm": 2.6878213842926644, + "learning_rate": 4.787717975469566e-06, + "loss": 1.0424, + "step": 1343 + }, + { + "epoch": 0.1413490738427965, + "grad_norm": 2.9207345197550323, + "learning_rate": 4.787381201929913e-06, + "loss": 1.0479, + "step": 1344 + }, + { + "epoch": 0.1414542442846438, + "grad_norm": 2.8600747613086024, + "learning_rate": 4.787044173329489e-06, + "loss": 1.0018, + "step": 1345 + }, + { + "epoch": 0.1415594147264911, + "grad_norm": 2.456864436992996, + "learning_rate": 4.7867068897058725e-06, + "loss": 1.0043, + "step": 1346 + }, + { + "epoch": 0.14166458516833844, + "grad_norm": 3.0962033894183087, + "learning_rate": 4.7863693510966735e-06, + "loss": 0.9958, + "step": 1347 + }, + { + "epoch": 0.14176975561018576, + "grad_norm": 2.6323675844789016, + "learning_rate": 4.786031557539532e-06, + "loss": 1.0502, + "step": 1348 + }, + { + "epoch": 0.14187492605203308, + "grad_norm": 2.694477249584249, + "learning_rate": 4.785693509072112e-06, + "loss": 1.0372, + "step": 1349 + }, + { + "epoch": 0.14198009649388038, + "grad_norm": 3.3629384103067452, + "learning_rate": 4.785355205732111e-06, + "loss": 0.9938, + "step": 1350 + }, + { + "epoch": 0.1420852669357277, + "grad_norm": 3.334830674838776, + "learning_rate": 4.78501664755725e-06, + "loss": 1.0405, + "step": 1351 + }, + { + "epoch": 0.14219043737757503, + "grad_norm": 2.2938533826137455, + "learning_rate": 4.784677834585283e-06, + "loss": 1.0207, + "step": 1352 + }, + { + "epoch": 
0.14229560781942235, + "grad_norm": 2.5803341129447896, + "learning_rate": 4.784338766853989e-06, + "loss": 1.0151, + "step": 1353 + }, + { + "epoch": 0.14240077826126968, + "grad_norm": 2.9122499893707383, + "learning_rate": 4.783999444401178e-06, + "loss": 1.0644, + "step": 1354 + }, + { + "epoch": 0.142505948703117, + "grad_norm": 2.091992322725459, + "learning_rate": 4.783659867264685e-06, + "loss": 0.9981, + "step": 1355 + }, + { + "epoch": 0.1426111191449643, + "grad_norm": 3.065753050616509, + "learning_rate": 4.783320035482378e-06, + "loss": 0.9894, + "step": 1356 + }, + { + "epoch": 0.14271628958681162, + "grad_norm": 3.1525287256847903, + "learning_rate": 4.7829799490921485e-06, + "loss": 1.077, + "step": 1357 + }, + { + "epoch": 0.14282146002865895, + "grad_norm": 3.3013362649199065, + "learning_rate": 4.7826396081319206e-06, + "loss": 1.0439, + "step": 1358 + }, + { + "epoch": 0.14292663047050627, + "grad_norm": 2.6267213566754295, + "learning_rate": 4.782299012639644e-06, + "loss": 1.0254, + "step": 1359 + }, + { + "epoch": 0.1430318009123536, + "grad_norm": 3.664812917190621, + "learning_rate": 4.781958162653298e-06, + "loss": 1.0252, + "step": 1360 + }, + { + "epoch": 0.1431369713542009, + "grad_norm": 2.3021182933754103, + "learning_rate": 4.781617058210889e-06, + "loss": 1.0615, + "step": 1361 + }, + { + "epoch": 0.14324214179604822, + "grad_norm": 2.881195846130054, + "learning_rate": 4.781275699350455e-06, + "loss": 1.0005, + "step": 1362 + }, + { + "epoch": 0.14334731223789554, + "grad_norm": 3.7980632477387632, + "learning_rate": 4.780934086110059e-06, + "loss": 1.0062, + "step": 1363 + }, + { + "epoch": 0.14345248267974287, + "grad_norm": 2.4526767159057505, + "learning_rate": 4.780592218527793e-06, + "loss": 1.0588, + "step": 1364 + }, + { + "epoch": 0.1435576531215902, + "grad_norm": 2.7801413202576177, + "learning_rate": 4.780250096641778e-06, + "loss": 1.004, + "step": 1365 + }, + { + "epoch": 0.1436628235634375, + "grad_norm": 3.2606411773565442, + "learning_rate": 4.779907720490164e-06, + "loss": 1.0199, + "step": 1366 + }, + { + "epoch": 0.1437679940052848, + "grad_norm": 3.222765541930548, + "learning_rate": 4.779565090111129e-06, + "loss": 0.9907, + "step": 1367 + }, + { + "epoch": 0.14387316444713213, + "grad_norm": 3.1387995556419357, + "learning_rate": 4.779222205542877e-06, + "loss": 1.0304, + "step": 1368 + }, + { + "epoch": 0.14397833488897946, + "grad_norm": 2.389261805095194, + "learning_rate": 4.778879066823644e-06, + "loss": 1.0449, + "step": 1369 + }, + { + "epoch": 0.14408350533082678, + "grad_norm": 2.724316181670948, + "learning_rate": 4.778535673991692e-06, + "loss": 1.0199, + "step": 1370 + }, + { + "epoch": 0.14418867577267408, + "grad_norm": 3.0155113196278585, + "learning_rate": 4.7781920270853126e-06, + "loss": 1.0333, + "step": 1371 + }, + { + "epoch": 0.1442938462145214, + "grad_norm": 3.5146591280021164, + "learning_rate": 4.777848126142824e-06, + "loss": 1.0147, + "step": 1372 + }, + { + "epoch": 0.14439901665636873, + "grad_norm": 2.61346142352295, + "learning_rate": 4.777503971202574e-06, + "loss": 1.0076, + "step": 1373 + }, + { + "epoch": 0.14450418709821605, + "grad_norm": 2.7874977280860618, + "learning_rate": 4.77715956230294e-06, + "loss": 0.994, + "step": 1374 + }, + { + "epoch": 0.14460935754006338, + "grad_norm": 3.1736286121545536, + "learning_rate": 4.776814899482325e-06, + "loss": 1.0834, + "step": 1375 + }, + { + "epoch": 0.14471452798191067, + "grad_norm": 3.6191087311210732, + "learning_rate": 4.776469982779161e-06, + 
"loss": 1.0153, + "step": 1376 + }, + { + "epoch": 0.144819698423758, + "grad_norm": 3.745985677561027, + "learning_rate": 4.7761248122319105e-06, + "loss": 0.9798, + "step": 1377 + }, + { + "epoch": 0.14492486886560532, + "grad_norm": 2.3804794207514406, + "learning_rate": 4.775779387879061e-06, + "loss": 1.0106, + "step": 1378 + }, + { + "epoch": 0.14503003930745265, + "grad_norm": 2.0323842954289346, + "learning_rate": 4.7754337097591315e-06, + "loss": 1.031, + "step": 1379 + }, + { + "epoch": 0.14513520974929997, + "grad_norm": 2.633921230163831, + "learning_rate": 4.7750877779106666e-06, + "loss": 1.0546, + "step": 1380 + }, + { + "epoch": 0.14524038019114727, + "grad_norm": 3.477178349518341, + "learning_rate": 4.774741592372242e-06, + "loss": 1.0429, + "step": 1381 + }, + { + "epoch": 0.1453455506329946, + "grad_norm": 3.025833331172734, + "learning_rate": 4.774395153182459e-06, + "loss": 1.0074, + "step": 1382 + }, + { + "epoch": 0.14545072107484192, + "grad_norm": 2.8582583766175005, + "learning_rate": 4.774048460379947e-06, + "loss": 1.0239, + "step": 1383 + }, + { + "epoch": 0.14555589151668924, + "grad_norm": 3.5823549566490933, + "learning_rate": 4.773701514003367e-06, + "loss": 1.0257, + "step": 1384 + }, + { + "epoch": 0.14566106195853656, + "grad_norm": 2.6124292693903612, + "learning_rate": 4.773354314091405e-06, + "loss": 0.9493, + "step": 1385 + }, + { + "epoch": 0.14576623240038386, + "grad_norm": 2.8268960445709226, + "learning_rate": 4.773006860682777e-06, + "loss": 1.0065, + "step": 1386 + }, + { + "epoch": 0.14587140284223118, + "grad_norm": 3.9272556505800273, + "learning_rate": 4.772659153816228e-06, + "loss": 1.0455, + "step": 1387 + }, + { + "epoch": 0.1459765732840785, + "grad_norm": 3.071553519913472, + "learning_rate": 4.7723111935305275e-06, + "loss": 1.0151, + "step": 1388 + }, + { + "epoch": 0.14608174372592583, + "grad_norm": 2.713380086603046, + "learning_rate": 4.7719629798644775e-06, + "loss": 1.0394, + "step": 1389 + }, + { + "epoch": 0.14618691416777316, + "grad_norm": 2.5543229953092608, + "learning_rate": 4.7716145128569054e-06, + "loss": 1.0254, + "step": 1390 + }, + { + "epoch": 0.14629208460962045, + "grad_norm": 2.5719448084092216, + "learning_rate": 4.771265792546669e-06, + "loss": 0.9972, + "step": 1391 + }, + { + "epoch": 0.14639725505146778, + "grad_norm": 2.360610533233657, + "learning_rate": 4.770916818972653e-06, + "loss": 1.0448, + "step": 1392 + }, + { + "epoch": 0.1465024254933151, + "grad_norm": 2.8007366713065274, + "learning_rate": 4.77056759217377e-06, + "loss": 1.0058, + "step": 1393 + }, + { + "epoch": 0.14660759593516243, + "grad_norm": 2.524726744796562, + "learning_rate": 4.770218112188964e-06, + "loss": 1.0684, + "step": 1394 + }, + { + "epoch": 0.14671276637700975, + "grad_norm": 3.3903828921485943, + "learning_rate": 4.769868379057201e-06, + "loss": 1.1003, + "step": 1395 + }, + { + "epoch": 0.14681793681885705, + "grad_norm": 3.4978205266954037, + "learning_rate": 4.7695183928174804e-06, + "loss": 1.037, + "step": 1396 + }, + { + "epoch": 0.14692310726070437, + "grad_norm": 3.3446642465813152, + "learning_rate": 4.76916815350883e-06, + "loss": 1.0366, + "step": 1397 + }, + { + "epoch": 0.1470282777025517, + "grad_norm": 2.2392665764959836, + "learning_rate": 4.768817661170302e-06, + "loss": 1.0026, + "step": 1398 + }, + { + "epoch": 0.14713344814439902, + "grad_norm": 2.8612244152019963, + "learning_rate": 4.768466915840981e-06, + "loss": 1.0446, + "step": 1399 + }, + { + "epoch": 0.14723861858624634, + "grad_norm": 
3.3281982579588085, + "learning_rate": 4.768115917559976e-06, + "loss": 1.0825, + "step": 1400 + }, + { + "epoch": 0.14734378902809364, + "grad_norm": 2.2985012611056597, + "learning_rate": 4.767764666366427e-06, + "loss": 1.0283, + "step": 1401 + }, + { + "epoch": 0.14744895946994097, + "grad_norm": 2.6342734587607755, + "learning_rate": 4.767413162299501e-06, + "loss": 0.9836, + "step": 1402 + }, + { + "epoch": 0.1475541299117883, + "grad_norm": 2.555811598919671, + "learning_rate": 4.7670614053983945e-06, + "loss": 0.9753, + "step": 1403 + }, + { + "epoch": 0.14765930035363561, + "grad_norm": 4.202733894529233, + "learning_rate": 4.766709395702329e-06, + "loss": 1.0445, + "step": 1404 + }, + { + "epoch": 0.14776447079548294, + "grad_norm": 2.8209004214002, + "learning_rate": 4.766357133250558e-06, + "loss": 1.0223, + "step": 1405 + }, + { + "epoch": 0.14786964123733023, + "grad_norm": 2.8446733034162657, + "learning_rate": 4.766004618082361e-06, + "loss": 1.0289, + "step": 1406 + }, + { + "epoch": 0.14797481167917756, + "grad_norm": 4.068733400424128, + "learning_rate": 4.765651850237046e-06, + "loss": 1.0016, + "step": 1407 + }, + { + "epoch": 0.14807998212102488, + "grad_norm": 3.844649508056965, + "learning_rate": 4.765298829753949e-06, + "loss": 0.9918, + "step": 1408 + }, + { + "epoch": 0.1481851525628722, + "grad_norm": 2.919584733622149, + "learning_rate": 4.764945556672435e-06, + "loss": 1.0238, + "step": 1409 + }, + { + "epoch": 0.14829032300471953, + "grad_norm": 3.7015807840351584, + "learning_rate": 4.764592031031898e-06, + "loss": 1.0315, + "step": 1410 + }, + { + "epoch": 0.14839549344656686, + "grad_norm": 4.273538161097895, + "learning_rate": 4.7642382528717565e-06, + "loss": 1.0426, + "step": 1411 + }, + { + "epoch": 0.14850066388841415, + "grad_norm": 3.509784403656781, + "learning_rate": 4.763884222231461e-06, + "loss": 0.9692, + "step": 1412 + }, + { + "epoch": 0.14860583433026148, + "grad_norm": 3.524891382474383, + "learning_rate": 4.763529939150489e-06, + "loss": 1.023, + "step": 1413 + }, + { + "epoch": 0.1487110047721088, + "grad_norm": 2.860810743236083, + "learning_rate": 4.763175403668344e-06, + "loss": 0.9906, + "step": 1414 + }, + { + "epoch": 0.14881617521395613, + "grad_norm": 3.051115350337341, + "learning_rate": 4.762820615824561e-06, + "loss": 1.0535, + "step": 1415 + }, + { + "epoch": 0.14892134565580345, + "grad_norm": 2.868633232025887, + "learning_rate": 4.7624655756587e-06, + "loss": 1.0226, + "step": 1416 + }, + { + "epoch": 0.14902651609765075, + "grad_norm": 3.2091074851672823, + "learning_rate": 4.762110283210353e-06, + "loss": 1.0125, + "step": 1417 + }, + { + "epoch": 0.14913168653949807, + "grad_norm": 3.4892610849853614, + "learning_rate": 4.761754738519136e-06, + "loss": 1.0046, + "step": 1418 + }, + { + "epoch": 0.1492368569813454, + "grad_norm": 3.5426727194444623, + "learning_rate": 4.761398941624696e-06, + "loss": 1.012, + "step": 1419 + }, + { + "epoch": 0.14934202742319272, + "grad_norm": 2.648969769760997, + "learning_rate": 4.761042892566707e-06, + "loss": 1.0218, + "step": 1420 + }, + { + "epoch": 0.14944719786504004, + "grad_norm": 3.2305776271347746, + "learning_rate": 4.7606865913848725e-06, + "loss": 1.0702, + "step": 1421 + }, + { + "epoch": 0.14955236830688734, + "grad_norm": 3.1832515293962027, + "learning_rate": 4.760330038118919e-06, + "loss": 1.0464, + "step": 1422 + }, + { + "epoch": 0.14965753874873466, + "grad_norm": 3.4113754972892743, + "learning_rate": 4.759973232808609e-06, + "loss": 1.0189, + "step": 1423 + }, + 
{ + "epoch": 0.149762709190582, + "grad_norm": 2.720125937598134, + "learning_rate": 4.759616175493726e-06, + "loss": 0.9823, + "step": 1424 + }, + { + "epoch": 0.1498678796324293, + "grad_norm": 3.4131804917283537, + "learning_rate": 4.7592588662140876e-06, + "loss": 1.0335, + "step": 1425 + }, + { + "epoch": 0.14997305007427664, + "grad_norm": 2.5441205030353937, + "learning_rate": 4.7589013050095345e-06, + "loss": 1.0354, + "step": 1426 + }, + { + "epoch": 0.15007822051612393, + "grad_norm": 3.064947944270619, + "learning_rate": 4.758543491919938e-06, + "loss": 1.0068, + "step": 1427 + }, + { + "epoch": 0.15018339095797126, + "grad_norm": 3.2381546378652417, + "learning_rate": 4.7581854269851975e-06, + "loss": 1.0831, + "step": 1428 + }, + { + "epoch": 0.15028856139981858, + "grad_norm": 3.0616456434518002, + "learning_rate": 4.75782711024524e-06, + "loss": 1.0218, + "step": 1429 + }, + { + "epoch": 0.1503937318416659, + "grad_norm": 2.923899250348212, + "learning_rate": 4.757468541740019e-06, + "loss": 1.0323, + "step": 1430 + }, + { + "epoch": 0.15049890228351323, + "grad_norm": 2.8260428144309735, + "learning_rate": 4.75710972150952e-06, + "loss": 1.0136, + "step": 1431 + }, + { + "epoch": 0.15060407272536053, + "grad_norm": 2.9366911956981085, + "learning_rate": 4.756750649593753e-06, + "loss": 1.0514, + "step": 1432 + }, + { + "epoch": 0.15070924316720785, + "grad_norm": 2.4211017023085635, + "learning_rate": 4.756391326032757e-06, + "loss": 1.0258, + "step": 1433 + }, + { + "epoch": 0.15081441360905518, + "grad_norm": 2.819024865923823, + "learning_rate": 4.756031750866601e-06, + "loss": 1.0051, + "step": 1434 + }, + { + "epoch": 0.1509195840509025, + "grad_norm": 3.1999851809479063, + "learning_rate": 4.755671924135379e-06, + "loss": 1.0053, + "step": 1435 + }, + { + "epoch": 0.15102475449274982, + "grad_norm": 3.4482793978325272, + "learning_rate": 4.755311845879214e-06, + "loss": 1.0168, + "step": 1436 + }, + { + "epoch": 0.15112992493459712, + "grad_norm": 3.2963820376471755, + "learning_rate": 4.754951516138259e-06, + "loss": 1.0435, + "step": 1437 + }, + { + "epoch": 0.15123509537644445, + "grad_norm": 2.6226202887722674, + "learning_rate": 4.754590934952692e-06, + "loss": 1.0022, + "step": 1438 + }, + { + "epoch": 0.15134026581829177, + "grad_norm": 4.119115148771308, + "learning_rate": 4.754230102362723e-06, + "loss": 1.0604, + "step": 1439 + }, + { + "epoch": 0.1514454362601391, + "grad_norm": 3.3530136943675606, + "learning_rate": 4.7538690184085845e-06, + "loss": 1.0374, + "step": 1440 + }, + { + "epoch": 0.15155060670198642, + "grad_norm": 3.0643118980396307, + "learning_rate": 4.7535076831305425e-06, + "loss": 1.0299, + "step": 1441 + }, + { + "epoch": 0.15165577714383371, + "grad_norm": 2.6808250795383914, + "learning_rate": 4.753146096568888e-06, + "loss": 1.0616, + "step": 1442 + }, + { + "epoch": 0.15176094758568104, + "grad_norm": 2.762417572989777, + "learning_rate": 4.752784258763941e-06, + "loss": 1.043, + "step": 1443 + }, + { + "epoch": 0.15186611802752836, + "grad_norm": 2.069150811277505, + "learning_rate": 4.752422169756048e-06, + "loss": 0.9979, + "step": 1444 + }, + { + "epoch": 0.1519712884693757, + "grad_norm": 2.952492663200309, + "learning_rate": 4.7520598295855866e-06, + "loss": 1.0638, + "step": 1445 + }, + { + "epoch": 0.152076458911223, + "grad_norm": 2.4223572836685126, + "learning_rate": 4.751697238292959e-06, + "loss": 1.0165, + "step": 1446 + }, + { + "epoch": 0.1521816293530703, + "grad_norm": 3.035954168307138, + "learning_rate": 
4.751334395918598e-06, + "loss": 1.0296, + "step": 1447 + }, + { + "epoch": 0.15228679979491763, + "grad_norm": 2.5050761120596396, + "learning_rate": 4.7509713025029624e-06, + "loss": 1.0469, + "step": 1448 + }, + { + "epoch": 0.15239197023676496, + "grad_norm": 3.6719733529575693, + "learning_rate": 4.750607958086541e-06, + "loss": 1.0405, + "step": 1449 + }, + { + "epoch": 0.15249714067861228, + "grad_norm": 2.122334436297666, + "learning_rate": 4.750244362709848e-06, + "loss": 1.0358, + "step": 1450 + }, + { + "epoch": 0.1526023111204596, + "grad_norm": 3.307644507612065, + "learning_rate": 4.749880516413428e-06, + "loss": 1.0013, + "step": 1451 + }, + { + "epoch": 0.1527074815623069, + "grad_norm": 2.26272280730536, + "learning_rate": 4.749516419237854e-06, + "loss": 1.0286, + "step": 1452 + }, + { + "epoch": 0.15281265200415423, + "grad_norm": 3.4033816929506444, + "learning_rate": 4.749152071223724e-06, + "loss": 1.0483, + "step": 1453 + }, + { + "epoch": 0.15291782244600155, + "grad_norm": 3.19081203769598, + "learning_rate": 4.748787472411665e-06, + "loss": 1.054, + "step": 1454 + }, + { + "epoch": 0.15302299288784887, + "grad_norm": 2.863548719789538, + "learning_rate": 4.748422622842335e-06, + "loss": 1.0378, + "step": 1455 + }, + { + "epoch": 0.1531281633296962, + "grad_norm": 2.918561339352643, + "learning_rate": 4.748057522556415e-06, + "loss": 1.064, + "step": 1456 + }, + { + "epoch": 0.1532333337715435, + "grad_norm": 3.235112382704938, + "learning_rate": 4.747692171594619e-06, + "loss": 1.0154, + "step": 1457 + }, + { + "epoch": 0.15333850421339082, + "grad_norm": 2.9715433832923392, + "learning_rate": 4.747326569997684e-06, + "loss": 0.9907, + "step": 1458 + }, + { + "epoch": 0.15344367465523814, + "grad_norm": 3.7107017840666696, + "learning_rate": 4.746960717806379e-06, + "loss": 0.998, + "step": 1459 + }, + { + "epoch": 0.15354884509708547, + "grad_norm": 3.1443639363904614, + "learning_rate": 4.7465946150615e-06, + "loss": 1.0366, + "step": 1460 + }, + { + "epoch": 0.1536540155389328, + "grad_norm": 3.281239643947147, + "learning_rate": 4.746228261803868e-06, + "loss": 1.0366, + "step": 1461 + }, + { + "epoch": 0.1537591859807801, + "grad_norm": 3.382772295417614, + "learning_rate": 4.745861658074336e-06, + "loss": 1.0344, + "step": 1462 + }, + { + "epoch": 0.1538643564226274, + "grad_norm": 2.466197221139934, + "learning_rate": 4.745494803913781e-06, + "loss": 1.0028, + "step": 1463 + }, + { + "epoch": 0.15396952686447474, + "grad_norm": 3.360579078456302, + "learning_rate": 4.745127699363115e-06, + "loss": 1.0104, + "step": 1464 + }, + { + "epoch": 0.15407469730632206, + "grad_norm": 4.356696217441136, + "learning_rate": 4.744760344463267e-06, + "loss": 1.0674, + "step": 1465 + }, + { + "epoch": 0.15417986774816939, + "grad_norm": 3.2306718414761044, + "learning_rate": 4.744392739255203e-06, + "loss": 1.0717, + "step": 1466 + }, + { + "epoch": 0.15428503819001668, + "grad_norm": 2.2164328768086197, + "learning_rate": 4.744024883779915e-06, + "loss": 1.0049, + "step": 1467 + }, + { + "epoch": 0.154390208631864, + "grad_norm": 2.932022946712518, + "learning_rate": 4.74365677807842e-06, + "loss": 1.0116, + "step": 1468 + }, + { + "epoch": 0.15449537907371133, + "grad_norm": 2.590086012736109, + "learning_rate": 4.743288422191764e-06, + "loss": 1.0302, + "step": 1469 + }, + { + "epoch": 0.15460054951555866, + "grad_norm": 2.1435708761875665, + "learning_rate": 4.7429198161610225e-06, + "loss": 1.0083, + "step": 1470 + }, + { + "epoch": 0.15470571995740598, + 
"grad_norm": 3.630046810094376, + "learning_rate": 4.7425509600272974e-06, + "loss": 0.9893, + "step": 1471 + }, + { + "epoch": 0.1548108903992533, + "grad_norm": 2.719889362421328, + "learning_rate": 4.742181853831721e-06, + "loss": 1.0572, + "step": 1472 + }, + { + "epoch": 0.1549160608411006, + "grad_norm": 2.327018336326447, + "learning_rate": 4.74181249761545e-06, + "loss": 0.9962, + "step": 1473 + }, + { + "epoch": 0.15502123128294792, + "grad_norm": 2.3785350704725112, + "learning_rate": 4.74144289141967e-06, + "loss": 0.9893, + "step": 1474 + }, + { + "epoch": 0.15512640172479525, + "grad_norm": 2.59518918661727, + "learning_rate": 4.741073035285595e-06, + "loss": 0.9894, + "step": 1475 + }, + { + "epoch": 0.15523157216664257, + "grad_norm": 3.4946347737501915, + "learning_rate": 4.7407029292544675e-06, + "loss": 1.0119, + "step": 1476 + }, + { + "epoch": 0.1553367426084899, + "grad_norm": 2.6605925838001627, + "learning_rate": 4.740332573367557e-06, + "loss": 1.0098, + "step": 1477 + }, + { + "epoch": 0.1554419130503372, + "grad_norm": 3.8403647907126643, + "learning_rate": 4.739961967666161e-06, + "loss": 1.0622, + "step": 1478 + }, + { + "epoch": 0.15554708349218452, + "grad_norm": 2.4061249592344356, + "learning_rate": 4.739591112191605e-06, + "loss": 0.9919, + "step": 1479 + }, + { + "epoch": 0.15565225393403184, + "grad_norm": 3.619438134390123, + "learning_rate": 4.739220006985243e-06, + "loss": 1.0495, + "step": 1480 + }, + { + "epoch": 0.15575742437587917, + "grad_norm": 2.7851891041711814, + "learning_rate": 4.738848652088454e-06, + "loss": 1.0299, + "step": 1481 + }, + { + "epoch": 0.1558625948177265, + "grad_norm": 2.975714954933627, + "learning_rate": 4.738477047542649e-06, + "loss": 1.0205, + "step": 1482 + }, + { + "epoch": 0.1559677652595738, + "grad_norm": 3.878474331685192, + "learning_rate": 4.738105193389264e-06, + "loss": 0.988, + "step": 1483 + }, + { + "epoch": 0.1560729357014211, + "grad_norm": 3.449910786924835, + "learning_rate": 4.737733089669764e-06, + "loss": 1.0652, + "step": 1484 + }, + { + "epoch": 0.15617810614326844, + "grad_norm": 2.3712070448593803, + "learning_rate": 4.737360736425641e-06, + "loss": 1.0411, + "step": 1485 + }, + { + "epoch": 0.15628327658511576, + "grad_norm": 2.6454413280340874, + "learning_rate": 4.736988133698416e-06, + "loss": 1.0204, + "step": 1486 + }, + { + "epoch": 0.15638844702696308, + "grad_norm": 3.004156559443189, + "learning_rate": 4.736615281529635e-06, + "loss": 1.0051, + "step": 1487 + }, + { + "epoch": 0.15649361746881038, + "grad_norm": 3.7915743827388186, + "learning_rate": 4.736242179960877e-06, + "loss": 1.0261, + "step": 1488 + }, + { + "epoch": 0.1565987879106577, + "grad_norm": 3.602538359343123, + "learning_rate": 4.735868829033744e-06, + "loss": 1.0642, + "step": 1489 + }, + { + "epoch": 0.15670395835250503, + "grad_norm": 1.966550689023779, + "learning_rate": 4.735495228789867e-06, + "loss": 1.0363, + "step": 1490 + }, + { + "epoch": 0.15680912879435235, + "grad_norm": 2.511000996166126, + "learning_rate": 4.735121379270907e-06, + "loss": 1.025, + "step": 1491 + }, + { + "epoch": 0.15691429923619968, + "grad_norm": 2.563979152422195, + "learning_rate": 4.734747280518549e-06, + "loss": 1.0361, + "step": 1492 + }, + { + "epoch": 0.15701946967804697, + "grad_norm": 2.2916478990627946, + "learning_rate": 4.73437293257451e-06, + "loss": 1.0462, + "step": 1493 + }, + { + "epoch": 0.1571246401198943, + "grad_norm": 2.2807919186264436, + "learning_rate": 4.733998335480532e-06, + "loss": 1.0184, + "step": 1494 
+ }, + { + "epoch": 0.15722981056174162, + "grad_norm": 2.7351488990346535, + "learning_rate": 4.733623489278385e-06, + "loss": 1.0148, + "step": 1495 + }, + { + "epoch": 0.15733498100358895, + "grad_norm": 2.893572157477807, + "learning_rate": 4.733248394009867e-06, + "loss": 1.0028, + "step": 1496 + }, + { + "epoch": 0.15744015144543627, + "grad_norm": 2.6060141894891595, + "learning_rate": 4.732873049716805e-06, + "loss": 0.998, + "step": 1497 + }, + { + "epoch": 0.15754532188728357, + "grad_norm": 2.7708906663010313, + "learning_rate": 4.732497456441052e-06, + "loss": 1.0671, + "step": 1498 + }, + { + "epoch": 0.1576504923291309, + "grad_norm": 3.472148650720611, + "learning_rate": 4.732121614224491e-06, + "loss": 1.0178, + "step": 1499 + }, + { + "epoch": 0.15775566277097822, + "grad_norm": 2.5207390547929003, + "learning_rate": 4.731745523109029e-06, + "loss": 1.0238, + "step": 1500 + }, + { + "epoch": 0.15786083321282554, + "grad_norm": 3.4771528545364316, + "learning_rate": 4.731369183136605e-06, + "loss": 1.0706, + "step": 1501 + }, + { + "epoch": 0.15796600365467287, + "grad_norm": 2.65521118845941, + "learning_rate": 4.730992594349183e-06, + "loss": 1.0402, + "step": 1502 + }, + { + "epoch": 0.15807117409652016, + "grad_norm": 3.10957683893925, + "learning_rate": 4.730615756788756e-06, + "loss": 0.9857, + "step": 1503 + }, + { + "epoch": 0.15817634453836749, + "grad_norm": 2.6542173527792463, + "learning_rate": 4.730238670497345e-06, + "loss": 1.0336, + "step": 1504 + }, + { + "epoch": 0.1582815149802148, + "grad_norm": 3.77211681230834, + "learning_rate": 4.729861335516995e-06, + "loss": 1.0299, + "step": 1505 + }, + { + "epoch": 0.15838668542206213, + "grad_norm": 2.6411474196043727, + "learning_rate": 4.7294837518897855e-06, + "loss": 1.0441, + "step": 1506 + }, + { + "epoch": 0.15849185586390946, + "grad_norm": 2.99873384055499, + "learning_rate": 4.729105919657818e-06, + "loss": 1.0002, + "step": 1507 + }, + { + "epoch": 0.15859702630575676, + "grad_norm": 2.8324948060160904, + "learning_rate": 4.728727838863224e-06, + "loss": 1.0513, + "step": 1508 + }, + { + "epoch": 0.15870219674760408, + "grad_norm": 2.455839993218039, + "learning_rate": 4.728349509548163e-06, + "loss": 1.0082, + "step": 1509 + }, + { + "epoch": 0.1588073671894514, + "grad_norm": 3.5575923268239027, + "learning_rate": 4.7279709317548215e-06, + "loss": 1.0516, + "step": 1510 + }, + { + "epoch": 0.15891253763129873, + "grad_norm": 2.940216951551406, + "learning_rate": 4.727592105525413e-06, + "loss": 1.0099, + "step": 1511 + }, + { + "epoch": 0.15901770807314605, + "grad_norm": 2.8061910489333397, + "learning_rate": 4.72721303090218e-06, + "loss": 1.0284, + "step": 1512 + }, + { + "epoch": 0.15912287851499335, + "grad_norm": 2.8488993729192567, + "learning_rate": 4.726833707927393e-06, + "loss": 1.0029, + "step": 1513 + }, + { + "epoch": 0.15922804895684067, + "grad_norm": 2.3760044861779015, + "learning_rate": 4.7264541366433495e-06, + "loss": 1.064, + "step": 1514 + }, + { + "epoch": 0.159333219398688, + "grad_norm": 3.03368894409861, + "learning_rate": 4.726074317092373e-06, + "loss": 1.057, + "step": 1515 + }, + { + "epoch": 0.15943838984053532, + "grad_norm": 2.6274139546172273, + "learning_rate": 4.7256942493168176e-06, + "loss": 1.0468, + "step": 1516 + }, + { + "epoch": 0.15954356028238265, + "grad_norm": 2.5208665813866533, + "learning_rate": 4.725313933359064e-06, + "loss": 1.01, + "step": 1517 + }, + { + "epoch": 0.15964873072422994, + "grad_norm": 3.0144732325896313, + "learning_rate": 
4.724933369261519e-06, + "loss": 1.0093, + "step": 1518 + }, + { + "epoch": 0.15975390116607727, + "grad_norm": 3.3250090964686274, + "learning_rate": 4.72455255706662e-06, + "loss": 0.9936, + "step": 1519 + }, + { + "epoch": 0.1598590716079246, + "grad_norm": 3.306440325363034, + "learning_rate": 4.724171496816831e-06, + "loss": 1.002, + "step": 1520 + }, + { + "epoch": 0.15996424204977192, + "grad_norm": 2.978656372246788, + "learning_rate": 4.7237901885546405e-06, + "loss": 1.0175, + "step": 1521 + }, + { + "epoch": 0.16006941249161924, + "grad_norm": 3.4879449084728056, + "learning_rate": 4.72340863232257e-06, + "loss": 1.05, + "step": 1522 + }, + { + "epoch": 0.16017458293346654, + "grad_norm": 2.7810346030912405, + "learning_rate": 4.723026828163164e-06, + "loss": 1.0297, + "step": 1523 + }, + { + "epoch": 0.16027975337531386, + "grad_norm": 2.2616816284442205, + "learning_rate": 4.722644776118999e-06, + "loss": 1.0186, + "step": 1524 + }, + { + "epoch": 0.16038492381716118, + "grad_norm": 2.729508624247899, + "learning_rate": 4.722262476232674e-06, + "loss": 1.0272, + "step": 1525 + }, + { + "epoch": 0.1604900942590085, + "grad_norm": 3.219956094139774, + "learning_rate": 4.72187992854682e-06, + "loss": 1.0983, + "step": 1526 + }, + { + "epoch": 0.16059526470085583, + "grad_norm": 3.9250885931506914, + "learning_rate": 4.7214971331040945e-06, + "loss": 1.0105, + "step": 1527 + }, + { + "epoch": 0.16070043514270313, + "grad_norm": 4.60638365380386, + "learning_rate": 4.721114089947181e-06, + "loss": 1.0343, + "step": 1528 + }, + { + "epoch": 0.16080560558455045, + "grad_norm": 4.087070072305575, + "learning_rate": 4.720730799118792e-06, + "loss": 1.072, + "step": 1529 + }, + { + "epoch": 0.16091077602639778, + "grad_norm": 2.114182974470423, + "learning_rate": 4.7203472606616685e-06, + "loss": 1.0551, + "step": 1530 + }, + { + "epoch": 0.1610159464682451, + "grad_norm": 2.7564719333605305, + "learning_rate": 4.719963474618576e-06, + "loss": 1.0046, + "step": 1531 + }, + { + "epoch": 0.16112111691009243, + "grad_norm": 2.194526925536452, + "learning_rate": 4.719579441032312e-06, + "loss": 0.9904, + "step": 1532 + }, + { + "epoch": 0.16122628735193975, + "grad_norm": 3.069579977353765, + "learning_rate": 4.7191951599456974e-06, + "loss": 1.0293, + "step": 1533 + }, + { + "epoch": 0.16133145779378705, + "grad_norm": 2.6614677051330355, + "learning_rate": 4.718810631401584e-06, + "loss": 0.969, + "step": 1534 + }, + { + "epoch": 0.16143662823563437, + "grad_norm": 2.0239894866349384, + "learning_rate": 4.718425855442847e-06, + "loss": 1.0166, + "step": 1535 + }, + { + "epoch": 0.1615417986774817, + "grad_norm": 2.6814734955623747, + "learning_rate": 4.718040832112396e-06, + "loss": 0.9991, + "step": 1536 + }, + { + "epoch": 0.16164696911932902, + "grad_norm": 3.3899909411502156, + "learning_rate": 4.717655561453161e-06, + "loss": 1.0229, + "step": 1537 + }, + { + "epoch": 0.16175213956117634, + "grad_norm": 3.6416888404436514, + "learning_rate": 4.7172700435081024e-06, + "loss": 1.0336, + "step": 1538 + }, + { + "epoch": 0.16185731000302364, + "grad_norm": 3.297177936886574, + "learning_rate": 4.71688427832021e-06, + "loss": 1.0382, + "step": 1539 + }, + { + "epoch": 0.16196248044487097, + "grad_norm": 2.9251719307297983, + "learning_rate": 4.716498265932501e-06, + "loss": 1.0379, + "step": 1540 + }, + { + "epoch": 0.1620676508867183, + "grad_norm": 3.1867271581724332, + "learning_rate": 4.716112006388015e-06, + "loss": 1.0093, + "step": 1541 + }, + { + "epoch": 0.16217282132856561, + 
"grad_norm": 2.7382289471102927, + "learning_rate": 4.715725499729826e-06, + "loss": 1.0596, + "step": 1542 + }, + { + "epoch": 0.16227799177041294, + "grad_norm": 2.482681348080373, + "learning_rate": 4.715338746001031e-06, + "loss": 1.0241, + "step": 1543 + }, + { + "epoch": 0.16238316221226023, + "grad_norm": 3.321185423836697, + "learning_rate": 4.7149517452447565e-06, + "loss": 1.065, + "step": 1544 + }, + { + "epoch": 0.16248833265410756, + "grad_norm": 2.2553546129954034, + "learning_rate": 4.714564497504156e-06, + "loss": 1.0071, + "step": 1545 + }, + { + "epoch": 0.16259350309595488, + "grad_norm": 2.9289085282753473, + "learning_rate": 4.714177002822411e-06, + "loss": 1.0756, + "step": 1546 + }, + { + "epoch": 0.1626986735378022, + "grad_norm": 2.6229389236880514, + "learning_rate": 4.7137892612427296e-06, + "loss": 0.9867, + "step": 1547 + }, + { + "epoch": 0.16280384397964953, + "grad_norm": 2.5687055044071756, + "learning_rate": 4.713401272808348e-06, + "loss": 1.0312, + "step": 1548 + }, + { + "epoch": 0.16290901442149683, + "grad_norm": 3.075733116731394, + "learning_rate": 4.713013037562531e-06, + "loss": 0.999, + "step": 1549 + }, + { + "epoch": 0.16301418486334415, + "grad_norm": 2.4489321077973316, + "learning_rate": 4.712624555548568e-06, + "loss": 1.0424, + "step": 1550 + }, + { + "epoch": 0.16311935530519148, + "grad_norm": 2.786568331849303, + "learning_rate": 4.712235826809779e-06, + "loss": 1.0076, + "step": 1551 + }, + { + "epoch": 0.1632245257470388, + "grad_norm": 4.261177829125735, + "learning_rate": 4.711846851389511e-06, + "loss": 1.055, + "step": 1552 + }, + { + "epoch": 0.16332969618888613, + "grad_norm": 3.0252873549859878, + "learning_rate": 4.711457629331136e-06, + "loss": 1.0215, + "step": 1553 + }, + { + "epoch": 0.16343486663073342, + "grad_norm": 2.127523638438989, + "learning_rate": 4.711068160678056e-06, + "loss": 1.035, + "step": 1554 + }, + { + "epoch": 0.16354003707258075, + "grad_norm": 3.1748982619719097, + "learning_rate": 4.710678445473701e-06, + "loss": 1.0526, + "step": 1555 + }, + { + "epoch": 0.16364520751442807, + "grad_norm": 3.0204839143683087, + "learning_rate": 4.710288483761524e-06, + "loss": 1.0283, + "step": 1556 + }, + { + "epoch": 0.1637503779562754, + "grad_norm": 2.5309450768958883, + "learning_rate": 4.709898275585013e-06, + "loss": 1.0435, + "step": 1557 + }, + { + "epoch": 0.16385554839812272, + "grad_norm": 2.9972306044536636, + "learning_rate": 4.709507820987676e-06, + "loss": 0.9908, + "step": 1558 + }, + { + "epoch": 0.16396071883997002, + "grad_norm": 3.6133112485444188, + "learning_rate": 4.709117120013054e-06, + "loss": 1.0428, + "step": 1559 + }, + { + "epoch": 0.16406588928181734, + "grad_norm": 2.2177026935577637, + "learning_rate": 4.708726172704712e-06, + "loss": 1.0373, + "step": 1560 + }, + { + "epoch": 0.16417105972366466, + "grad_norm": 3.665100322013887, + "learning_rate": 4.708334979106243e-06, + "loss": 1.017, + "step": 1561 + }, + { + "epoch": 0.164276230165512, + "grad_norm": 2.120634042862793, + "learning_rate": 4.707943539261269e-06, + "loss": 1.0423, + "step": 1562 + }, + { + "epoch": 0.1643814006073593, + "grad_norm": 3.3096318778798346, + "learning_rate": 4.707551853213439e-06, + "loss": 1.0324, + "step": 1563 + }, + { + "epoch": 0.1644865710492066, + "grad_norm": 2.722439105462851, + "learning_rate": 4.7071599210064275e-06, + "loss": 1.0437, + "step": 1564 + }, + { + "epoch": 0.16459174149105393, + "grad_norm": 2.9680950262706105, + "learning_rate": 4.70676774268394e-06, + "loss": 1.0011, + 
"step": 1565 + }, + { + "epoch": 0.16469691193290126, + "grad_norm": 2.7946642826043, + "learning_rate": 4.706375318289706e-06, + "loss": 0.9863, + "step": 1566 + }, + { + "epoch": 0.16480208237474858, + "grad_norm": 3.008921981513297, + "learning_rate": 4.705982647867484e-06, + "loss": 1.0011, + "step": 1567 + }, + { + "epoch": 0.1649072528165959, + "grad_norm": 2.438967195615608, + "learning_rate": 4.705589731461061e-06, + "loss": 1.0553, + "step": 1568 + }, + { + "epoch": 0.1650124232584432, + "grad_norm": 2.0590569324727492, + "learning_rate": 4.705196569114248e-06, + "loss": 1.0026, + "step": 1569 + }, + { + "epoch": 0.16511759370029053, + "grad_norm": 2.751042822546601, + "learning_rate": 4.704803160870888e-06, + "loss": 1.0228, + "step": 1570 + }, + { + "epoch": 0.16522276414213785, + "grad_norm": 1.9114644938599432, + "learning_rate": 4.704409506774848e-06, + "loss": 1.0235, + "step": 1571 + }, + { + "epoch": 0.16532793458398518, + "grad_norm": 2.7725485760210433, + "learning_rate": 4.7040156068700225e-06, + "loss": 1.039, + "step": 1572 + }, + { + "epoch": 0.1654331050258325, + "grad_norm": 3.3207748456848956, + "learning_rate": 4.703621461200337e-06, + "loss": 1.0551, + "step": 1573 + }, + { + "epoch": 0.1655382754676798, + "grad_norm": 2.2142667900370805, + "learning_rate": 4.703227069809739e-06, + "loss": 1.0284, + "step": 1574 + }, + { + "epoch": 0.16564344590952712, + "grad_norm": 2.4762983701232666, + "learning_rate": 4.702832432742208e-06, + "loss": 1.0301, + "step": 1575 + }, + { + "epoch": 0.16574861635137444, + "grad_norm": 2.7474342790405464, + "learning_rate": 4.702437550041749e-06, + "loss": 1.0002, + "step": 1576 + }, + { + "epoch": 0.16585378679322177, + "grad_norm": 2.490632975494078, + "learning_rate": 4.702042421752393e-06, + "loss": 0.9741, + "step": 1577 + }, + { + "epoch": 0.1659589572350691, + "grad_norm": 2.7515934794089683, + "learning_rate": 4.701647047918202e-06, + "loss": 1.0435, + "step": 1578 + }, + { + "epoch": 0.1660641276769164, + "grad_norm": 2.3948584711688916, + "learning_rate": 4.701251428583261e-06, + "loss": 1.0609, + "step": 1579 + }, + { + "epoch": 0.16616929811876371, + "grad_norm": 2.792702623447517, + "learning_rate": 4.700855563791686e-06, + "loss": 1.0161, + "step": 1580 + }, + { + "epoch": 0.16627446856061104, + "grad_norm": 2.81166121024242, + "learning_rate": 4.700459453587619e-06, + "loss": 1.0005, + "step": 1581 + }, + { + "epoch": 0.16637963900245836, + "grad_norm": 3.2253366610207723, + "learning_rate": 4.70006309801523e-06, + "loss": 1.0329, + "step": 1582 + }, + { + "epoch": 0.1664848094443057, + "grad_norm": 3.306319989253321, + "learning_rate": 4.699666497118714e-06, + "loss": 1.0728, + "step": 1583 + }, + { + "epoch": 0.16658997988615298, + "grad_norm": 2.3503420314034082, + "learning_rate": 4.699269650942296e-06, + "loss": 1.006, + "step": 1584 + }, + { + "epoch": 0.1666951503280003, + "grad_norm": 2.8744464659676634, + "learning_rate": 4.6988725595302275e-06, + "loss": 1.06, + "step": 1585 + }, + { + "epoch": 0.16680032076984763, + "grad_norm": 3.3878922195539247, + "learning_rate": 4.698475222926788e-06, + "loss": 1.0133, + "step": 1586 + }, + { + "epoch": 0.16690549121169496, + "grad_norm": 2.9490960578463747, + "learning_rate": 4.698077641176282e-06, + "loss": 0.9812, + "step": 1587 + }, + { + "epoch": 0.16701066165354228, + "grad_norm": 3.898835572720005, + "learning_rate": 4.697679814323044e-06, + "loss": 1.0266, + "step": 1588 + }, + { + "epoch": 0.16711583209538958, + "grad_norm": 2.4198826439223278, + 
"learning_rate": 4.6972817424114335e-06, + "loss": 0.9853, + "step": 1589 + }, + { + "epoch": 0.1672210025372369, + "grad_norm": 3.2157863216591065, + "learning_rate": 4.6968834254858405e-06, + "loss": 0.9761, + "step": 1590 + }, + { + "epoch": 0.16732617297908423, + "grad_norm": 3.42945668773291, + "learning_rate": 4.69648486359068e-06, + "loss": 1.0218, + "step": 1591 + }, + { + "epoch": 0.16743134342093155, + "grad_norm": 3.8221661628298316, + "learning_rate": 4.6960860567703935e-06, + "loss": 1.0389, + "step": 1592 + }, + { + "epoch": 0.16753651386277887, + "grad_norm": 2.5084187696585563, + "learning_rate": 4.6956870050694524e-06, + "loss": 1.046, + "step": 1593 + }, + { + "epoch": 0.1676416843046262, + "grad_norm": 2.2140737319132415, + "learning_rate": 4.695287708532353e-06, + "loss": 1.0125, + "step": 1594 + }, + { + "epoch": 0.1677468547464735, + "grad_norm": 2.906928468843093, + "learning_rate": 4.6948881672036205e-06, + "loss": 1.0367, + "step": 1595 + }, + { + "epoch": 0.16785202518832082, + "grad_norm": 2.9231721544834257, + "learning_rate": 4.694488381127808e-06, + "loss": 1.0004, + "step": 1596 + }, + { + "epoch": 0.16795719563016814, + "grad_norm": 2.2518787810723015, + "learning_rate": 4.6940883503494925e-06, + "loss": 1.0029, + "step": 1597 + }, + { + "epoch": 0.16806236607201547, + "grad_norm": 2.75505444461088, + "learning_rate": 4.693688074913282e-06, + "loss": 1.0651, + "step": 1598 + }, + { + "epoch": 0.1681675365138628, + "grad_norm": 2.9638750280320094, + "learning_rate": 4.69328755486381e-06, + "loss": 0.981, + "step": 1599 + }, + { + "epoch": 0.1682727069557101, + "grad_norm": 2.2358315107683038, + "learning_rate": 4.692886790245738e-06, + "loss": 1.0402, + "step": 1600 + }, + { + "epoch": 0.1683778773975574, + "grad_norm": 2.554442640835414, + "learning_rate": 4.692485781103753e-06, + "loss": 0.9991, + "step": 1601 + }, + { + "epoch": 0.16848304783940474, + "grad_norm": 3.6587203735508886, + "learning_rate": 4.692084527482572e-06, + "loss": 1.0296, + "step": 1602 + }, + { + "epoch": 0.16858821828125206, + "grad_norm": 2.874856178936171, + "learning_rate": 4.691683029426938e-06, + "loss": 1.034, + "step": 1603 + }, + { + "epoch": 0.16869338872309939, + "grad_norm": 2.0504221476945466, + "learning_rate": 4.691281286981619e-06, + "loss": 1.0412, + "step": 1604 + }, + { + "epoch": 0.16879855916494668, + "grad_norm": 4.299453983662296, + "learning_rate": 4.690879300191416e-06, + "loss": 1.0454, + "step": 1605 + }, + { + "epoch": 0.168903729606794, + "grad_norm": 3.0645533844066875, + "learning_rate": 4.690477069101151e-06, + "loss": 1.0127, + "step": 1606 + }, + { + "epoch": 0.16900890004864133, + "grad_norm": 3.8020279455543893, + "learning_rate": 4.690074593755676e-06, + "loss": 1.0422, + "step": 1607 + }, + { + "epoch": 0.16911407049048865, + "grad_norm": 3.0926763271379034, + "learning_rate": 4.689671874199871e-06, + "loss": 0.9955, + "step": 1608 + }, + { + "epoch": 0.16921924093233598, + "grad_norm": 3.0460948789795594, + "learning_rate": 4.6892689104786425e-06, + "loss": 0.9942, + "step": 1609 + }, + { + "epoch": 0.16932441137418328, + "grad_norm": 2.562365194025863, + "learning_rate": 4.6888657026369235e-06, + "loss": 0.9697, + "step": 1610 + }, + { + "epoch": 0.1694295818160306, + "grad_norm": 2.8814563767503456, + "learning_rate": 4.688462250719675e-06, + "loss": 1.0186, + "step": 1611 + }, + { + "epoch": 0.16953475225787792, + "grad_norm": 3.0019075219247844, + "learning_rate": 4.6880585547718845e-06, + "loss": 1.0168, + "step": 1612 + }, + { + "epoch": 
0.16963992269972525, + "grad_norm": 3.3566378728550226, + "learning_rate": 4.6876546148385685e-06, + "loss": 1.0138, + "step": 1613 + }, + { + "epoch": 0.16974509314157257, + "grad_norm": 3.815106274251855, + "learning_rate": 4.687250430964768e-06, + "loss": 1.0534, + "step": 1614 + }, + { + "epoch": 0.16985026358341987, + "grad_norm": 3.955757242671359, + "learning_rate": 4.6868460031955535e-06, + "loss": 1.0462, + "step": 1615 + }, + { + "epoch": 0.1699554340252672, + "grad_norm": 3.1546888936551762, + "learning_rate": 4.686441331576021e-06, + "loss": 0.9697, + "step": 1616 + }, + { + "epoch": 0.17006060446711452, + "grad_norm": 2.987499231967973, + "learning_rate": 4.686036416151296e-06, + "loss": 1.0134, + "step": 1617 + }, + { + "epoch": 0.17016577490896184, + "grad_norm": 3.2641730627495567, + "learning_rate": 4.6856312569665285e-06, + "loss": 1.0178, + "step": 1618 + }, + { + "epoch": 0.17027094535080917, + "grad_norm": 2.5005821127937096, + "learning_rate": 4.685225854066897e-06, + "loss": 1.0002, + "step": 1619 + }, + { + "epoch": 0.17037611579265646, + "grad_norm": 2.3448731740329434, + "learning_rate": 4.684820207497608e-06, + "loss": 1.0644, + "step": 1620 + }, + { + "epoch": 0.1704812862345038, + "grad_norm": 3.060151169504036, + "learning_rate": 4.684414317303894e-06, + "loss": 1.0934, + "step": 1621 + }, + { + "epoch": 0.1705864566763511, + "grad_norm": 2.00150049947049, + "learning_rate": 4.6840081835310135e-06, + "loss": 1.0504, + "step": 1622 + }, + { + "epoch": 0.17069162711819844, + "grad_norm": 2.708844063331739, + "learning_rate": 4.683601806224255e-06, + "loss": 1.0163, + "step": 1623 + }, + { + "epoch": 0.17079679756004576, + "grad_norm": 3.4472196380160978, + "learning_rate": 4.683195185428932e-06, + "loss": 0.9911, + "step": 1624 + }, + { + "epoch": 0.17090196800189306, + "grad_norm": 2.8820764062472026, + "learning_rate": 4.6827883211903865e-06, + "loss": 1.0349, + "step": 1625 + }, + { + "epoch": 0.17100713844374038, + "grad_norm": 2.2574433822521067, + "learning_rate": 4.682381213553986e-06, + "loss": 1.0159, + "step": 1626 + }, + { + "epoch": 0.1711123088855877, + "grad_norm": 3.089754095729008, + "learning_rate": 4.681973862565128e-06, + "loss": 1.0047, + "step": 1627 + }, + { + "epoch": 0.17121747932743503, + "grad_norm": 3.057788643389871, + "learning_rate": 4.681566268269233e-06, + "loss": 1.0697, + "step": 1628 + }, + { + "epoch": 0.17132264976928235, + "grad_norm": 2.5667766260752085, + "learning_rate": 4.681158430711753e-06, + "loss": 1.0402, + "step": 1629 + }, + { + "epoch": 0.17142782021112965, + "grad_norm": 2.9763268275620116, + "learning_rate": 4.680750349938164e-06, + "loss": 1.0168, + "step": 1630 + }, + { + "epoch": 0.17153299065297697, + "grad_norm": 2.9070930547134566, + "learning_rate": 4.68034202599397e-06, + "loss": 1.0487, + "step": 1631 + }, + { + "epoch": 0.1716381610948243, + "grad_norm": 2.6307238317161494, + "learning_rate": 4.679933458924702e-06, + "loss": 1.0303, + "step": 1632 + }, + { + "epoch": 0.17174333153667162, + "grad_norm": 3.6659807629189913, + "learning_rate": 4.67952464877592e-06, + "loss": 0.9981, + "step": 1633 + }, + { + "epoch": 0.17184850197851895, + "grad_norm": 2.736348178417095, + "learning_rate": 4.679115595593208e-06, + "loss": 0.9939, + "step": 1634 + }, + { + "epoch": 0.17195367242036624, + "grad_norm": 3.0409494428632993, + "learning_rate": 4.67870629942218e-06, + "loss": 1.0585, + "step": 1635 + }, + { + "epoch": 0.17205884286221357, + "grad_norm": 2.4454258516009593, + "learning_rate": 
4.678296760308474e-06, + "loss": 1.0395, + "step": 1636 + }, + { + "epoch": 0.1721640133040609, + "grad_norm": 2.612925874097498, + "learning_rate": 4.677886978297758e-06, + "loss": 0.9964, + "step": 1637 + }, + { + "epoch": 0.17226918374590822, + "grad_norm": 3.0527819045438793, + "learning_rate": 4.677476953435725e-06, + "loss": 1.0306, + "step": 1638 + }, + { + "epoch": 0.17237435418775554, + "grad_norm": 3.7722857272845616, + "learning_rate": 4.677066685768097e-06, + "loss": 1.0256, + "step": 1639 + }, + { + "epoch": 0.17247952462960284, + "grad_norm": 4.671732853843867, + "learning_rate": 4.676656175340621e-06, + "loss": 1.0539, + "step": 1640 + }, + { + "epoch": 0.17258469507145016, + "grad_norm": 3.402158905492146, + "learning_rate": 4.676245422199073e-06, + "loss": 1.041, + "step": 1641 + }, + { + "epoch": 0.17268986551329749, + "grad_norm": 2.7043484924050913, + "learning_rate": 4.675834426389254e-06, + "loss": 0.9748, + "step": 1642 + }, + { + "epoch": 0.1727950359551448, + "grad_norm": 3.045874999700864, + "learning_rate": 4.675423187956995e-06, + "loss": 1.0199, + "step": 1643 + }, + { + "epoch": 0.17290020639699213, + "grad_norm": 2.0391498549902325, + "learning_rate": 4.675011706948151e-06, + "loss": 0.9807, + "step": 1644 + }, + { + "epoch": 0.17300537683883943, + "grad_norm": 2.8730116912157517, + "learning_rate": 4.674599983408605e-06, + "loss": 1.0442, + "step": 1645 + }, + { + "epoch": 0.17311054728068676, + "grad_norm": 3.6137100268246387, + "learning_rate": 4.674188017384269e-06, + "loss": 1.0149, + "step": 1646 + }, + { + "epoch": 0.17321571772253408, + "grad_norm": 3.388019574947697, + "learning_rate": 4.673775808921078e-06, + "loss": 1.0567, + "step": 1647 + }, + { + "epoch": 0.1733208881643814, + "grad_norm": 2.7619468248957775, + "learning_rate": 4.673363358065e-06, + "loss": 0.9992, + "step": 1648 + }, + { + "epoch": 0.17342605860622873, + "grad_norm": 3.196115448528693, + "learning_rate": 4.672950664862022e-06, + "loss": 1.0359, + "step": 1649 + }, + { + "epoch": 0.17353122904807602, + "grad_norm": 3.1251623630514054, + "learning_rate": 4.672537729358166e-06, + "loss": 1.0346, + "step": 1650 + }, + { + "epoch": 0.17363639948992335, + "grad_norm": 2.3482592044116024, + "learning_rate": 4.672124551599476e-06, + "loss": 0.99, + "step": 1651 + }, + { + "epoch": 0.17374156993177067, + "grad_norm": 3.2990493587317995, + "learning_rate": 4.671711131632025e-06, + "loss": 1.0203, + "step": 1652 + }, + { + "epoch": 0.173846740373618, + "grad_norm": 2.2221626940474923, + "learning_rate": 4.671297469501912e-06, + "loss": 1.0141, + "step": 1653 + }, + { + "epoch": 0.17395191081546532, + "grad_norm": 3.4954782095000776, + "learning_rate": 4.670883565255264e-06, + "loss": 1.003, + "step": 1654 + }, + { + "epoch": 0.17405708125731265, + "grad_norm": 4.077759847846728, + "learning_rate": 4.670469418938235e-06, + "loss": 1.04, + "step": 1655 + }, + { + "epoch": 0.17416225169915994, + "grad_norm": 3.7217727086743913, + "learning_rate": 4.670055030597004e-06, + "loss": 1.0416, + "step": 1656 + }, + { + "epoch": 0.17426742214100727, + "grad_norm": 2.924757634673209, + "learning_rate": 4.66964040027778e-06, + "loss": 1.0284, + "step": 1657 + }, + { + "epoch": 0.1743725925828546, + "grad_norm": 2.5219450852454037, + "learning_rate": 4.669225528026797e-06, + "loss": 1.0186, + "step": 1658 + }, + { + "epoch": 0.17447776302470192, + "grad_norm": 2.9803154223037223, + "learning_rate": 4.668810413890318e-06, + "loss": 1.0058, + "step": 1659 + }, + { + "epoch": 0.17458293346654924, + 
"grad_norm": 2.8822355518190004, + "learning_rate": 4.668395057914627e-06, + "loss": 1.0453, + "step": 1660 + }, + { + "epoch": 0.17468810390839654, + "grad_norm": 3.6402941948880283, + "learning_rate": 4.667979460146045e-06, + "loss": 1.0375, + "step": 1661 + }, + { + "epoch": 0.17479327435024386, + "grad_norm": 2.5715274816378746, + "learning_rate": 4.6675636206309105e-06, + "loss": 1.013, + "step": 1662 + }, + { + "epoch": 0.17489844479209118, + "grad_norm": 3.0432398948655734, + "learning_rate": 4.667147539415594e-06, + "loss": 1.0241, + "step": 1663 + }, + { + "epoch": 0.1750036152339385, + "grad_norm": 3.395961606680768, + "learning_rate": 4.666731216546492e-06, + "loss": 0.9814, + "step": 1664 + }, + { + "epoch": 0.17510878567578583, + "grad_norm": 2.863033871229343, + "learning_rate": 4.6663146520700275e-06, + "loss": 1.0259, + "step": 1665 + }, + { + "epoch": 0.17521395611763313, + "grad_norm": 2.905956162367102, + "learning_rate": 4.665897846032651e-06, + "loss": 1.0223, + "step": 1666 + }, + { + "epoch": 0.17531912655948045, + "grad_norm": 2.9044821170497563, + "learning_rate": 4.66548079848084e-06, + "loss": 1.0568, + "step": 1667 + }, + { + "epoch": 0.17542429700132778, + "grad_norm": 2.5251868732416978, + "learning_rate": 4.665063509461098e-06, + "loss": 1.0291, + "step": 1668 + }, + { + "epoch": 0.1755294674431751, + "grad_norm": 1.9745812484711869, + "learning_rate": 4.664645979019954e-06, + "loss": 1.0088, + "step": 1669 + }, + { + "epoch": 0.17563463788502243, + "grad_norm": 2.9571167671583933, + "learning_rate": 4.6642282072039694e-06, + "loss": 1.038, + "step": 1670 + }, + { + "epoch": 0.17573980832686972, + "grad_norm": 3.162873132329408, + "learning_rate": 4.663810194059727e-06, + "loss": 1.039, + "step": 1671 + }, + { + "epoch": 0.17584497876871705, + "grad_norm": 2.707745488716953, + "learning_rate": 4.663391939633839e-06, + "loss": 1.0412, + "step": 1672 + }, + { + "epoch": 0.17595014921056437, + "grad_norm": 2.390831077180832, + "learning_rate": 4.662973443972943e-06, + "loss": 1.0217, + "step": 1673 + }, + { + "epoch": 0.1760553196524117, + "grad_norm": 2.6697672577896103, + "learning_rate": 4.662554707123707e-06, + "loss": 0.9758, + "step": 1674 + }, + { + "epoch": 0.17616049009425902, + "grad_norm": 2.5536432355735417, + "learning_rate": 4.662135729132821e-06, + "loss": 1.0093, + "step": 1675 + }, + { + "epoch": 0.17626566053610632, + "grad_norm": 4.454966321168614, + "learning_rate": 4.661716510047005e-06, + "loss": 1.0656, + "step": 1676 + }, + { + "epoch": 0.17637083097795364, + "grad_norm": 2.9059225841826586, + "learning_rate": 4.661297049913005e-06, + "loss": 1.054, + "step": 1677 + }, + { + "epoch": 0.17647600141980097, + "grad_norm": 2.7078515707870063, + "learning_rate": 4.660877348777595e-06, + "loss": 0.9475, + "step": 1678 + }, + { + "epoch": 0.1765811718616483, + "grad_norm": 2.714819257387345, + "learning_rate": 4.660457406687574e-06, + "loss": 1.008, + "step": 1679 + }, + { + "epoch": 0.1766863423034956, + "grad_norm": 3.560183070646618, + "learning_rate": 4.66003722368977e-06, + "loss": 1.0123, + "step": 1680 + }, + { + "epoch": 0.1767915127453429, + "grad_norm": 4.011571407375856, + "learning_rate": 4.659616799831035e-06, + "loss": 1.0113, + "step": 1681 + }, + { + "epoch": 0.17689668318719023, + "grad_norm": 3.512773712732269, + "learning_rate": 4.659196135158251e-06, + "loss": 1.0847, + "step": 1682 + }, + { + "epoch": 0.17700185362903756, + "grad_norm": 3.5710928774903783, + "learning_rate": 4.658775229718323e-06, + "loss": 0.9836, + "step": 
1683 + }, + { + "epoch": 0.17710702407088488, + "grad_norm": 2.576288670084694, + "learning_rate": 4.6583540835581885e-06, + "loss": 0.9977, + "step": 1684 + }, + { + "epoch": 0.1772121945127322, + "grad_norm": 2.67065462027682, + "learning_rate": 4.657932696724807e-06, + "loss": 1.0019, + "step": 1685 + }, + { + "epoch": 0.1773173649545795, + "grad_norm": 3.0304864854926286, + "learning_rate": 4.657511069265166e-06, + "loss": 1.0513, + "step": 1686 + }, + { + "epoch": 0.17742253539642683, + "grad_norm": 2.756757768961673, + "learning_rate": 4.6570892012262806e-06, + "loss": 1.055, + "step": 1687 + }, + { + "epoch": 0.17752770583827415, + "grad_norm": 3.3322689645027306, + "learning_rate": 4.656667092655192e-06, + "loss": 1.0246, + "step": 1688 + }, + { + "epoch": 0.17763287628012148, + "grad_norm": 2.51855374758776, + "learning_rate": 4.65624474359897e-06, + "loss": 0.9991, + "step": 1689 + }, + { + "epoch": 0.1777380467219688, + "grad_norm": 1.793782237595271, + "learning_rate": 4.655822154104708e-06, + "loss": 0.9936, + "step": 1690 + }, + { + "epoch": 0.1778432171638161, + "grad_norm": 2.06409532213173, + "learning_rate": 4.655399324219529e-06, + "loss": 1.0011, + "step": 1691 + }, + { + "epoch": 0.17794838760566342, + "grad_norm": 3.1724819528325843, + "learning_rate": 4.654976253990582e-06, + "loss": 1.0609, + "step": 1692 + }, + { + "epoch": 0.17805355804751075, + "grad_norm": 3.173700150317756, + "learning_rate": 4.654552943465042e-06, + "loss": 1.0357, + "step": 1693 + }, + { + "epoch": 0.17815872848935807, + "grad_norm": 2.630712209918402, + "learning_rate": 4.654129392690111e-06, + "loss": 1.0445, + "step": 1694 + }, + { + "epoch": 0.1782638989312054, + "grad_norm": 3.195364242863823, + "learning_rate": 4.653705601713019e-06, + "loss": 1.0546, + "step": 1695 + }, + { + "epoch": 0.1783690693730527, + "grad_norm": 2.9685233327726737, + "learning_rate": 4.653281570581023e-06, + "loss": 0.9853, + "step": 1696 + }, + { + "epoch": 0.17847423981490002, + "grad_norm": 2.683882737708656, + "learning_rate": 4.6528572993414036e-06, + "loss": 1.0162, + "step": 1697 + }, + { + "epoch": 0.17857941025674734, + "grad_norm": 3.4527829667595067, + "learning_rate": 4.652432788041471e-06, + "loss": 1.0187, + "step": 1698 + }, + { + "epoch": 0.17868458069859466, + "grad_norm": 3.5173119907210735, + "learning_rate": 4.652008036728563e-06, + "loss": 1.0463, + "step": 1699 + }, + { + "epoch": 0.178789751140442, + "grad_norm": 3.388433179291527, + "learning_rate": 4.651583045450041e-06, + "loss": 1.0248, + "step": 1700 + }, + { + "epoch": 0.17889492158228928, + "grad_norm": 2.1485938375897313, + "learning_rate": 4.651157814253295e-06, + "loss": 1.0051, + "step": 1701 + }, + { + "epoch": 0.1790000920241366, + "grad_norm": 3.1044973840896732, + "learning_rate": 4.650732343185743e-06, + "loss": 1.0274, + "step": 1702 + }, + { + "epoch": 0.17910526246598393, + "grad_norm": 2.6481193563942718, + "learning_rate": 4.6503066322948264e-06, + "loss": 1.0212, + "step": 1703 + }, + { + "epoch": 0.17921043290783126, + "grad_norm": 2.9567702455119496, + "learning_rate": 4.649880681628016e-06, + "loss": 1.0586, + "step": 1704 + }, + { + "epoch": 0.17931560334967858, + "grad_norm": 2.847681704947885, + "learning_rate": 4.649454491232809e-06, + "loss": 1.0353, + "step": 1705 + }, + { + "epoch": 0.17942077379152588, + "grad_norm": 2.544316932295647, + "learning_rate": 4.649028061156728e-06, + "loss": 1.0109, + "step": 1706 + }, + { + "epoch": 0.1795259442333732, + "grad_norm": 2.5651350900643863, + "learning_rate": 
4.648601391447325e-06, + "loss": 1.0665, + "step": 1707 + }, + { + "epoch": 0.17963111467522053, + "grad_norm": 3.305216400055959, + "learning_rate": 4.648174482152176e-06, + "loss": 1.0119, + "step": 1708 + }, + { + "epoch": 0.17973628511706785, + "grad_norm": 2.907862648072626, + "learning_rate": 4.647747333318884e-06, + "loss": 1.0051, + "step": 1709 + }, + { + "epoch": 0.17984145555891518, + "grad_norm": 2.7406970823992785, + "learning_rate": 4.64731994499508e-06, + "loss": 0.9843, + "step": 1710 + }, + { + "epoch": 0.17994662600076247, + "grad_norm": 2.986361487648303, + "learning_rate": 4.646892317228422e-06, + "loss": 1.0133, + "step": 1711 + }, + { + "epoch": 0.1800517964426098, + "grad_norm": 2.897968238151226, + "learning_rate": 4.646464450066592e-06, + "loss": 1.0311, + "step": 1712 + }, + { + "epoch": 0.18015696688445712, + "grad_norm": 3.4099728159788008, + "learning_rate": 4.646036343557302e-06, + "loss": 0.9871, + "step": 1713 + }, + { + "epoch": 0.18026213732630444, + "grad_norm": 3.7576035434231745, + "learning_rate": 4.6456079977482885e-06, + "loss": 1.032, + "step": 1714 + }, + { + "epoch": 0.18036730776815177, + "grad_norm": 2.569078677988013, + "learning_rate": 4.645179412687316e-06, + "loss": 1.024, + "step": 1715 + }, + { + "epoch": 0.1804724782099991, + "grad_norm": 3.286151517668889, + "learning_rate": 4.644750588422174e-06, + "loss": 1.062, + "step": 1716 + }, + { + "epoch": 0.1805776486518464, + "grad_norm": 3.179805762126629, + "learning_rate": 4.644321525000681e-06, + "loss": 1.0809, + "step": 1717 + }, + { + "epoch": 0.18068281909369371, + "grad_norm": 2.154906179564819, + "learning_rate": 4.64389222247068e-06, + "loss": 1.0338, + "step": 1718 + }, + { + "epoch": 0.18078798953554104, + "grad_norm": 3.4239950119409195, + "learning_rate": 4.643462680880042e-06, + "loss": 1.0273, + "step": 1719 + }, + { + "epoch": 0.18089315997738836, + "grad_norm": 2.9055090835022632, + "learning_rate": 4.643032900276664e-06, + "loss": 1.0144, + "step": 1720 + }, + { + "epoch": 0.1809983304192357, + "grad_norm": 2.4602189314080776, + "learning_rate": 4.642602880708469e-06, + "loss": 0.9683, + "step": 1721 + }, + { + "epoch": 0.18110350086108298, + "grad_norm": 2.90218657552329, + "learning_rate": 4.642172622223409e-06, + "loss": 1.0738, + "step": 1722 + }, + { + "epoch": 0.1812086713029303, + "grad_norm": 2.3552760906435983, + "learning_rate": 4.641742124869461e-06, + "loss": 0.9947, + "step": 1723 + }, + { + "epoch": 0.18131384174477763, + "grad_norm": 2.8906978212652032, + "learning_rate": 4.641311388694629e-06, + "loss": 1.0405, + "step": 1724 + }, + { + "epoch": 0.18141901218662496, + "grad_norm": 2.2774591530752653, + "learning_rate": 4.640880413746942e-06, + "loss": 1.0631, + "step": 1725 + }, + { + "epoch": 0.18152418262847228, + "grad_norm": 2.909505385998881, + "learning_rate": 4.640449200074459e-06, + "loss": 1.0374, + "step": 1726 + }, + { + "epoch": 0.18162935307031958, + "grad_norm": 2.2639846101073484, + "learning_rate": 4.6400177477252615e-06, + "loss": 1.0135, + "step": 1727 + }, + { + "epoch": 0.1817345235121669, + "grad_norm": 3.409130796181564, + "learning_rate": 4.639586056747461e-06, + "loss": 1.0132, + "step": 1728 + }, + { + "epoch": 0.18183969395401423, + "grad_norm": 2.1982637333179915, + "learning_rate": 4.639154127189195e-06, + "loss": 0.989, + "step": 1729 + }, + { + "epoch": 0.18194486439586155, + "grad_norm": 3.8360953153189, + "learning_rate": 4.638721959098626e-06, + "loss": 1.0704, + "step": 1730 + }, + { + "epoch": 0.18205003483770887, + 
"grad_norm": 2.4398191847689934, + "learning_rate": 4.638289552523944e-06, + "loss": 1.0277, + "step": 1731 + }, + { + "epoch": 0.18215520527955617, + "grad_norm": 2.6670166772713646, + "learning_rate": 4.637856907513366e-06, + "loss": 1.0189, + "step": 1732 + }, + { + "epoch": 0.1822603757214035, + "grad_norm": 2.6746238004492495, + "learning_rate": 4.637424024115136e-06, + "loss": 1.0083, + "step": 1733 + }, + { + "epoch": 0.18236554616325082, + "grad_norm": 4.189496479103646, + "learning_rate": 4.636990902377523e-06, + "loss": 1.0884, + "step": 1734 + }, + { + "epoch": 0.18247071660509814, + "grad_norm": 3.617312839139946, + "learning_rate": 4.636557542348823e-06, + "loss": 1.02, + "step": 1735 + }, + { + "epoch": 0.18257588704694547, + "grad_norm": 2.7303718078659025, + "learning_rate": 4.6361239440773595e-06, + "loss": 1.033, + "step": 1736 + }, + { + "epoch": 0.18268105748879276, + "grad_norm": 2.7591255551753426, + "learning_rate": 4.635690107611483e-06, + "loss": 1.0249, + "step": 1737 + }, + { + "epoch": 0.1827862279306401, + "grad_norm": 2.698775360170187, + "learning_rate": 4.635256032999569e-06, + "loss": 1.0181, + "step": 1738 + }, + { + "epoch": 0.1828913983724874, + "grad_norm": 2.9039375552443776, + "learning_rate": 4.63482172029002e-06, + "loss": 1.0174, + "step": 1739 + }, + { + "epoch": 0.18299656881433474, + "grad_norm": 2.819682186528183, + "learning_rate": 4.6343871695312646e-06, + "loss": 0.9973, + "step": 1740 + }, + { + "epoch": 0.18310173925618206, + "grad_norm": 2.752650866563116, + "learning_rate": 4.63395238077176e-06, + "loss": 0.9959, + "step": 1741 + }, + { + "epoch": 0.18320690969802936, + "grad_norm": 2.037156561103997, + "learning_rate": 4.6335173540599875e-06, + "loss": 1.0232, + "step": 1742 + }, + { + "epoch": 0.18331208013987668, + "grad_norm": 2.954334928188035, + "learning_rate": 4.633082089444457e-06, + "loss": 0.9881, + "step": 1743 + }, + { + "epoch": 0.183417250581724, + "grad_norm": 2.7393783209030946, + "learning_rate": 4.632646586973702e-06, + "loss": 1.0439, + "step": 1744 + }, + { + "epoch": 0.18352242102357133, + "grad_norm": 3.1124274526696385, + "learning_rate": 4.6322108466962865e-06, + "loss": 0.9901, + "step": 1745 + }, + { + "epoch": 0.18362759146541865, + "grad_norm": 2.827497506273816, + "learning_rate": 4.631774868660798e-06, + "loss": 1.0557, + "step": 1746 + }, + { + "epoch": 0.18373276190726595, + "grad_norm": 2.7113839252801495, + "learning_rate": 4.631338652915851e-06, + "loss": 1.0363, + "step": 1747 + }, + { + "epoch": 0.18383793234911328, + "grad_norm": 3.0202399372617896, + "learning_rate": 4.6309021995100875e-06, + "loss": 0.9892, + "step": 1748 + }, + { + "epoch": 0.1839431027909606, + "grad_norm": 3.0853410173378117, + "learning_rate": 4.630465508492176e-06, + "loss": 0.9995, + "step": 1749 + }, + { + "epoch": 0.18404827323280792, + "grad_norm": 3.8771840606848347, + "learning_rate": 4.630028579910809e-06, + "loss": 1.0325, + "step": 1750 + }, + { + "epoch": 0.18415344367465525, + "grad_norm": 2.924444671474204, + "learning_rate": 4.629591413814709e-06, + "loss": 1.0269, + "step": 1751 + }, + { + "epoch": 0.18425861411650254, + "grad_norm": 3.855054949681042, + "learning_rate": 4.629154010252624e-06, + "loss": 1.0047, + "step": 1752 + }, + { + "epoch": 0.18436378455834987, + "grad_norm": 2.94820330025766, + "learning_rate": 4.628716369273326e-06, + "loss": 1.0513, + "step": 1753 + }, + { + "epoch": 0.1844689550001972, + "grad_norm": 3.1410382438209057, + "learning_rate": 4.628278490925617e-06, + "loss": 1.0178, + 
"step": 1754 + }, + { + "epoch": 0.18457412544204452, + "grad_norm": 3.430775164691166, + "learning_rate": 4.6278403752583235e-06, + "loss": 1.0179, + "step": 1755 + }, + { + "epoch": 0.18467929588389184, + "grad_norm": 3.935774284929484, + "learning_rate": 4.627402022320298e-06, + "loss": 1.0026, + "step": 1756 + }, + { + "epoch": 0.18478446632573914, + "grad_norm": 4.002646697081782, + "learning_rate": 4.626963432160421e-06, + "loss": 1.0347, + "step": 1757 + }, + { + "epoch": 0.18488963676758646, + "grad_norm": 2.5892402072879275, + "learning_rate": 4.626524604827598e-06, + "loss": 1.0222, + "step": 1758 + }, + { + "epoch": 0.1849948072094338, + "grad_norm": 3.069956064773937, + "learning_rate": 4.6260855403707625e-06, + "loss": 1.0151, + "step": 1759 + }, + { + "epoch": 0.1850999776512811, + "grad_norm": 2.7469752838222714, + "learning_rate": 4.625646238838873e-06, + "loss": 1.0214, + "step": 1760 + }, + { + "epoch": 0.18520514809312844, + "grad_norm": 2.5645297933313875, + "learning_rate": 4.6252067002809155e-06, + "loss": 1.007, + "step": 1761 + }, + { + "epoch": 0.18531031853497573, + "grad_norm": 3.4751336669098194, + "learning_rate": 4.6247669247459015e-06, + "loss": 1.0049, + "step": 1762 + }, + { + "epoch": 0.18541548897682306, + "grad_norm": 2.641984983795442, + "learning_rate": 4.62432691228287e-06, + "loss": 0.9999, + "step": 1763 + }, + { + "epoch": 0.18552065941867038, + "grad_norm": 2.4784360351295875, + "learning_rate": 4.623886662940885e-06, + "loss": 0.9663, + "step": 1764 + }, + { + "epoch": 0.1856258298605177, + "grad_norm": 3.7083059171189814, + "learning_rate": 4.6234461767690384e-06, + "loss": 0.9815, + "step": 1765 + }, + { + "epoch": 0.18573100030236503, + "grad_norm": 3.085101182950583, + "learning_rate": 4.623005453816447e-06, + "loss": 1.0183, + "step": 1766 + }, + { + "epoch": 0.18583617074421233, + "grad_norm": 3.8717416513609164, + "learning_rate": 4.622564494132256e-06, + "loss": 1.0394, + "step": 1767 + }, + { + "epoch": 0.18594134118605965, + "grad_norm": 2.6220312567865682, + "learning_rate": 4.622123297765636e-06, + "loss": 0.9954, + "step": 1768 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 3.2456038569135313, + "learning_rate": 4.621681864765783e-06, + "loss": 1.0068, + "step": 1769 + }, + { + "epoch": 0.1861516820697543, + "grad_norm": 2.99049524146547, + "learning_rate": 4.621240195181918e-06, + "loss": 1.0466, + "step": 1770 + }, + { + "epoch": 0.18625685251160162, + "grad_norm": 3.7023703331482074, + "learning_rate": 4.620798289063295e-06, + "loss": 1.028, + "step": 1771 + }, + { + "epoch": 0.18636202295344895, + "grad_norm": 2.284086379818769, + "learning_rate": 4.620356146459187e-06, + "loss": 1.0119, + "step": 1772 + }, + { + "epoch": 0.18646719339529624, + "grad_norm": 3.7163945954226514, + "learning_rate": 4.619913767418898e-06, + "loss": 1.0508, + "step": 1773 + }, + { + "epoch": 0.18657236383714357, + "grad_norm": 3.8367678119936324, + "learning_rate": 4.619471151991755e-06, + "loss": 1.0105, + "step": 1774 + }, + { + "epoch": 0.1866775342789909, + "grad_norm": 2.8688910090189923, + "learning_rate": 4.619028300227114e-06, + "loss": 1.0038, + "step": 1775 + }, + { + "epoch": 0.18678270472083822, + "grad_norm": 2.650967656775131, + "learning_rate": 4.618585212174357e-06, + "loss": 0.9438, + "step": 1776 + }, + { + "epoch": 0.18688787516268554, + "grad_norm": 4.345745813091422, + "learning_rate": 4.618141887882891e-06, + "loss": 1.025, + "step": 1777 + }, + { + "epoch": 0.18699304560453284, + "grad_norm": 2.868036629003295, + 
"learning_rate": 4.6176983274021495e-06, + "loss": 1.0249, + "step": 1778 + }, + { + "epoch": 0.18709821604638016, + "grad_norm": 2.372708709831453, + "learning_rate": 4.617254530781594e-06, + "loss": 1.0195, + "step": 1779 + }, + { + "epoch": 0.18720338648822749, + "grad_norm": 3.358894346037336, + "learning_rate": 4.6168104980707105e-06, + "loss": 1.0665, + "step": 1780 + }, + { + "epoch": 0.1873085569300748, + "grad_norm": 3.3819142018228643, + "learning_rate": 4.6163662293190135e-06, + "loss": 1.0316, + "step": 1781 + }, + { + "epoch": 0.18741372737192213, + "grad_norm": 2.7812505645074777, + "learning_rate": 4.61592172457604e-06, + "loss": 1.0267, + "step": 1782 + }, + { + "epoch": 0.18751889781376943, + "grad_norm": 2.666048271062157, + "learning_rate": 4.615476983891359e-06, + "loss": 1.0273, + "step": 1783 + }, + { + "epoch": 0.18762406825561675, + "grad_norm": 2.8496484883881994, + "learning_rate": 4.61503200731456e-06, + "loss": 0.9927, + "step": 1784 + }, + { + "epoch": 0.18772923869746408, + "grad_norm": 2.604297527248957, + "learning_rate": 4.6145867948952605e-06, + "loss": 1.0363, + "step": 1785 + }, + { + "epoch": 0.1878344091393114, + "grad_norm": 2.25491398075674, + "learning_rate": 4.614141346683109e-06, + "loss": 1.038, + "step": 1786 + }, + { + "epoch": 0.18793957958115873, + "grad_norm": 2.264636873435953, + "learning_rate": 4.6136956627277725e-06, + "loss": 0.9992, + "step": 1787 + }, + { + "epoch": 0.18804475002300602, + "grad_norm": 2.704549825776999, + "learning_rate": 4.61324974307895e-06, + "loss": 1.0082, + "step": 1788 + }, + { + "epoch": 0.18814992046485335, + "grad_norm": 2.353105958418805, + "learning_rate": 4.612803587786366e-06, + "loss": 1.0102, + "step": 1789 + }, + { + "epoch": 0.18825509090670067, + "grad_norm": 2.948441358243538, + "learning_rate": 4.612357196899768e-06, + "loss": 1.033, + "step": 1790 + }, + { + "epoch": 0.188360261348548, + "grad_norm": 1.9059658097973773, + "learning_rate": 4.611910570468933e-06, + "loss": 1.002, + "step": 1791 + }, + { + "epoch": 0.18846543179039532, + "grad_norm": 2.6032758990156712, + "learning_rate": 4.611463708543665e-06, + "loss": 1.0658, + "step": 1792 + }, + { + "epoch": 0.18857060223224262, + "grad_norm": 2.230556596976156, + "learning_rate": 4.61101661117379e-06, + "loss": 1.0214, + "step": 1793 + }, + { + "epoch": 0.18867577267408994, + "grad_norm": 3.0702976065770664, + "learning_rate": 4.610569278409164e-06, + "loss": 1.0007, + "step": 1794 + }, + { + "epoch": 0.18878094311593727, + "grad_norm": 3.007760617807363, + "learning_rate": 4.610121710299668e-06, + "loss": 1.0181, + "step": 1795 + }, + { + "epoch": 0.1888861135577846, + "grad_norm": 2.7344843254797127, + "learning_rate": 4.609673906895208e-06, + "loss": 1.0361, + "step": 1796 + }, + { + "epoch": 0.18899128399963191, + "grad_norm": 1.9281901603612084, + "learning_rate": 4.609225868245721e-06, + "loss": 1.0006, + "step": 1797 + }, + { + "epoch": 0.1890964544414792, + "grad_norm": 3.2114203189511152, + "learning_rate": 4.608777594401164e-06, + "loss": 1.0043, + "step": 1798 + }, + { + "epoch": 0.18920162488332654, + "grad_norm": 2.145414047326945, + "learning_rate": 4.608329085411523e-06, + "loss": 1.0424, + "step": 1799 + }, + { + "epoch": 0.18930679532517386, + "grad_norm": 2.51024154366095, + "learning_rate": 4.607880341326812e-06, + "loss": 1.0284, + "step": 1800 + }, + { + "epoch": 0.18941196576702118, + "grad_norm": 3.0454022502767804, + "learning_rate": 4.607431362197067e-06, + "loss": 0.9817, + "step": 1801 + }, + { + "epoch": 
0.1895171362088685, + "grad_norm": 3.050295307561754, + "learning_rate": 4.6069821480723545e-06, + "loss": 1.0539, + "step": 1802 + }, + { + "epoch": 0.1896223066507158, + "grad_norm": 4.406095657800138, + "learning_rate": 4.6065326990027656e-06, + "loss": 1.0129, + "step": 1803 + }, + { + "epoch": 0.18972747709256313, + "grad_norm": 2.37443582116452, + "learning_rate": 4.606083015038416e-06, + "loss": 0.9949, + "step": 1804 + }, + { + "epoch": 0.18983264753441045, + "grad_norm": 3.2183447058232253, + "learning_rate": 4.6056330962294496e-06, + "loss": 1.0636, + "step": 1805 + }, + { + "epoch": 0.18993781797625778, + "grad_norm": 2.8177933591262407, + "learning_rate": 4.605182942626037e-06, + "loss": 1.0327, + "step": 1806 + }, + { + "epoch": 0.1900429884181051, + "grad_norm": 2.3566286140592654, + "learning_rate": 4.604732554278371e-06, + "loss": 1.0276, + "step": 1807 + }, + { + "epoch": 0.1901481588599524, + "grad_norm": 3.0654404715127024, + "learning_rate": 4.604281931236675e-06, + "loss": 1.0586, + "step": 1808 + }, + { + "epoch": 0.19025332930179972, + "grad_norm": 2.099265216978414, + "learning_rate": 4.6038310735511985e-06, + "loss": 1.0363, + "step": 1809 + }, + { + "epoch": 0.19035849974364705, + "grad_norm": 2.521236583898647, + "learning_rate": 4.603379981272213e-06, + "loss": 0.9839, + "step": 1810 + }, + { + "epoch": 0.19046367018549437, + "grad_norm": 2.528668148495521, + "learning_rate": 4.6029286544500205e-06, + "loss": 1.009, + "step": 1811 + }, + { + "epoch": 0.1905688406273417, + "grad_norm": 3.7980630261576125, + "learning_rate": 4.602477093134947e-06, + "loss": 0.9996, + "step": 1812 + }, + { + "epoch": 0.190674011069189, + "grad_norm": 2.965801661136027, + "learning_rate": 4.602025297377345e-06, + "loss": 1.0368, + "step": 1813 + }, + { + "epoch": 0.19077918151103632, + "grad_norm": 2.5523632367350104, + "learning_rate": 4.6015732672275925e-06, + "loss": 0.9936, + "step": 1814 + }, + { + "epoch": 0.19088435195288364, + "grad_norm": 3.02733722515206, + "learning_rate": 4.601121002736095e-06, + "loss": 1.0129, + "step": 1815 + }, + { + "epoch": 0.19098952239473097, + "grad_norm": 4.357521284066693, + "learning_rate": 4.600668503953285e-06, + "loss": 1.0212, + "step": 1816 + }, + { + "epoch": 0.1910946928365783, + "grad_norm": 3.1172216407619318, + "learning_rate": 4.600215770929617e-06, + "loss": 1.093, + "step": 1817 + }, + { + "epoch": 0.19119986327842559, + "grad_norm": 2.6332985511775493, + "learning_rate": 4.599762803715576e-06, + "loss": 1.0391, + "step": 1818 + }, + { + "epoch": 0.1913050337202729, + "grad_norm": 2.569865786330344, + "learning_rate": 4.599309602361671e-06, + "loss": 1.0291, + "step": 1819 + }, + { + "epoch": 0.19141020416212023, + "grad_norm": 3.2827113014482445, + "learning_rate": 4.5988561669184376e-06, + "loss": 1.0382, + "step": 1820 + }, + { + "epoch": 0.19151537460396756, + "grad_norm": 2.1719148645999584, + "learning_rate": 4.598402497436436e-06, + "loss": 1.0192, + "step": 1821 + }, + { + "epoch": 0.19162054504581488, + "grad_norm": 2.2765741322316546, + "learning_rate": 4.597948593966256e-06, + "loss": 1.0286, + "step": 1822 + }, + { + "epoch": 0.19172571548766218, + "grad_norm": 4.114857140083318, + "learning_rate": 4.59749445655851e-06, + "loss": 1.045, + "step": 1823 + }, + { + "epoch": 0.1918308859295095, + "grad_norm": 2.997696029602926, + "learning_rate": 4.597040085263838e-06, + "loss": 1.04, + "step": 1824 + }, + { + "epoch": 0.19193605637135683, + "grad_norm": 3.047085040895484, + "learning_rate": 4.596585480132906e-06, + 
"loss": 1.0447, + "step": 1825 + }, + { + "epoch": 0.19204122681320415, + "grad_norm": 3.9271378946979847, + "learning_rate": 4.596130641216406e-06, + "loss": 1.008, + "step": 1826 + }, + { + "epoch": 0.19214639725505148, + "grad_norm": 2.0196952752323414, + "learning_rate": 4.595675568565058e-06, + "loss": 0.9986, + "step": 1827 + }, + { + "epoch": 0.19225156769689877, + "grad_norm": 3.228450782508349, + "learning_rate": 4.5952202622296015e-06, + "loss": 1.0224, + "step": 1828 + }, + { + "epoch": 0.1923567381387461, + "grad_norm": 3.398130308359955, + "learning_rate": 4.594764722260812e-06, + "loss": 1.0222, + "step": 1829 + }, + { + "epoch": 0.19246190858059342, + "grad_norm": 2.3897945567374648, + "learning_rate": 4.594308948709482e-06, + "loss": 1.0258, + "step": 1830 + }, + { + "epoch": 0.19256707902244075, + "grad_norm": 2.867204854790798, + "learning_rate": 4.593852941626435e-06, + "loss": 0.9945, + "step": 1831 + }, + { + "epoch": 0.19267224946428807, + "grad_norm": 3.1790669432232317, + "learning_rate": 4.59339670106252e-06, + "loss": 1.0426, + "step": 1832 + }, + { + "epoch": 0.1927774199061354, + "grad_norm": 3.0542457539999694, + "learning_rate": 4.59294022706861e-06, + "loss": 1.0483, + "step": 1833 + }, + { + "epoch": 0.1928825903479827, + "grad_norm": 3.2664531106906685, + "learning_rate": 4.592483519695606e-06, + "loss": 0.989, + "step": 1834 + }, + { + "epoch": 0.19298776078983002, + "grad_norm": 2.4685050890530436, + "learning_rate": 4.592026578994435e-06, + "loss": 1.0167, + "step": 1835 + }, + { + "epoch": 0.19309293123167734, + "grad_norm": 2.9309814017337197, + "learning_rate": 4.59156940501605e-06, + "loss": 1.0402, + "step": 1836 + }, + { + "epoch": 0.19319810167352466, + "grad_norm": 3.121589794261374, + "learning_rate": 4.591111997811427e-06, + "loss": 1.0245, + "step": 1837 + }, + { + "epoch": 0.193303272115372, + "grad_norm": 3.620780997081411, + "learning_rate": 4.590654357431573e-06, + "loss": 1.0185, + "step": 1838 + }, + { + "epoch": 0.19340844255721928, + "grad_norm": 2.5408636903460775, + "learning_rate": 4.590196483927517e-06, + "loss": 1.0493, + "step": 1839 + }, + { + "epoch": 0.1935136129990666, + "grad_norm": 2.642817968832206, + "learning_rate": 4.589738377350316e-06, + "loss": 1.0139, + "step": 1840 + }, + { + "epoch": 0.19361878344091393, + "grad_norm": 2.461137436664243, + "learning_rate": 4.589280037751052e-06, + "loss": 1.048, + "step": 1841 + }, + { + "epoch": 0.19372395388276126, + "grad_norm": 2.2490071042179625, + "learning_rate": 4.5888214651808325e-06, + "loss": 1.014, + "step": 1842 + }, + { + "epoch": 0.19382912432460858, + "grad_norm": 2.4662682458707255, + "learning_rate": 4.5883626596907945e-06, + "loss": 1.0227, + "step": 1843 + }, + { + "epoch": 0.19393429476645588, + "grad_norm": 4.025300912872622, + "learning_rate": 4.587903621332097e-06, + "loss": 1.0394, + "step": 1844 + }, + { + "epoch": 0.1940394652083032, + "grad_norm": 3.0814943588959722, + "learning_rate": 4.5874443501559265e-06, + "loss": 1.0244, + "step": 1845 + }, + { + "epoch": 0.19414463565015053, + "grad_norm": 3.5317457168193074, + "learning_rate": 4.586984846213494e-06, + "loss": 1.0279, + "step": 1846 + }, + { + "epoch": 0.19424980609199785, + "grad_norm": 3.5877904213974934, + "learning_rate": 4.586525109556039e-06, + "loss": 1.0146, + "step": 1847 + }, + { + "epoch": 0.19435497653384518, + "grad_norm": 3.2081496657086075, + "learning_rate": 4.586065140234827e-06, + "loss": 1.022, + "step": 1848 + }, + { + "epoch": 0.19446014697569247, + "grad_norm": 
2.0788025293634678, + "learning_rate": 4.585604938301146e-06, + "loss": 1.0051, + "step": 1849 + }, + { + "epoch": 0.1945653174175398, + "grad_norm": 2.894336944770527, + "learning_rate": 4.585144503806312e-06, + "loss": 1.0676, + "step": 1850 + }, + { + "epoch": 0.19467048785938712, + "grad_norm": 2.877696747200756, + "learning_rate": 4.584683836801669e-06, + "loss": 1.0292, + "step": 1851 + }, + { + "epoch": 0.19477565830123444, + "grad_norm": 4.55585031976607, + "learning_rate": 4.584222937338584e-06, + "loss": 1.0615, + "step": 1852 + }, + { + "epoch": 0.19488082874308177, + "grad_norm": 1.7959609995555694, + "learning_rate": 4.583761805468449e-06, + "loss": 0.967, + "step": 1853 + }, + { + "epoch": 0.19498599918492907, + "grad_norm": 2.742630215241344, + "learning_rate": 4.583300441242688e-06, + "loss": 0.9956, + "step": 1854 + }, + { + "epoch": 0.1950911696267764, + "grad_norm": 2.6981352318462974, + "learning_rate": 4.582838844712741e-06, + "loss": 1.0238, + "step": 1855 + }, + { + "epoch": 0.1951963400686237, + "grad_norm": 3.463326781342451, + "learning_rate": 4.582377015930085e-06, + "loss": 1.0759, + "step": 1856 + }, + { + "epoch": 0.19530151051047104, + "grad_norm": 2.4494104426262138, + "learning_rate": 4.581914954946215e-06, + "loss": 1.013, + "step": 1857 + }, + { + "epoch": 0.19540668095231836, + "grad_norm": 2.2066591737539865, + "learning_rate": 4.581452661812655e-06, + "loss": 1.0223, + "step": 1858 + }, + { + "epoch": 0.19551185139416566, + "grad_norm": 3.3773728745426803, + "learning_rate": 4.5809901365809524e-06, + "loss": 1.0246, + "step": 1859 + }, + { + "epoch": 0.19561702183601298, + "grad_norm": 2.5954532792428218, + "learning_rate": 4.580527379302685e-06, + "loss": 1.0264, + "step": 1860 + }, + { + "epoch": 0.1957221922778603, + "grad_norm": 2.651276443421835, + "learning_rate": 4.580064390029452e-06, + "loss": 1.0487, + "step": 1861 + }, + { + "epoch": 0.19582736271970763, + "grad_norm": 2.6830451292812834, + "learning_rate": 4.579601168812882e-06, + "loss": 1.037, + "step": 1862 + }, + { + "epoch": 0.19593253316155496, + "grad_norm": 3.2055413372462813, + "learning_rate": 4.579137715704626e-06, + "loss": 1.0315, + "step": 1863 + }, + { + "epoch": 0.19603770360340225, + "grad_norm": 2.144505459852746, + "learning_rate": 4.578674030756364e-06, + "loss": 0.9836, + "step": 1864 + }, + { + "epoch": 0.19614287404524958, + "grad_norm": 2.630253444185954, + "learning_rate": 4.578210114019799e-06, + "loss": 1.0294, + "step": 1865 + }, + { + "epoch": 0.1962480444870969, + "grad_norm": 3.8376888541418306, + "learning_rate": 4.577745965546662e-06, + "loss": 0.9988, + "step": 1866 + }, + { + "epoch": 0.19635321492894423, + "grad_norm": 2.068518479777229, + "learning_rate": 4.577281585388711e-06, + "loss": 1.0126, + "step": 1867 + }, + { + "epoch": 0.19645838537079155, + "grad_norm": 2.4680664224880497, + "learning_rate": 4.576816973597725e-06, + "loss": 1.013, + "step": 1868 + }, + { + "epoch": 0.19656355581263885, + "grad_norm": 3.2990236331625615, + "learning_rate": 4.576352130225513e-06, + "loss": 1.0228, + "step": 1869 + }, + { + "epoch": 0.19666872625448617, + "grad_norm": 2.772397789423621, + "learning_rate": 4.5758870553239095e-06, + "loss": 0.9804, + "step": 1870 + }, + { + "epoch": 0.1967738966963335, + "grad_norm": 3.1931620564914747, + "learning_rate": 4.575421748944773e-06, + "loss": 1.0534, + "step": 1871 + }, + { + "epoch": 0.19687906713818082, + "grad_norm": 2.4312586968940426, + "learning_rate": 4.574956211139989e-06, + "loss": 1.0221, + "step": 1872 + }, 
+ { + "epoch": 0.19698423758002814, + "grad_norm": 1.9836984160196345, + "learning_rate": 4.574490441961469e-06, + "loss": 1.0625, + "step": 1873 + }, + { + "epoch": 0.19708940802187544, + "grad_norm": 2.687082682874049, + "learning_rate": 4.57402444146115e-06, + "loss": 1.0262, + "step": 1874 + }, + { + "epoch": 0.19719457846372276, + "grad_norm": 3.29954290699376, + "learning_rate": 4.573558209690993e-06, + "loss": 1.0292, + "step": 1875 + }, + { + "epoch": 0.1972997489055701, + "grad_norm": 2.5564424315027505, + "learning_rate": 4.573091746702988e-06, + "loss": 1.0151, + "step": 1876 + }, + { + "epoch": 0.1974049193474174, + "grad_norm": 3.4128408599390716, + "learning_rate": 4.572625052549149e-06, + "loss": 1.0509, + "step": 1877 + }, + { + "epoch": 0.19751008978926474, + "grad_norm": 2.754362006103999, + "learning_rate": 4.572158127281516e-06, + "loss": 0.9928, + "step": 1878 + }, + { + "epoch": 0.19761526023111203, + "grad_norm": 2.737808240634724, + "learning_rate": 4.571690970952155e-06, + "loss": 1.0393, + "step": 1879 + }, + { + "epoch": 0.19772043067295936, + "grad_norm": 2.5757192677948813, + "learning_rate": 4.571223583613157e-06, + "loss": 1.0617, + "step": 1880 + }, + { + "epoch": 0.19782560111480668, + "grad_norm": 2.622124067123704, + "learning_rate": 4.570755965316639e-06, + "loss": 0.9885, + "step": 1881 + }, + { + "epoch": 0.197930771556654, + "grad_norm": 2.5421671476785, + "learning_rate": 4.570288116114745e-06, + "loss": 0.9976, + "step": 1882 + }, + { + "epoch": 0.19803594199850133, + "grad_norm": 2.8283659577999583, + "learning_rate": 4.569820036059644e-06, + "loss": 1.054, + "step": 1883 + }, + { + "epoch": 0.19814111244034863, + "grad_norm": 2.4526889476942695, + "learning_rate": 4.56935172520353e-06, + "loss": 1.0015, + "step": 1884 + }, + { + "epoch": 0.19824628288219595, + "grad_norm": 1.8002824330773355, + "learning_rate": 4.568883183598623e-06, + "loss": 1.0031, + "step": 1885 + }, + { + "epoch": 0.19835145332404328, + "grad_norm": 1.7744689934585205, + "learning_rate": 4.56841441129717e-06, + "loss": 1.0815, + "step": 1886 + }, + { + "epoch": 0.1984566237658906, + "grad_norm": 3.315507706913469, + "learning_rate": 4.567945408351441e-06, + "loss": 1.007, + "step": 1887 + }, + { + "epoch": 0.19856179420773792, + "grad_norm": 2.5040098416445953, + "learning_rate": 4.567476174813735e-06, + "loss": 1.0166, + "step": 1888 + }, + { + "epoch": 0.19866696464958522, + "grad_norm": 3.0590946518561544, + "learning_rate": 4.567006710736375e-06, + "loss": 1.0333, + "step": 1889 + }, + { + "epoch": 0.19877213509143254, + "grad_norm": 1.9729874214632432, + "learning_rate": 4.5665370161717095e-06, + "loss": 1.0099, + "step": 1890 + }, + { + "epoch": 0.19887730553327987, + "grad_norm": 2.646128060765525, + "learning_rate": 4.566067091172114e-06, + "loss": 0.9994, + "step": 1891 + }, + { + "epoch": 0.1989824759751272, + "grad_norm": 2.39440777880957, + "learning_rate": 4.565596935789987e-06, + "loss": 1.0348, + "step": 1892 + }, + { + "epoch": 0.19908764641697452, + "grad_norm": 3.5153787075984586, + "learning_rate": 4.5651265500777564e-06, + "loss": 1.0805, + "step": 1893 + }, + { + "epoch": 0.19919281685882184, + "grad_norm": 2.5557058180370955, + "learning_rate": 4.564655934087873e-06, + "loss": 1.0109, + "step": 1894 + }, + { + "epoch": 0.19929798730066914, + "grad_norm": 2.541551289434198, + "learning_rate": 4.564185087872814e-06, + "loss": 1.0578, + "step": 1895 + }, + { + "epoch": 0.19940315774251646, + "grad_norm": 2.222619391054327, + "learning_rate": 
4.563714011485082e-06, + "loss": 1.0541, + "step": 1896 + }, + { + "epoch": 0.1995083281843638, + "grad_norm": 3.635232836192664, + "learning_rate": 4.563242704977206e-06, + "loss": 1.0528, + "step": 1897 + }, + { + "epoch": 0.1996134986262111, + "grad_norm": 2.804337831993397, + "learning_rate": 4.562771168401742e-06, + "loss": 1.033, + "step": 1898 + }, + { + "epoch": 0.19971866906805844, + "grad_norm": 2.940386059188817, + "learning_rate": 4.562299401811268e-06, + "loss": 1.0787, + "step": 1899 + }, + { + "epoch": 0.19982383950990573, + "grad_norm": 2.516512929364484, + "learning_rate": 4.56182740525839e-06, + "loss": 1.0043, + "step": 1900 + }, + { + "epoch": 0.19992900995175306, + "grad_norm": 2.8419554559032596, + "learning_rate": 4.561355178795739e-06, + "loss": 1.0395, + "step": 1901 + }, + { + "epoch": 0.20003418039360038, + "grad_norm": 3.203241148860607, + "learning_rate": 4.560882722475973e-06, + "loss": 1.0354, + "step": 1902 + }, + { + "epoch": 0.2001393508354477, + "grad_norm": 2.9994247587115233, + "learning_rate": 4.560410036351774e-06, + "loss": 1.0102, + "step": 1903 + }, + { + "epoch": 0.20024452127729503, + "grad_norm": 2.5106634446501164, + "learning_rate": 4.559937120475849e-06, + "loss": 1.0177, + "step": 1904 + }, + { + "epoch": 0.20034969171914233, + "grad_norm": 3.201430008264517, + "learning_rate": 4.559463974900934e-06, + "loss": 1.0222, + "step": 1905 + }, + { + "epoch": 0.20045486216098965, + "grad_norm": 3.5626303781607938, + "learning_rate": 4.558990599679787e-06, + "loss": 1.021, + "step": 1906 + }, + { + "epoch": 0.20056003260283697, + "grad_norm": 2.3292692746627024, + "learning_rate": 4.558516994865194e-06, + "loss": 1.0364, + "step": 1907 + }, + { + "epoch": 0.2006652030446843, + "grad_norm": 2.747377159004723, + "learning_rate": 4.558043160509964e-06, + "loss": 1.0101, + "step": 1908 + }, + { + "epoch": 0.20077037348653162, + "grad_norm": 2.876129833597873, + "learning_rate": 4.557569096666934e-06, + "loss": 0.9977, + "step": 1909 + }, + { + "epoch": 0.20087554392837892, + "grad_norm": 3.8381922934160744, + "learning_rate": 4.5570948033889675e-06, + "loss": 1.0311, + "step": 1910 + }, + { + "epoch": 0.20098071437022624, + "grad_norm": 2.5502637033749447, + "learning_rate": 4.5566202807289485e-06, + "loss": 1.0031, + "step": 1911 + }, + { + "epoch": 0.20108588481207357, + "grad_norm": 2.07505839155443, + "learning_rate": 4.556145528739793e-06, + "loss": 1.0599, + "step": 1912 + }, + { + "epoch": 0.2011910552539209, + "grad_norm": 2.8177451858148355, + "learning_rate": 4.555670547474438e-06, + "loss": 1.0686, + "step": 1913 + }, + { + "epoch": 0.20129622569576822, + "grad_norm": 4.173041129558708, + "learning_rate": 4.555195336985848e-06, + "loss": 1.0301, + "step": 1914 + }, + { + "epoch": 0.2014013961376155, + "grad_norm": 2.552074293804037, + "learning_rate": 4.554719897327013e-06, + "loss": 1.0075, + "step": 1915 + }, + { + "epoch": 0.20150656657946284, + "grad_norm": 3.3799408110891975, + "learning_rate": 4.554244228550947e-06, + "loss": 1.0219, + "step": 1916 + }, + { + "epoch": 0.20161173702131016, + "grad_norm": 2.9673353024375415, + "learning_rate": 4.5537683307106924e-06, + "loss": 1.0484, + "step": 1917 + }, + { + "epoch": 0.20171690746315749, + "grad_norm": 2.8301509680066084, + "learning_rate": 4.553292203859314e-06, + "loss": 1.0586, + "step": 1918 + }, + { + "epoch": 0.2018220779050048, + "grad_norm": 3.757513825220459, + "learning_rate": 4.552815848049904e-06, + "loss": 0.9888, + "step": 1919 + }, + { + "epoch": 0.2019272483468521, + 
"grad_norm": 2.158466766123102, + "learning_rate": 4.552339263335581e-06, + "loss": 1.0265, + "step": 1920 + }, + { + "epoch": 0.20203241878869943, + "grad_norm": 4.206006355483029, + "learning_rate": 4.551862449769487e-06, + "loss": 1.0384, + "step": 1921 + }, + { + "epoch": 0.20213758923054675, + "grad_norm": 2.713465885037005, + "learning_rate": 4.5513854074047905e-06, + "loss": 1.0314, + "step": 1922 + }, + { + "epoch": 0.20224275967239408, + "grad_norm": 2.159072474282689, + "learning_rate": 4.550908136294685e-06, + "loss": 1.0454, + "step": 1923 + }, + { + "epoch": 0.2023479301142414, + "grad_norm": 2.5490149894200846, + "learning_rate": 4.55043063649239e-06, + "loss": 1.0143, + "step": 1924 + }, + { + "epoch": 0.2024531005560887, + "grad_norm": 2.9457442371208815, + "learning_rate": 4.549952908051151e-06, + "loss": 1.0303, + "step": 1925 + }, + { + "epoch": 0.20255827099793602, + "grad_norm": 3.717268820987839, + "learning_rate": 4.549474951024238e-06, + "loss": 1.0308, + "step": 1926 + }, + { + "epoch": 0.20266344143978335, + "grad_norm": 2.6781806036533222, + "learning_rate": 4.548996765464947e-06, + "loss": 1.0291, + "step": 1927 + }, + { + "epoch": 0.20276861188163067, + "grad_norm": 2.18546757845666, + "learning_rate": 4.5485183514266004e-06, + "loss": 1.0024, + "step": 1928 + }, + { + "epoch": 0.202873782323478, + "grad_norm": 2.510233734668508, + "learning_rate": 4.548039708962544e-06, + "loss": 1.0261, + "step": 1929 + }, + { + "epoch": 0.2029789527653253, + "grad_norm": 2.704997908914001, + "learning_rate": 4.547560838126149e-06, + "loss": 0.9926, + "step": 1930 + }, + { + "epoch": 0.20308412320717262, + "grad_norm": 2.5107441763705647, + "learning_rate": 4.5470817389708155e-06, + "loss": 1.0371, + "step": 1931 + }, + { + "epoch": 0.20318929364901994, + "grad_norm": 3.774440786379843, + "learning_rate": 4.546602411549966e-06, + "loss": 1.0473, + "step": 1932 + }, + { + "epoch": 0.20329446409086727, + "grad_norm": 2.3929427473890987, + "learning_rate": 4.546122855917049e-06, + "loss": 1.0211, + "step": 1933 + }, + { + "epoch": 0.2033996345327146, + "grad_norm": 2.150887513168131, + "learning_rate": 4.5456430721255384e-06, + "loss": 0.9851, + "step": 1934 + }, + { + "epoch": 0.2035048049745619, + "grad_norm": 2.4012826520584274, + "learning_rate": 4.545163060228934e-06, + "loss": 1.0105, + "step": 1935 + }, + { + "epoch": 0.2036099754164092, + "grad_norm": 2.1182604676351495, + "learning_rate": 4.544682820280762e-06, + "loss": 1.0485, + "step": 1936 + }, + { + "epoch": 0.20371514585825654, + "grad_norm": 3.9728966306261673, + "learning_rate": 4.54420235233457e-06, + "loss": 1.0238, + "step": 1937 + }, + { + "epoch": 0.20382031630010386, + "grad_norm": 3.2576390659692103, + "learning_rate": 4.543721656443938e-06, + "loss": 1.0105, + "step": 1938 + }, + { + "epoch": 0.20392548674195118, + "grad_norm": 2.7319424380077, + "learning_rate": 4.5432407326624635e-06, + "loss": 1.0294, + "step": 1939 + }, + { + "epoch": 0.20403065718379848, + "grad_norm": 3.6616075268684134, + "learning_rate": 4.542759581043775e-06, + "loss": 1.0375, + "step": 1940 + }, + { + "epoch": 0.2041358276256458, + "grad_norm": 2.286876999823875, + "learning_rate": 4.5422782016415255e-06, + "loss": 0.972, + "step": 1941 + }, + { + "epoch": 0.20424099806749313, + "grad_norm": 3.5108681231659533, + "learning_rate": 4.541796594509391e-06, + "loss": 1.0733, + "step": 1942 + }, + { + "epoch": 0.20434616850934045, + "grad_norm": 1.916935649789232, + "learning_rate": 4.541314759701075e-06, + "loss": 1.0334, + "step": 
1943 + }, + { + "epoch": 0.20445133895118778, + "grad_norm": 3.1369552191984047, + "learning_rate": 4.540832697270305e-06, + "loss": 1.0138, + "step": 1944 + }, + { + "epoch": 0.20455650939303507, + "grad_norm": 2.8904365660760942, + "learning_rate": 4.540350407270836e-06, + "loss": 1.0579, + "step": 1945 + }, + { + "epoch": 0.2046616798348824, + "grad_norm": 2.678633223567234, + "learning_rate": 4.539867889756447e-06, + "loss": 1.0582, + "step": 1946 + }, + { + "epoch": 0.20476685027672972, + "grad_norm": 2.5400319091221664, + "learning_rate": 4.539385144780942e-06, + "loss": 1.0247, + "step": 1947 + }, + { + "epoch": 0.20487202071857705, + "grad_norm": 1.8249180058750547, + "learning_rate": 4.538902172398151e-06, + "loss": 1.0025, + "step": 1948 + }, + { + "epoch": 0.20497719116042437, + "grad_norm": 2.39557256136971, + "learning_rate": 4.5384189726619285e-06, + "loss": 1.0428, + "step": 1949 + }, + { + "epoch": 0.20508236160227167, + "grad_norm": 2.0902532581196818, + "learning_rate": 4.537935545626156e-06, + "loss": 1.0141, + "step": 1950 + }, + { + "epoch": 0.205187532044119, + "grad_norm": 2.6266101963885817, + "learning_rate": 4.5374518913447384e-06, + "loss": 1.0491, + "step": 1951 + }, + { + "epoch": 0.20529270248596632, + "grad_norm": 1.8895661108791229, + "learning_rate": 4.536968009871608e-06, + "loss": 1.0318, + "step": 1952 + }, + { + "epoch": 0.20539787292781364, + "grad_norm": 2.1586887149934095, + "learning_rate": 4.536483901260721e-06, + "loss": 1.0529, + "step": 1953 + }, + { + "epoch": 0.20550304336966096, + "grad_norm": 3.064938688866919, + "learning_rate": 4.535999565566058e-06, + "loss": 0.9881, + "step": 1954 + }, + { + "epoch": 0.2056082138115083, + "grad_norm": 3.4278129755911415, + "learning_rate": 4.535515002841628e-06, + "loss": 1.0366, + "step": 1955 + }, + { + "epoch": 0.20571338425335559, + "grad_norm": 2.442730036564765, + "learning_rate": 4.535030213141462e-06, + "loss": 1.0062, + "step": 1956 + }, + { + "epoch": 0.2058185546952029, + "grad_norm": 2.6315188808426813, + "learning_rate": 4.5345451965196196e-06, + "loss": 0.9853, + "step": 1957 + }, + { + "epoch": 0.20592372513705023, + "grad_norm": 3.773145979161978, + "learning_rate": 4.5340599530301826e-06, + "loss": 1.0389, + "step": 1958 + }, + { + "epoch": 0.20602889557889756, + "grad_norm": 2.7833840302672614, + "learning_rate": 4.53357448272726e-06, + "loss": 1.0276, + "step": 1959 + }, + { + "epoch": 0.20613406602074488, + "grad_norm": 3.1775766358378053, + "learning_rate": 4.5330887856649845e-06, + "loss": 0.9929, + "step": 1960 + }, + { + "epoch": 0.20623923646259218, + "grad_norm": 4.593991119183918, + "learning_rate": 4.532602861897516e-06, + "loss": 1.0597, + "step": 1961 + }, + { + "epoch": 0.2063444069044395, + "grad_norm": 2.310336282250504, + "learning_rate": 4.532116711479039e-06, + "loss": 1.0174, + "step": 1962 + }, + { + "epoch": 0.20644957734628683, + "grad_norm": 3.9022900668265668, + "learning_rate": 4.531630334463762e-06, + "loss": 1.079, + "step": 1963 + }, + { + "epoch": 0.20655474778813415, + "grad_norm": 3.760113016517964, + "learning_rate": 4.53114373090592e-06, + "loss": 0.992, + "step": 1964 + }, + { + "epoch": 0.20665991822998148, + "grad_norm": 2.583786767339631, + "learning_rate": 4.5306569008597745e-06, + "loss": 1.0444, + "step": 1965 + }, + { + "epoch": 0.20676508867182877, + "grad_norm": 1.7472575830950914, + "learning_rate": 4.53016984437961e-06, + "loss": 0.9853, + "step": 1966 + }, + { + "epoch": 0.2068702591136761, + "grad_norm": 2.6276414742188097, + 
"learning_rate": 4.529682561519736e-06, + "loss": 1.0232, + "step": 1967 + }, + { + "epoch": 0.20697542955552342, + "grad_norm": 2.4241124519098705, + "learning_rate": 4.52919505233449e-06, + "loss": 1.017, + "step": 1968 + }, + { + "epoch": 0.20708059999737075, + "grad_norm": 3.4436315276636815, + "learning_rate": 4.528707316878233e-06, + "loss": 1.0483, + "step": 1969 + }, + { + "epoch": 0.20718577043921807, + "grad_norm": 3.6255766199501904, + "learning_rate": 4.528219355205349e-06, + "loss": 1.007, + "step": 1970 + }, + { + "epoch": 0.20729094088106537, + "grad_norm": 2.5750935689757917, + "learning_rate": 4.527731167370252e-06, + "loss": 0.991, + "step": 1971 + }, + { + "epoch": 0.2073961113229127, + "grad_norm": 3.527871448351104, + "learning_rate": 4.527242753427378e-06, + "loss": 0.9817, + "step": 1972 + }, + { + "epoch": 0.20750128176476001, + "grad_norm": 3.0602580136672124, + "learning_rate": 4.526754113431188e-06, + "loss": 1.0392, + "step": 1973 + }, + { + "epoch": 0.20760645220660734, + "grad_norm": 2.7611033572598025, + "learning_rate": 4.526265247436171e-06, + "loss": 0.9994, + "step": 1974 + }, + { + "epoch": 0.20771162264845466, + "grad_norm": 2.6859585185179067, + "learning_rate": 4.525776155496838e-06, + "loss": 1.0119, + "step": 1975 + }, + { + "epoch": 0.20781679309030196, + "grad_norm": 2.782628475167393, + "learning_rate": 4.525286837667726e-06, + "loss": 1.0515, + "step": 1976 + }, + { + "epoch": 0.20792196353214928, + "grad_norm": 2.9155913927480275, + "learning_rate": 4.5247972940034e-06, + "loss": 0.9998, + "step": 1977 + }, + { + "epoch": 0.2080271339739966, + "grad_norm": 2.9307391999616335, + "learning_rate": 4.524307524558446e-06, + "loss": 1.0204, + "step": 1978 + }, + { + "epoch": 0.20813230441584393, + "grad_norm": 2.400026037357351, + "learning_rate": 4.523817529387478e-06, + "loss": 1.0126, + "step": 1979 + }, + { + "epoch": 0.20823747485769126, + "grad_norm": 2.116341446271874, + "learning_rate": 4.523327308545133e-06, + "loss": 1.0167, + "step": 1980 + }, + { + "epoch": 0.20834264529953855, + "grad_norm": 3.6019941589879414, + "learning_rate": 4.522836862086076e-06, + "loss": 1.0122, + "step": 1981 + }, + { + "epoch": 0.20844781574138588, + "grad_norm": 2.530136205218369, + "learning_rate": 4.5223461900649945e-06, + "loss": 1.0279, + "step": 1982 + }, + { + "epoch": 0.2085529861832332, + "grad_norm": 2.5456318862932923, + "learning_rate": 4.521855292536603e-06, + "loss": 1.0164, + "step": 1983 + }, + { + "epoch": 0.20865815662508053, + "grad_norm": 3.2036037703970095, + "learning_rate": 4.52136416955564e-06, + "loss": 1.0244, + "step": 1984 + }, + { + "epoch": 0.20876332706692785, + "grad_norm": 2.700422512134757, + "learning_rate": 4.5208728211768696e-06, + "loss": 1.0144, + "step": 1985 + }, + { + "epoch": 0.20886849750877515, + "grad_norm": 2.4414664331229265, + "learning_rate": 4.520381247455081e-06, + "loss": 1.0219, + "step": 1986 + }, + { + "epoch": 0.20897366795062247, + "grad_norm": 2.3814233051604368, + "learning_rate": 4.519889448445088e-06, + "loss": 1.0393, + "step": 1987 + }, + { + "epoch": 0.2090788383924698, + "grad_norm": 2.6970110365603985, + "learning_rate": 4.519397424201731e-06, + "loss": 1.0227, + "step": 1988 + }, + { + "epoch": 0.20918400883431712, + "grad_norm": 3.0579544571066384, + "learning_rate": 4.518905174779874e-06, + "loss": 1.0041, + "step": 1989 + }, + { + "epoch": 0.20928917927616444, + "grad_norm": 3.0701954087018533, + "learning_rate": 4.518412700234407e-06, + "loss": 1.0392, + "step": 1990 + }, + { + "epoch": 
0.20939434971801174, + "grad_norm": 3.10208553663707, + "learning_rate": 4.5179200006202425e-06, + "loss": 0.9866, + "step": 1991 + }, + { + "epoch": 0.20949952015985907, + "grad_norm": 2.5379999852239687, + "learning_rate": 4.517427075992325e-06, + "loss": 0.9833, + "step": 1992 + }, + { + "epoch": 0.2096046906017064, + "grad_norm": 2.898863425583052, + "learning_rate": 4.516933926405614e-06, + "loss": 1.024, + "step": 1993 + }, + { + "epoch": 0.2097098610435537, + "grad_norm": 2.5016600905979134, + "learning_rate": 4.516440551915103e-06, + "loss": 1.0243, + "step": 1994 + }, + { + "epoch": 0.20981503148540104, + "grad_norm": 2.37701429193729, + "learning_rate": 4.5159469525758065e-06, + "loss": 1.0574, + "step": 1995 + }, + { + "epoch": 0.20992020192724833, + "grad_norm": 3.349993368242468, + "learning_rate": 4.515453128442764e-06, + "loss": 0.9991, + "step": 1996 + }, + { + "epoch": 0.21002537236909566, + "grad_norm": 3.041271688381229, + "learning_rate": 4.514959079571042e-06, + "loss": 1.0306, + "step": 1997 + }, + { + "epoch": 0.21013054281094298, + "grad_norm": 3.7606519650863452, + "learning_rate": 4.514464806015729e-06, + "loss": 1.0277, + "step": 1998 + }, + { + "epoch": 0.2102357132527903, + "grad_norm": 2.8528069179863307, + "learning_rate": 4.513970307831941e-06, + "loss": 0.9865, + "step": 1999 + }, + { + "epoch": 0.21034088369463763, + "grad_norm": 3.3798634505812974, + "learning_rate": 4.513475585074819e-06, + "loss": 1.0434, + "step": 2000 + }, + { + "epoch": 0.21044605413648493, + "grad_norm": 3.446977094091343, + "learning_rate": 4.512980637799529e-06, + "loss": 1.0129, + "step": 2001 + }, + { + "epoch": 0.21055122457833225, + "grad_norm": 2.5663183477604483, + "learning_rate": 4.512485466061258e-06, + "loss": 1.0074, + "step": 2002 + }, + { + "epoch": 0.21065639502017958, + "grad_norm": 2.713416190814693, + "learning_rate": 4.511990069915226e-06, + "loss": 1.0175, + "step": 2003 + }, + { + "epoch": 0.2107615654620269, + "grad_norm": 3.017467404478425, + "learning_rate": 4.511494449416671e-06, + "loss": 1.0022, + "step": 2004 + }, + { + "epoch": 0.21086673590387423, + "grad_norm": 2.869407764453783, + "learning_rate": 4.510998604620859e-06, + "loss": 1.0091, + "step": 2005 + }, + { + "epoch": 0.21097190634572152, + "grad_norm": 2.2914275171439624, + "learning_rate": 4.510502535583081e-06, + "loss": 1.027, + "step": 2006 + }, + { + "epoch": 0.21107707678756885, + "grad_norm": 3.1289559026795413, + "learning_rate": 4.510006242358652e-06, + "loss": 1.0192, + "step": 2007 + }, + { + "epoch": 0.21118224722941617, + "grad_norm": 3.365076590082289, + "learning_rate": 4.509509725002913e-06, + "loss": 1.0314, + "step": 2008 + }, + { + "epoch": 0.2112874176712635, + "grad_norm": 2.619162513458224, + "learning_rate": 4.5090129835712286e-06, + "loss": 0.9736, + "step": 2009 + }, + { + "epoch": 0.21139258811311082, + "grad_norm": 2.6581918245692973, + "learning_rate": 4.50851601811899e-06, + "loss": 0.9946, + "step": 2010 + }, + { + "epoch": 0.21149775855495812, + "grad_norm": 3.2402527360258713, + "learning_rate": 4.508018828701613e-06, + "loss": 1.0187, + "step": 2011 + }, + { + "epoch": 0.21160292899680544, + "grad_norm": 3.561750316675759, + "learning_rate": 4.5075214153745375e-06, + "loss": 1.0131, + "step": 2012 + }, + { + "epoch": 0.21170809943865276, + "grad_norm": 3.6081378607990815, + "learning_rate": 4.507023778193229e-06, + "loss": 1.0536, + "step": 2013 + }, + { + "epoch": 0.2118132698805001, + "grad_norm": 3.98061138379272, + "learning_rate": 4.506525917213178e-06, + 
"loss": 0.9886, + "step": 2014 + }, + { + "epoch": 0.2119184403223474, + "grad_norm": 3.6286847213673656, + "learning_rate": 4.506027832489901e-06, + "loss": 1.0802, + "step": 2015 + }, + { + "epoch": 0.21202361076419474, + "grad_norm": 2.970920799122295, + "learning_rate": 4.505529524078936e-06, + "loss": 0.9943, + "step": 2016 + }, + { + "epoch": 0.21212878120604203, + "grad_norm": 3.1998864392209034, + "learning_rate": 4.505030992035851e-06, + "loss": 1.0541, + "step": 2017 + }, + { + "epoch": 0.21223395164788936, + "grad_norm": 3.5531939185269117, + "learning_rate": 4.504532236416234e-06, + "loss": 1.0057, + "step": 2018 + }, + { + "epoch": 0.21233912208973668, + "grad_norm": 2.6807118334186173, + "learning_rate": 4.504033257275701e-06, + "loss": 0.989, + "step": 2019 + }, + { + "epoch": 0.212444292531584, + "grad_norm": 3.709260634562878, + "learning_rate": 4.5035340546698915e-06, + "loss": 1.0069, + "step": 2020 + }, + { + "epoch": 0.21254946297343133, + "grad_norm": 3.0800163674767655, + "learning_rate": 4.503034628654472e-06, + "loss": 1.0188, + "step": 2021 + }, + { + "epoch": 0.21265463341527863, + "grad_norm": 2.827473390632371, + "learning_rate": 4.50253497928513e-06, + "loss": 1.0189, + "step": 2022 + }, + { + "epoch": 0.21275980385712595, + "grad_norm": 3.293367596776558, + "learning_rate": 4.502035106617583e-06, + "loss": 0.9761, + "step": 2023 + }, + { + "epoch": 0.21286497429897328, + "grad_norm": 2.9764561882025093, + "learning_rate": 4.501535010707569e-06, + "loss": 1.0117, + "step": 2024 + }, + { + "epoch": 0.2129701447408206, + "grad_norm": 1.8974311339827867, + "learning_rate": 4.501034691610852e-06, + "loss": 1.0089, + "step": 2025 + }, + { + "epoch": 0.21307531518266792, + "grad_norm": 3.113046982013976, + "learning_rate": 4.500534149383222e-06, + "loss": 0.9953, + "step": 2026 + }, + { + "epoch": 0.21318048562451522, + "grad_norm": 2.5218275940026307, + "learning_rate": 4.5000333840804945e-06, + "loss": 1.0573, + "step": 2027 + }, + { + "epoch": 0.21328565606636254, + "grad_norm": 2.1504903869654974, + "learning_rate": 4.499532395758507e-06, + "loss": 1.0295, + "step": 2028 + }, + { + "epoch": 0.21339082650820987, + "grad_norm": 2.116485409339894, + "learning_rate": 4.499031184473125e-06, + "loss": 1.0145, + "step": 2029 + }, + { + "epoch": 0.2134959969500572, + "grad_norm": 2.1971557306143583, + "learning_rate": 4.498529750280237e-06, + "loss": 0.9975, + "step": 2030 + }, + { + "epoch": 0.21360116739190452, + "grad_norm": 2.441106240316848, + "learning_rate": 4.4980280932357565e-06, + "loss": 1.0324, + "step": 2031 + }, + { + "epoch": 0.2137063378337518, + "grad_norm": 1.8648832005177987, + "learning_rate": 4.4975262133956235e-06, + "loss": 0.976, + "step": 2032 + }, + { + "epoch": 0.21381150827559914, + "grad_norm": 2.393559217891105, + "learning_rate": 4.497024110815799e-06, + "loss": 1.0236, + "step": 2033 + }, + { + "epoch": 0.21391667871744646, + "grad_norm": 3.4843636423614175, + "learning_rate": 4.496521785552273e-06, + "loss": 1.0423, + "step": 2034 + }, + { + "epoch": 0.2140218491592938, + "grad_norm": 2.9511317676008297, + "learning_rate": 4.496019237661059e-06, + "loss": 1.0201, + "step": 2035 + }, + { + "epoch": 0.2141270196011411, + "grad_norm": 2.554507983535873, + "learning_rate": 4.495516467198193e-06, + "loss": 0.9985, + "step": 2036 + }, + { + "epoch": 0.2142321900429884, + "grad_norm": 2.5051574205030973, + "learning_rate": 4.49501347421974e-06, + "loss": 0.9961, + "step": 2037 + }, + { + "epoch": 0.21433736048483573, + "grad_norm": 
2.148199842957758, + "learning_rate": 4.494510258781788e-06, + "loss": 0.9899, + "step": 2038 + }, + { + "epoch": 0.21444253092668306, + "grad_norm": 3.4634861798471466, + "learning_rate": 4.4940068209404465e-06, + "loss": 1.0501, + "step": 2039 + }, + { + "epoch": 0.21454770136853038, + "grad_norm": 3.022603989678606, + "learning_rate": 4.493503160751855e-06, + "loss": 0.999, + "step": 2040 + }, + { + "epoch": 0.2146528718103777, + "grad_norm": 2.595687398370017, + "learning_rate": 4.4929992782721755e-06, + "loss": 1.0535, + "step": 2041 + }, + { + "epoch": 0.214758042252225, + "grad_norm": 2.3610745008748033, + "learning_rate": 4.492495173557594e-06, + "loss": 0.9964, + "step": 2042 + }, + { + "epoch": 0.21486321269407233, + "grad_norm": 2.523798173897767, + "learning_rate": 4.491990846664322e-06, + "loss": 1.0458, + "step": 2043 + }, + { + "epoch": 0.21496838313591965, + "grad_norm": 2.768262718642694, + "learning_rate": 4.4914862976485975e-06, + "loss": 1.0412, + "step": 2044 + }, + { + "epoch": 0.21507355357776697, + "grad_norm": 3.5933411780857605, + "learning_rate": 4.4909815265666786e-06, + "loss": 1.0408, + "step": 2045 + }, + { + "epoch": 0.2151787240196143, + "grad_norm": 1.859821990629818, + "learning_rate": 4.490476533474854e-06, + "loss": 1.033, + "step": 2046 + }, + { + "epoch": 0.2152838944614616, + "grad_norm": 2.952652314844579, + "learning_rate": 4.4899713184294336e-06, + "loss": 1.013, + "step": 2047 + }, + { + "epoch": 0.21538906490330892, + "grad_norm": 3.0877949687525, + "learning_rate": 4.489465881486753e-06, + "loss": 1.065, + "step": 2048 + }, + { + "epoch": 0.21549423534515624, + "grad_norm": 2.1468116186309043, + "learning_rate": 4.48896022270317e-06, + "loss": 0.9845, + "step": 2049 + }, + { + "epoch": 0.21559940578700357, + "grad_norm": 3.0848353636050745, + "learning_rate": 4.488454342135074e-06, + "loss": 1.0412, + "step": 2050 + }, + { + "epoch": 0.2157045762288509, + "grad_norm": 2.827181553896088, + "learning_rate": 4.487948239838869e-06, + "loss": 0.957, + "step": 2051 + }, + { + "epoch": 0.2158097466706982, + "grad_norm": 2.722663933189815, + "learning_rate": 4.487441915870995e-06, + "loss": 0.9992, + "step": 2052 + }, + { + "epoch": 0.2159149171125455, + "grad_norm": 2.996172795915466, + "learning_rate": 4.486935370287907e-06, + "loss": 1.0284, + "step": 2053 + }, + { + "epoch": 0.21602008755439284, + "grad_norm": 3.6047336301727637, + "learning_rate": 4.48642860314609e-06, + "loss": 1.0249, + "step": 2054 + }, + { + "epoch": 0.21612525799624016, + "grad_norm": 2.867533696478624, + "learning_rate": 4.485921614502054e-06, + "loss": 1.0379, + "step": 2055 + }, + { + "epoch": 0.21623042843808749, + "grad_norm": 3.176123229551431, + "learning_rate": 4.485414404412329e-06, + "loss": 1.0427, + "step": 2056 + }, + { + "epoch": 0.21633559887993478, + "grad_norm": 2.9533715243960965, + "learning_rate": 4.484906972933476e-06, + "loss": 0.9631, + "step": 2057 + }, + { + "epoch": 0.2164407693217821, + "grad_norm": 2.8688315629005716, + "learning_rate": 4.484399320122075e-06, + "loss": 1.0105, + "step": 2058 + }, + { + "epoch": 0.21654593976362943, + "grad_norm": 3.5024671051619136, + "learning_rate": 4.483891446034736e-06, + "loss": 1.0587, + "step": 2059 + }, + { + "epoch": 0.21665111020547675, + "grad_norm": 1.901326742128866, + "learning_rate": 4.4833833507280884e-06, + "loss": 1.0266, + "step": 2060 + }, + { + "epoch": 0.21675628064732408, + "grad_norm": 2.5650512457429966, + "learning_rate": 4.4828750342587895e-06, + "loss": 1.0043, + "step": 2061 + }, + { + 
"epoch": 0.21686145108917138, + "grad_norm": 2.448189051315377, + "learning_rate": 4.482366496683521e-06, + "loss": 1.0479, + "step": 2062 + }, + { + "epoch": 0.2169666215310187, + "grad_norm": 2.824309733500601, + "learning_rate": 4.481857738058989e-06, + "loss": 0.988, + "step": 2063 + }, + { + "epoch": 0.21707179197286602, + "grad_norm": 3.056046040580293, + "learning_rate": 4.481348758441923e-06, + "loss": 1.017, + "step": 2064 + }, + { + "epoch": 0.21717696241471335, + "grad_norm": 2.0164976145859885, + "learning_rate": 4.480839557889079e-06, + "loss": 1.0477, + "step": 2065 + }, + { + "epoch": 0.21728213285656067, + "grad_norm": 2.9759205852579274, + "learning_rate": 4.480330136457237e-06, + "loss": 1.0313, + "step": 2066 + }, + { + "epoch": 0.21738730329840797, + "grad_norm": 4.396630703796807, + "learning_rate": 4.4798204942032005e-06, + "loss": 1.0554, + "step": 2067 + }, + { + "epoch": 0.2174924737402553, + "grad_norm": 3.2600670571924453, + "learning_rate": 4.4793106311838e-06, + "loss": 1.0142, + "step": 2068 + }, + { + "epoch": 0.21759764418210262, + "grad_norm": 2.554456195461736, + "learning_rate": 4.478800547455887e-06, + "loss": 1.033, + "step": 2069 + }, + { + "epoch": 0.21770281462394994, + "grad_norm": 1.8748589681364354, + "learning_rate": 4.478290243076342e-06, + "loss": 0.9785, + "step": 2070 + }, + { + "epoch": 0.21780798506579727, + "grad_norm": 3.558459347898174, + "learning_rate": 4.4777797181020675e-06, + "loss": 1.0148, + "step": 2071 + }, + { + "epoch": 0.2179131555076446, + "grad_norm": 2.4480561013052253, + "learning_rate": 4.477268972589989e-06, + "loss": 1.0506, + "step": 2072 + }, + { + "epoch": 0.2180183259494919, + "grad_norm": 3.167847874819189, + "learning_rate": 4.476758006597061e-06, + "loss": 1.0134, + "step": 2073 + }, + { + "epoch": 0.2181234963913392, + "grad_norm": 2.035096373195341, + "learning_rate": 4.476246820180259e-06, + "loss": 0.9828, + "step": 2074 + }, + { + "epoch": 0.21822866683318654, + "grad_norm": 3.0180664267026978, + "learning_rate": 4.475735413396585e-06, + "loss": 0.9717, + "step": 2075 + }, + { + "epoch": 0.21833383727503386, + "grad_norm": 2.937593336372816, + "learning_rate": 4.475223786303064e-06, + "loss": 1.0418, + "step": 2076 + }, + { + "epoch": 0.21843900771688118, + "grad_norm": 3.1853321519938125, + "learning_rate": 4.474711938956747e-06, + "loss": 1.0308, + "step": 2077 + }, + { + "epoch": 0.21854417815872848, + "grad_norm": 2.781640730371229, + "learning_rate": 4.4741998714147085e-06, + "loss": 0.9788, + "step": 2078 + }, + { + "epoch": 0.2186493486005758, + "grad_norm": 2.97926045372165, + "learning_rate": 4.473687583734048e-06, + "loss": 1.0543, + "step": 2079 + }, + { + "epoch": 0.21875451904242313, + "grad_norm": 3.427682301198862, + "learning_rate": 4.473175075971889e-06, + "loss": 1.0172, + "step": 2080 + }, + { + "epoch": 0.21885968948427045, + "grad_norm": 2.594403559813018, + "learning_rate": 4.472662348185382e-06, + "loss": 1.031, + "step": 2081 + }, + { + "epoch": 0.21896485992611778, + "grad_norm": 3.1273306327310193, + "learning_rate": 4.472149400431699e-06, + "loss": 1.0469, + "step": 2082 + }, + { + "epoch": 0.21907003036796507, + "grad_norm": 2.604066679542293, + "learning_rate": 4.471636232768038e-06, + "loss": 0.9904, + "step": 2083 + }, + { + "epoch": 0.2191752008098124, + "grad_norm": 2.4026133544836767, + "learning_rate": 4.47112284525162e-06, + "loss": 1.0221, + "step": 2084 + }, + { + "epoch": 0.21928037125165972, + "grad_norm": 3.599921055468287, + "learning_rate": 4.470609237939693e-06, + 
"loss": 0.9989, + "step": 2085 + }, + { + "epoch": 0.21938554169350705, + "grad_norm": 2.131221331900976, + "learning_rate": 4.470095410889528e-06, + "loss": 0.9947, + "step": 2086 + }, + { + "epoch": 0.21949071213535437, + "grad_norm": 2.1497035619992104, + "learning_rate": 4.46958136415842e-06, + "loss": 1.0332, + "step": 2087 + }, + { + "epoch": 0.21959588257720167, + "grad_norm": 2.3616472688438406, + "learning_rate": 4.469067097803689e-06, + "loss": 1.0208, + "step": 2088 + }, + { + "epoch": 0.219701053019049, + "grad_norm": 2.6359552878027674, + "learning_rate": 4.46855261188268e-06, + "loss": 1.0618, + "step": 2089 + }, + { + "epoch": 0.21980622346089632, + "grad_norm": 1.9018280297999302, + "learning_rate": 4.468037906452765e-06, + "loss": 1.006, + "step": 2090 + }, + { + "epoch": 0.21991139390274364, + "grad_norm": 2.5721362754088517, + "learning_rate": 4.4675229815713324e-06, + "loss": 1.0456, + "step": 2091 + }, + { + "epoch": 0.22001656434459096, + "grad_norm": 2.954980814994551, + "learning_rate": 4.467007837295805e-06, + "loss": 1.0167, + "step": 2092 + }, + { + "epoch": 0.22012173478643826, + "grad_norm": 3.3860189739598643, + "learning_rate": 4.466492473683624e-06, + "loss": 1.0261, + "step": 2093 + }, + { + "epoch": 0.22022690522828559, + "grad_norm": 2.9650931599791757, + "learning_rate": 4.465976890792254e-06, + "loss": 1.0466, + "step": 2094 + }, + { + "epoch": 0.2203320756701329, + "grad_norm": 3.104064301699326, + "learning_rate": 4.465461088679189e-06, + "loss": 1.0256, + "step": 2095 + }, + { + "epoch": 0.22043724611198023, + "grad_norm": 2.636686526867292, + "learning_rate": 4.464945067401945e-06, + "loss": 1.0324, + "step": 2096 + }, + { + "epoch": 0.22054241655382756, + "grad_norm": 2.9763466587493896, + "learning_rate": 4.464428827018062e-06, + "loss": 1.0048, + "step": 2097 + }, + { + "epoch": 0.22064758699567485, + "grad_norm": 2.658483317192696, + "learning_rate": 4.4639123675851035e-06, + "loss": 1.0172, + "step": 2098 + }, + { + "epoch": 0.22075275743752218, + "grad_norm": 2.70559891063467, + "learning_rate": 4.46339568916066e-06, + "loss": 1.0571, + "step": 2099 + }, + { + "epoch": 0.2208579278793695, + "grad_norm": 2.880535256255276, + "learning_rate": 4.462878791802345e-06, + "loss": 1.0431, + "step": 2100 + }, + { + "epoch": 0.22096309832121683, + "grad_norm": 3.233937870847447, + "learning_rate": 4.4623616755677965e-06, + "loss": 0.9888, + "step": 2101 + }, + { + "epoch": 0.22106826876306415, + "grad_norm": 2.804108237608663, + "learning_rate": 4.461844340514678e-06, + "loss": 1.0465, + "step": 2102 + }, + { + "epoch": 0.22117343920491145, + "grad_norm": 3.2318628882501295, + "learning_rate": 4.4613267867006745e-06, + "loss": 1.0353, + "step": 2103 + }, + { + "epoch": 0.22127860964675877, + "grad_norm": 3.0591146643434786, + "learning_rate": 4.460809014183498e-06, + "loss": 1.0467, + "step": 2104 + }, + { + "epoch": 0.2213837800886061, + "grad_norm": 3.1219953722891525, + "learning_rate": 4.460291023020885e-06, + "loss": 1.0335, + "step": 2105 + }, + { + "epoch": 0.22148895053045342, + "grad_norm": 2.837039568315429, + "learning_rate": 4.4597728132705944e-06, + "loss": 1.0017, + "step": 2106 + }, + { + "epoch": 0.22159412097230075, + "grad_norm": 2.401393140037482, + "learning_rate": 4.459254384990411e-06, + "loss": 1.0577, + "step": 2107 + }, + { + "epoch": 0.22169929141414804, + "grad_norm": 2.8215140434580905, + "learning_rate": 4.458735738238145e-06, + "loss": 0.9908, + "step": 2108 + }, + { + "epoch": 0.22180446185599537, + "grad_norm": 
3.800737740073966, + "learning_rate": 4.4582168730716264e-06, + "loss": 1.0633, + "step": 2109 + }, + { + "epoch": 0.2219096322978427, + "grad_norm": 2.755738180922399, + "learning_rate": 4.457697789548715e-06, + "loss": 1.0299, + "step": 2110 + }, + { + "epoch": 0.22201480273969001, + "grad_norm": 3.5390554379093477, + "learning_rate": 4.457178487727292e-06, + "loss": 1.0707, + "step": 2111 + }, + { + "epoch": 0.22211997318153734, + "grad_norm": 4.020240288638399, + "learning_rate": 4.456658967665265e-06, + "loss": 1.0235, + "step": 2112 + }, + { + "epoch": 0.22222514362338464, + "grad_norm": 2.7427373031139166, + "learning_rate": 4.4561392294205615e-06, + "loss": 1.0166, + "step": 2113 + }, + { + "epoch": 0.22233031406523196, + "grad_norm": 3.0778943699917094, + "learning_rate": 4.455619273051139e-06, + "loss": 1.035, + "step": 2114 + }, + { + "epoch": 0.22243548450707928, + "grad_norm": 2.747240418850475, + "learning_rate": 4.455099098614975e-06, + "loss": 0.9829, + "step": 2115 + }, + { + "epoch": 0.2225406549489266, + "grad_norm": 3.7757497910125486, + "learning_rate": 4.454578706170075e-06, + "loss": 1.0399, + "step": 2116 + }, + { + "epoch": 0.22264582539077393, + "grad_norm": 3.683295020630938, + "learning_rate": 4.454058095774465e-06, + "loss": 1.0256, + "step": 2117 + }, + { + "epoch": 0.22275099583262123, + "grad_norm": 2.9562137863553235, + "learning_rate": 4.453537267486197e-06, + "loss": 1.027, + "step": 2118 + }, + { + "epoch": 0.22285616627446855, + "grad_norm": 3.658729786664914, + "learning_rate": 4.453016221363349e-06, + "loss": 1.0615, + "step": 2119 + }, + { + "epoch": 0.22296133671631588, + "grad_norm": 2.7902924257034796, + "learning_rate": 4.452494957464021e-06, + "loss": 1.0576, + "step": 2120 + }, + { + "epoch": 0.2230665071581632, + "grad_norm": 2.7787504278125286, + "learning_rate": 4.451973475846337e-06, + "loss": 0.9904, + "step": 2121 + }, + { + "epoch": 0.22317167760001053, + "grad_norm": 3.234868103009726, + "learning_rate": 4.451451776568446e-06, + "loss": 1.0415, + "step": 2122 + }, + { + "epoch": 0.22327684804185782, + "grad_norm": 3.1870487546909323, + "learning_rate": 4.450929859688524e-06, + "loss": 1.0222, + "step": 2123 + }, + { + "epoch": 0.22338201848370515, + "grad_norm": 2.4299350284656125, + "learning_rate": 4.450407725264768e-06, + "loss": 1.0192, + "step": 2124 + }, + { + "epoch": 0.22348718892555247, + "grad_norm": 2.0691045775857115, + "learning_rate": 4.449885373355398e-06, + "loss": 1.0423, + "step": 2125 + }, + { + "epoch": 0.2235923593673998, + "grad_norm": 2.2233933117212628, + "learning_rate": 4.449362804018661e-06, + "loss": 0.9819, + "step": 2126 + }, + { + "epoch": 0.22369752980924712, + "grad_norm": 3.019428274676109, + "learning_rate": 4.44884001731283e-06, + "loss": 1.0295, + "step": 2127 + }, + { + "epoch": 0.22380270025109442, + "grad_norm": 2.659057630935619, + "learning_rate": 4.4483170132961964e-06, + "loss": 0.9956, + "step": 2128 + }, + { + "epoch": 0.22390787069294174, + "grad_norm": 2.830751803296575, + "learning_rate": 4.447793792027082e-06, + "loss": 1.0471, + "step": 2129 + }, + { + "epoch": 0.22401304113478906, + "grad_norm": 3.464512110587927, + "learning_rate": 4.447270353563828e-06, + "loss": 1.0079, + "step": 2130 + }, + { + "epoch": 0.2241182115766364, + "grad_norm": 3.3402290513696724, + "learning_rate": 4.446746697964803e-06, + "loss": 1.0274, + "step": 2131 + }, + { + "epoch": 0.2242233820184837, + "grad_norm": 2.6721352938404586, + "learning_rate": 4.446222825288398e-06, + "loss": 0.9991, + "step": 2132 + 
}, + { + "epoch": 0.22432855246033104, + "grad_norm": 3.3974985750450815, + "learning_rate": 4.44569873559303e-06, + "loss": 1.0168, + "step": 2133 + }, + { + "epoch": 0.22443372290217833, + "grad_norm": 2.272453274425081, + "learning_rate": 4.445174428937137e-06, + "loss": 1.0018, + "step": 2134 + }, + { + "epoch": 0.22453889334402566, + "grad_norm": 2.733296898300132, + "learning_rate": 4.444649905379184e-06, + "loss": 1.0517, + "step": 2135 + }, + { + "epoch": 0.22464406378587298, + "grad_norm": 3.0055966087106563, + "learning_rate": 4.444125164977662e-06, + "loss": 1.0395, + "step": 2136 + }, + { + "epoch": 0.2247492342277203, + "grad_norm": 2.7417512364939682, + "learning_rate": 4.44360020779108e-06, + "loss": 1.0283, + "step": 2137 + }, + { + "epoch": 0.22485440466956763, + "grad_norm": 2.136270049883368, + "learning_rate": 4.443075033877978e-06, + "loss": 1.0313, + "step": 2138 + }, + { + "epoch": 0.22495957511141493, + "grad_norm": 2.623520046793813, + "learning_rate": 4.442549643296915e-06, + "loss": 1.0285, + "step": 2139 + }, + { + "epoch": 0.22506474555326225, + "grad_norm": 2.6185982076249306, + "learning_rate": 4.442024036106476e-06, + "loss": 1.0296, + "step": 2140 + }, + { + "epoch": 0.22516991599510958, + "grad_norm": 2.953502401243642, + "learning_rate": 4.441498212365271e-06, + "loss": 1.0403, + "step": 2141 + }, + { + "epoch": 0.2252750864369569, + "grad_norm": 5.131915620587942, + "learning_rate": 4.440972172131933e-06, + "loss": 1.0135, + "step": 2142 + }, + { + "epoch": 0.22538025687880422, + "grad_norm": 3.2510064788855537, + "learning_rate": 4.440445915465119e-06, + "loss": 1.0226, + "step": 2143 + }, + { + "epoch": 0.22548542732065152, + "grad_norm": 2.2690579029077007, + "learning_rate": 4.439919442423513e-06, + "loss": 1.0519, + "step": 2144 + }, + { + "epoch": 0.22559059776249885, + "grad_norm": 3.028677542365834, + "learning_rate": 4.439392753065818e-06, + "loss": 1.0209, + "step": 2145 + }, + { + "epoch": 0.22569576820434617, + "grad_norm": 2.6183652601612795, + "learning_rate": 4.438865847450766e-06, + "loss": 1.0234, + "step": 2146 + }, + { + "epoch": 0.2258009386461935, + "grad_norm": 2.655490949700378, + "learning_rate": 4.438338725637111e-06, + "loss": 0.9981, + "step": 2147 + }, + { + "epoch": 0.22590610908804082, + "grad_norm": 3.0919304406899104, + "learning_rate": 4.43781138768363e-06, + "loss": 1.0164, + "step": 2148 + }, + { + "epoch": 0.22601127952988811, + "grad_norm": 2.930774431302077, + "learning_rate": 4.437283833649125e-06, + "loss": 0.9999, + "step": 2149 + }, + { + "epoch": 0.22611644997173544, + "grad_norm": 1.6428635453927467, + "learning_rate": 4.436756063592424e-06, + "loss": 0.9752, + "step": 2150 + }, + { + "epoch": 0.22622162041358276, + "grad_norm": 2.3696606960139546, + "learning_rate": 4.4362280775723775e-06, + "loss": 1.0179, + "step": 2151 + }, + { + "epoch": 0.2263267908554301, + "grad_norm": 3.3313866190682817, + "learning_rate": 4.4356998756478585e-06, + "loss": 0.9765, + "step": 2152 + }, + { + "epoch": 0.2264319612972774, + "grad_norm": 2.577906154713765, + "learning_rate": 4.435171457877766e-06, + "loss": 0.97, + "step": 2153 + }, + { + "epoch": 0.2265371317391247, + "grad_norm": 2.503718726560151, + "learning_rate": 4.434642824321023e-06, + "loss": 0.9966, + "step": 2154 + }, + { + "epoch": 0.22664230218097203, + "grad_norm": 2.7189315507014298, + "learning_rate": 4.434113975036577e-06, + "loss": 1.0341, + "step": 2155 + }, + { + "epoch": 0.22674747262281936, + "grad_norm": 2.858149879919419, + "learning_rate": 
4.433584910083397e-06, + "loss": 1.0271, + "step": 2156 + }, + { + "epoch": 0.22685264306466668, + "grad_norm": 2.2845656463626094, + "learning_rate": 4.4330556295204796e-06, + "loss": 0.9709, + "step": 2157 + }, + { + "epoch": 0.226957813506514, + "grad_norm": 2.2220751802170122, + "learning_rate": 4.432526133406843e-06, + "loss": 0.9941, + "step": 2158 + }, + { + "epoch": 0.2270629839483613, + "grad_norm": 2.710945616604557, + "learning_rate": 4.43199642180153e-06, + "loss": 1.0139, + "step": 2159 + }, + { + "epoch": 0.22716815439020863, + "grad_norm": 2.3634868040227235, + "learning_rate": 4.4314664947636075e-06, + "loss": 0.9786, + "step": 2160 + }, + { + "epoch": 0.22727332483205595, + "grad_norm": 2.4612164449548977, + "learning_rate": 4.430936352352167e-06, + "loss": 1.0174, + "step": 2161 + }, + { + "epoch": 0.22737849527390327, + "grad_norm": 2.0768737166088878, + "learning_rate": 4.430405994626323e-06, + "loss": 1.013, + "step": 2162 + }, + { + "epoch": 0.2274836657157506, + "grad_norm": 2.3532257856103374, + "learning_rate": 4.429875421645214e-06, + "loss": 1.0091, + "step": 2163 + }, + { + "epoch": 0.2275888361575979, + "grad_norm": 3.0325316947217362, + "learning_rate": 4.429344633468005e-06, + "loss": 1.028, + "step": 2164 + }, + { + "epoch": 0.22769400659944522, + "grad_norm": 3.017583155543902, + "learning_rate": 4.428813630153881e-06, + "loss": 1.0273, + "step": 2165 + }, + { + "epoch": 0.22779917704129254, + "grad_norm": 2.9673310296779034, + "learning_rate": 4.428282411762054e-06, + "loss": 1.0312, + "step": 2166 + }, + { + "epoch": 0.22790434748313987, + "grad_norm": 2.756113051254936, + "learning_rate": 4.427750978351758e-06, + "loss": 1.0329, + "step": 2167 + }, + { + "epoch": 0.2280095179249872, + "grad_norm": 3.197164204980888, + "learning_rate": 4.427219329982253e-06, + "loss": 0.999, + "step": 2168 + }, + { + "epoch": 0.2281146883668345, + "grad_norm": 2.8107489657605944, + "learning_rate": 4.426687466712822e-06, + "loss": 1.02, + "step": 2169 + }, + { + "epoch": 0.2282198588086818, + "grad_norm": 2.7507863352021547, + "learning_rate": 4.426155388602771e-06, + "loss": 0.9941, + "step": 2170 + }, + { + "epoch": 0.22832502925052914, + "grad_norm": 2.0778977549643103, + "learning_rate": 4.425623095711431e-06, + "loss": 1.0075, + "step": 2171 + }, + { + "epoch": 0.22843019969237646, + "grad_norm": 2.5218199437660487, + "learning_rate": 4.425090588098158e-06, + "loss": 1.0061, + "step": 2172 + }, + { + "epoch": 0.2285353701342238, + "grad_norm": 1.8947896556435786, + "learning_rate": 4.424557865822329e-06, + "loss": 1.0319, + "step": 2173 + }, + { + "epoch": 0.22864054057607108, + "grad_norm": 2.8580135935444715, + "learning_rate": 4.424024928943347e-06, + "loss": 1.0138, + "step": 2174 + }, + { + "epoch": 0.2287457110179184, + "grad_norm": 1.9185065863265085, + "learning_rate": 4.42349177752064e-06, + "loss": 1.0251, + "step": 2175 + }, + { + "epoch": 0.22885088145976573, + "grad_norm": 2.2887816350371484, + "learning_rate": 4.4229584116136574e-06, + "loss": 1.0304, + "step": 2176 + }, + { + "epoch": 0.22895605190161306, + "grad_norm": 2.114307828197282, + "learning_rate": 4.422424831281873e-06, + "loss": 1.0362, + "step": 2177 + }, + { + "epoch": 0.22906122234346038, + "grad_norm": 3.6302171769750364, + "learning_rate": 4.421891036584787e-06, + "loss": 1.0028, + "step": 2178 + }, + { + "epoch": 0.22916639278530768, + "grad_norm": 3.4568600537373784, + "learning_rate": 4.421357027581921e-06, + "loss": 0.9796, + "step": 2179 + }, + { + "epoch": 0.229271563227155, + 
"grad_norm": 3.3494243915757016, + "learning_rate": 4.42082280433282e-06, + "loss": 1.0287, + "step": 2180 + }, + { + "epoch": 0.22937673366900233, + "grad_norm": 3.815143472620564, + "learning_rate": 4.420288366897055e-06, + "loss": 1.0211, + "step": 2181 + }, + { + "epoch": 0.22948190411084965, + "grad_norm": 2.134153845893804, + "learning_rate": 4.419753715334219e-06, + "loss": 1.032, + "step": 2182 + }, + { + "epoch": 0.22958707455269697, + "grad_norm": 2.6909666767089333, + "learning_rate": 4.419218849703933e-06, + "loss": 0.9872, + "step": 2183 + }, + { + "epoch": 0.22969224499454427, + "grad_norm": 2.3377268022126994, + "learning_rate": 4.418683770065834e-06, + "loss": 1.0023, + "step": 2184 + }, + { + "epoch": 0.2297974154363916, + "grad_norm": 3.1485034054066783, + "learning_rate": 4.41814847647959e-06, + "loss": 1.0084, + "step": 2185 + }, + { + "epoch": 0.22990258587823892, + "grad_norm": 2.354211954549771, + "learning_rate": 4.4176129690048905e-06, + "loss": 1.006, + "step": 2186 + }, + { + "epoch": 0.23000775632008624, + "grad_norm": 2.463169295196819, + "learning_rate": 4.417077247701449e-06, + "loss": 1.0514, + "step": 2187 + }, + { + "epoch": 0.23011292676193357, + "grad_norm": 2.130669171298515, + "learning_rate": 4.4165413126290015e-06, + "loss": 1.0368, + "step": 2188 + }, + { + "epoch": 0.23021809720378086, + "grad_norm": 2.6129366976545105, + "learning_rate": 4.416005163847309e-06, + "loss": 0.9739, + "step": 2189 + }, + { + "epoch": 0.2303232676456282, + "grad_norm": 2.84939027329353, + "learning_rate": 4.415468801416158e-06, + "loss": 0.993, + "step": 2190 + }, + { + "epoch": 0.2304284380874755, + "grad_norm": 2.426269871426963, + "learning_rate": 4.4149322253953545e-06, + "loss": 1.0335, + "step": 2191 + }, + { + "epoch": 0.23053360852932284, + "grad_norm": 2.379802288203194, + "learning_rate": 4.414395435844732e-06, + "loss": 0.9636, + "step": 2192 + }, + { + "epoch": 0.23063877897117016, + "grad_norm": 2.400251646859452, + "learning_rate": 4.413858432824147e-06, + "loss": 1.0368, + "step": 2193 + }, + { + "epoch": 0.23074394941301749, + "grad_norm": 2.8763286217663677, + "learning_rate": 4.41332121639348e-06, + "loss": 1.0031, + "step": 2194 + }, + { + "epoch": 0.23084911985486478, + "grad_norm": 3.2398926109360526, + "learning_rate": 4.412783786612634e-06, + "loss": 1.0469, + "step": 2195 + }, + { + "epoch": 0.2309542902967121, + "grad_norm": 2.465573849120777, + "learning_rate": 4.412246143541536e-06, + "loss": 1.015, + "step": 2196 + }, + { + "epoch": 0.23105946073855943, + "grad_norm": 2.7811425882506873, + "learning_rate": 4.4117082872401395e-06, + "loss": 0.9901, + "step": 2197 + }, + { + "epoch": 0.23116463118040675, + "grad_norm": 2.990852585631329, + "learning_rate": 4.411170217768417e-06, + "loss": 1.0293, + "step": 2198 + }, + { + "epoch": 0.23126980162225408, + "grad_norm": 3.261551379361568, + "learning_rate": 4.410631935186369e-06, + "loss": 1.0079, + "step": 2199 + }, + { + "epoch": 0.23137497206410138, + "grad_norm": 2.7992235343655527, + "learning_rate": 4.410093439554019e-06, + "loss": 1.0397, + "step": 2200 + }, + { + "epoch": 0.2314801425059487, + "grad_norm": 2.4192682778964336, + "learning_rate": 4.409554730931412e-06, + "loss": 1.0289, + "step": 2201 + }, + { + "epoch": 0.23158531294779602, + "grad_norm": 2.5957977370487235, + "learning_rate": 4.4090158093786175e-06, + "loss": 1.0366, + "step": 2202 + }, + { + "epoch": 0.23169048338964335, + "grad_norm": 2.8959476464448923, + "learning_rate": 4.408476674955732e-06, + "loss": 1.0785, + 
"step": 2203 + }, + { + "epoch": 0.23179565383149067, + "grad_norm": 2.644482199192657, + "learning_rate": 4.407937327722871e-06, + "loss": 1.0254, + "step": 2204 + }, + { + "epoch": 0.23190082427333797, + "grad_norm": 3.442594882430092, + "learning_rate": 4.407397767740176e-06, + "loss": 1.0397, + "step": 2205 + }, + { + "epoch": 0.2320059947151853, + "grad_norm": 2.5341412795753566, + "learning_rate": 4.406857995067813e-06, + "loss": 1.006, + "step": 2206 + }, + { + "epoch": 0.23211116515703262, + "grad_norm": 2.2043374296304994, + "learning_rate": 4.406318009765971e-06, + "loss": 1.0111, + "step": 2207 + }, + { + "epoch": 0.23221633559887994, + "grad_norm": 3.0723931658417487, + "learning_rate": 4.405777811894861e-06, + "loss": 1.043, + "step": 2208 + }, + { + "epoch": 0.23232150604072727, + "grad_norm": 2.364705705885201, + "learning_rate": 4.405237401514722e-06, + "loss": 1.0548, + "step": 2209 + }, + { + "epoch": 0.23242667648257456, + "grad_norm": 3.1983834328748246, + "learning_rate": 4.404696778685811e-06, + "loss": 1.0139, + "step": 2210 + }, + { + "epoch": 0.2325318469244219, + "grad_norm": 2.746332139516773, + "learning_rate": 4.4041559434684135e-06, + "loss": 1.02, + "step": 2211 + }, + { + "epoch": 0.2326370173662692, + "grad_norm": 2.274325848928587, + "learning_rate": 4.4036148959228365e-06, + "loss": 1.0089, + "step": 2212 + }, + { + "epoch": 0.23274218780811654, + "grad_norm": 3.0278107573054336, + "learning_rate": 4.40307363610941e-06, + "loss": 1.0081, + "step": 2213 + }, + { + "epoch": 0.23284735824996386, + "grad_norm": 2.165972003847509, + "learning_rate": 4.40253216408849e-06, + "loss": 1.0138, + "step": 2214 + }, + { + "epoch": 0.23295252869181116, + "grad_norm": 2.8880774293438334, + "learning_rate": 4.401990479920455e-06, + "loss": 0.9974, + "step": 2215 + }, + { + "epoch": 0.23305769913365848, + "grad_norm": 2.105729606603633, + "learning_rate": 4.401448583665706e-06, + "loss": 1.0269, + "step": 2216 + }, + { + "epoch": 0.2331628695755058, + "grad_norm": 2.5957248301304108, + "learning_rate": 4.4009064753846685e-06, + "loss": 1.0, + "step": 2217 + }, + { + "epoch": 0.23326804001735313, + "grad_norm": 3.0915416585982336, + "learning_rate": 4.400364155137793e-06, + "loss": 1.0225, + "step": 2218 + }, + { + "epoch": 0.23337321045920045, + "grad_norm": 2.174158247873258, + "learning_rate": 4.3998216229855504e-06, + "loss": 0.9767, + "step": 2219 + }, + { + "epoch": 0.23347838090104775, + "grad_norm": 1.6915485879041205, + "learning_rate": 4.3992788789884385e-06, + "loss": 0.9691, + "step": 2220 + }, + { + "epoch": 0.23358355134289507, + "grad_norm": 3.1989567531239733, + "learning_rate": 4.398735923206978e-06, + "loss": 1.0219, + "step": 2221 + }, + { + "epoch": 0.2336887217847424, + "grad_norm": 2.4905020268160234, + "learning_rate": 4.398192755701713e-06, + "loss": 1.0105, + "step": 2222 + }, + { + "epoch": 0.23379389222658972, + "grad_norm": 3.061385255458901, + "learning_rate": 4.397649376533209e-06, + "loss": 1.0154, + "step": 2223 + }, + { + "epoch": 0.23389906266843705, + "grad_norm": 2.1173291128116647, + "learning_rate": 4.397105785762059e-06, + "loss": 1.0071, + "step": 2224 + }, + { + "epoch": 0.23400423311028434, + "grad_norm": 2.4753927815387264, + "learning_rate": 4.396561983448876e-06, + "loss": 1.0191, + "step": 2225 + }, + { + "epoch": 0.23410940355213167, + "grad_norm": 2.6295178770980465, + "learning_rate": 4.396017969654299e-06, + "loss": 1.0107, + "step": 2226 + }, + { + "epoch": 0.234214573993979, + "grad_norm": 2.555842843500082, + 
"learning_rate": 4.395473744438989e-06, + "loss": 1.0316, + "step": 2227 + }, + { + "epoch": 0.23431974443582632, + "grad_norm": 2.698211855081485, + "learning_rate": 4.394929307863633e-06, + "loss": 1.0002, + "step": 2228 + }, + { + "epoch": 0.23442491487767364, + "grad_norm": 3.0360295797113963, + "learning_rate": 4.394384659988938e-06, + "loss": 0.9551, + "step": 2229 + }, + { + "epoch": 0.23453008531952094, + "grad_norm": 3.1523775915462826, + "learning_rate": 4.393839800875638e-06, + "loss": 0.9874, + "step": 2230 + }, + { + "epoch": 0.23463525576136826, + "grad_norm": 2.900257288408909, + "learning_rate": 4.3932947305844875e-06, + "loss": 0.9747, + "step": 2231 + }, + { + "epoch": 0.23474042620321559, + "grad_norm": 3.0099034185771005, + "learning_rate": 4.392749449176268e-06, + "loss": 1.0314, + "step": 2232 + }, + { + "epoch": 0.2348455966450629, + "grad_norm": 3.82061371167963, + "learning_rate": 4.39220395671178e-06, + "loss": 1.0465, + "step": 2233 + }, + { + "epoch": 0.23495076708691023, + "grad_norm": 2.833323567535811, + "learning_rate": 4.391658253251853e-06, + "loss": 1.0376, + "step": 2234 + }, + { + "epoch": 0.23505593752875753, + "grad_norm": 4.222429513815148, + "learning_rate": 4.391112338857335e-06, + "loss": 0.9761, + "step": 2235 + }, + { + "epoch": 0.23516110797060485, + "grad_norm": 2.8239821014267354, + "learning_rate": 4.390566213589101e-06, + "loss": 1.0447, + "step": 2236 + }, + { + "epoch": 0.23526627841245218, + "grad_norm": 2.47061316638576, + "learning_rate": 4.390019877508048e-06, + "loss": 1.0365, + "step": 2237 + }, + { + "epoch": 0.2353714488542995, + "grad_norm": 2.504173310706496, + "learning_rate": 4.389473330675096e-06, + "loss": 1.0194, + "step": 2238 + }, + { + "epoch": 0.23547661929614683, + "grad_norm": 3.4045598206190935, + "learning_rate": 4.38892657315119e-06, + "loss": 1.0359, + "step": 2239 + }, + { + "epoch": 0.23558178973799412, + "grad_norm": 3.5442386693147396, + "learning_rate": 4.388379604997297e-06, + "loss": 0.9705, + "step": 2240 + }, + { + "epoch": 0.23568696017984145, + "grad_norm": 3.938459399524716, + "learning_rate": 4.387832426274409e-06, + "loss": 1.0083, + "step": 2241 + }, + { + "epoch": 0.23579213062168877, + "grad_norm": 2.7251348273983993, + "learning_rate": 4.38728503704354e-06, + "loss": 1.0247, + "step": 2242 + }, + { + "epoch": 0.2358973010635361, + "grad_norm": 3.119137566805799, + "learning_rate": 4.386737437365729e-06, + "loss": 0.9493, + "step": 2243 + }, + { + "epoch": 0.23600247150538342, + "grad_norm": 2.526082167109994, + "learning_rate": 4.386189627302037e-06, + "loss": 1.008, + "step": 2244 + }, + { + "epoch": 0.23610764194723072, + "grad_norm": 2.5608821293229105, + "learning_rate": 4.385641606913548e-06, + "loss": 0.9885, + "step": 2245 + }, + { + "epoch": 0.23621281238907804, + "grad_norm": 2.682222019417078, + "learning_rate": 4.385093376261373e-06, + "loss": 1.0404, + "step": 2246 + }, + { + "epoch": 0.23631798283092537, + "grad_norm": 2.258432603781061, + "learning_rate": 4.384544935406641e-06, + "loss": 1.0196, + "step": 2247 + }, + { + "epoch": 0.2364231532727727, + "grad_norm": 2.671642366757651, + "learning_rate": 4.383996284410511e-06, + "loss": 1.024, + "step": 2248 + }, + { + "epoch": 0.23652832371462001, + "grad_norm": 3.1327248445757463, + "learning_rate": 4.383447423334159e-06, + "loss": 1.0182, + "step": 2249 + }, + { + "epoch": 0.2366334941564673, + "grad_norm": 2.723883446970864, + "learning_rate": 4.382898352238788e-06, + "loss": 0.9918, + "step": 2250 + }, + { + "epoch": 
0.23673866459831464, + "grad_norm": 3.0308816991498118, + "learning_rate": 4.382349071185624e-06, + "loss": 1.0059, + "step": 2251 + }, + { + "epoch": 0.23684383504016196, + "grad_norm": 2.223810923909976, + "learning_rate": 4.3817995802359155e-06, + "loss": 1.0591, + "step": 2252 + }, + { + "epoch": 0.23694900548200928, + "grad_norm": 2.198306040920323, + "learning_rate": 4.3812498794509365e-06, + "loss": 1.0037, + "step": 2253 + }, + { + "epoch": 0.2370541759238566, + "grad_norm": 2.823104481198863, + "learning_rate": 4.38069996889198e-06, + "loss": 1.0164, + "step": 2254 + }, + { + "epoch": 0.23715934636570393, + "grad_norm": 2.3927642257493975, + "learning_rate": 4.380149848620369e-06, + "loss": 1.0045, + "step": 2255 + }, + { + "epoch": 0.23726451680755123, + "grad_norm": 2.7176141928001334, + "learning_rate": 4.379599518697444e-06, + "loss": 0.9935, + "step": 2256 + }, + { + "epoch": 0.23736968724939855, + "grad_norm": 2.386268088932299, + "learning_rate": 4.379048979184572e-06, + "loss": 1.0106, + "step": 2257 + }, + { + "epoch": 0.23747485769124588, + "grad_norm": 2.327011792366536, + "learning_rate": 4.37849823014314e-06, + "loss": 0.9859, + "step": 2258 + }, + { + "epoch": 0.2375800281330932, + "grad_norm": 2.310094818927127, + "learning_rate": 4.377947271634564e-06, + "loss": 1.0425, + "step": 2259 + }, + { + "epoch": 0.23768519857494053, + "grad_norm": 3.7252277158290026, + "learning_rate": 4.3773961037202784e-06, + "loss": 1.0013, + "step": 2260 + }, + { + "epoch": 0.23779036901678782, + "grad_norm": 2.864574664034268, + "learning_rate": 4.376844726461743e-06, + "loss": 0.9962, + "step": 2261 + }, + { + "epoch": 0.23789553945863515, + "grad_norm": 3.3321853074983405, + "learning_rate": 4.376293139920441e-06, + "loss": 1.0182, + "step": 2262 + }, + { + "epoch": 0.23800070990048247, + "grad_norm": 2.2402296812420173, + "learning_rate": 4.375741344157879e-06, + "loss": 1.0232, + "step": 2263 + }, + { + "epoch": 0.2381058803423298, + "grad_norm": 3.332880747822815, + "learning_rate": 4.375189339235585e-06, + "loss": 0.9578, + "step": 2264 + }, + { + "epoch": 0.23821105078417712, + "grad_norm": 2.4681929360535872, + "learning_rate": 4.374637125215113e-06, + "loss": 0.9791, + "step": 2265 + }, + { + "epoch": 0.23831622122602442, + "grad_norm": 2.404293531281789, + "learning_rate": 4.374084702158039e-06, + "loss": 1.0218, + "step": 2266 + }, + { + "epoch": 0.23842139166787174, + "grad_norm": 2.4396191881750577, + "learning_rate": 4.373532070125962e-06, + "loss": 1.0236, + "step": 2267 + }, + { + "epoch": 0.23852656210971906, + "grad_norm": 2.9682403601811367, + "learning_rate": 4.3729792291805055e-06, + "loss": 1.0397, + "step": 2268 + }, + { + "epoch": 0.2386317325515664, + "grad_norm": 3.1442251603576037, + "learning_rate": 4.372426179383315e-06, + "loss": 0.9759, + "step": 2269 + }, + { + "epoch": 0.2387369029934137, + "grad_norm": 2.7480986026706664, + "learning_rate": 4.371872920796059e-06, + "loss": 1.0032, + "step": 2270 + }, + { + "epoch": 0.238842073435261, + "grad_norm": 3.520061750412933, + "learning_rate": 4.371319453480431e-06, + "loss": 1.0073, + "step": 2271 + }, + { + "epoch": 0.23894724387710833, + "grad_norm": 2.622605154078674, + "learning_rate": 4.370765777498149e-06, + "loss": 1.0505, + "step": 2272 + }, + { + "epoch": 0.23905241431895566, + "grad_norm": 2.995803194412082, + "learning_rate": 4.370211892910949e-06, + "loss": 1.0239, + "step": 2273 + }, + { + "epoch": 0.23915758476080298, + "grad_norm": 3.0130918185780664, + "learning_rate": 4.369657799780595e-06, 
+ "loss": 1.0446, + "step": 2274 + }, + { + "epoch": 0.2392627552026503, + "grad_norm": 2.2351494176209736, + "learning_rate": 4.369103498168872e-06, + "loss": 1.0292, + "step": 2275 + }, + { + "epoch": 0.2393679256444976, + "grad_norm": 2.710072469645011, + "learning_rate": 4.368548988137589e-06, + "loss": 1.045, + "step": 2276 + }, + { + "epoch": 0.23947309608634493, + "grad_norm": 2.3805553686213075, + "learning_rate": 4.367994269748579e-06, + "loss": 0.9925, + "step": 2277 + }, + { + "epoch": 0.23957826652819225, + "grad_norm": 3.089542808347871, + "learning_rate": 4.367439343063696e-06, + "loss": 1.0466, + "step": 2278 + }, + { + "epoch": 0.23968343697003958, + "grad_norm": 1.6810878146896986, + "learning_rate": 4.3668842081448206e-06, + "loss": 1.0232, + "step": 2279 + }, + { + "epoch": 0.2397886074118869, + "grad_norm": 2.8351406317406522, + "learning_rate": 4.366328865053853e-06, + "loss": 1.0121, + "step": 2280 + }, + { + "epoch": 0.2398937778537342, + "grad_norm": 2.2094407063080648, + "learning_rate": 4.365773313852718e-06, + "loss": 1.0166, + "step": 2281 + }, + { + "epoch": 0.23999894829558152, + "grad_norm": 2.816907367682445, + "learning_rate": 4.365217554603365e-06, + "loss": 1.0322, + "step": 2282 + }, + { + "epoch": 0.24010411873742885, + "grad_norm": 3.0356646249148382, + "learning_rate": 4.364661587367765e-06, + "loss": 0.9952, + "step": 2283 + }, + { + "epoch": 0.24020928917927617, + "grad_norm": 3.4033015359175898, + "learning_rate": 4.364105412207914e-06, + "loss": 1.043, + "step": 2284 + }, + { + "epoch": 0.2403144596211235, + "grad_norm": 3.068575576133874, + "learning_rate": 4.3635490291858285e-06, + "loss": 1.0411, + "step": 2285 + }, + { + "epoch": 0.2404196300629708, + "grad_norm": 2.863399553567626, + "learning_rate": 4.362992438363549e-06, + "loss": 0.9437, + "step": 2286 + }, + { + "epoch": 0.24052480050481811, + "grad_norm": 3.0443021645648094, + "learning_rate": 4.362435639803141e-06, + "loss": 0.9712, + "step": 2287 + }, + { + "epoch": 0.24062997094666544, + "grad_norm": 2.100247783006265, + "learning_rate": 4.361878633566691e-06, + "loss": 0.9919, + "step": 2288 + }, + { + "epoch": 0.24073514138851276, + "grad_norm": 2.0331761263216053, + "learning_rate": 4.361321419716309e-06, + "loss": 1.0414, + "step": 2289 + }, + { + "epoch": 0.2408403118303601, + "grad_norm": 2.4507988887485017, + "learning_rate": 4.3607639983141305e-06, + "loss": 1.0253, + "step": 2290 + }, + { + "epoch": 0.24094548227220738, + "grad_norm": 2.3383275471123235, + "learning_rate": 4.360206369422311e-06, + "loss": 0.9672, + "step": 2291 + }, + { + "epoch": 0.2410506527140547, + "grad_norm": 2.985680116206487, + "learning_rate": 4.35964853310303e-06, + "loss": 1.0054, + "step": 2292 + }, + { + "epoch": 0.24115582315590203, + "grad_norm": 3.0674737355369768, + "learning_rate": 4.359090489418492e-06, + "loss": 1.0293, + "step": 2293 + }, + { + "epoch": 0.24126099359774936, + "grad_norm": 3.179508934892898, + "learning_rate": 4.358532238430922e-06, + "loss": 1.0232, + "step": 2294 + }, + { + "epoch": 0.24136616403959668, + "grad_norm": 2.68704103252696, + "learning_rate": 4.35797378020257e-06, + "loss": 1.0297, + "step": 2295 + }, + { + "epoch": 0.24147133448144398, + "grad_norm": 3.002150660941129, + "learning_rate": 4.357415114795709e-06, + "loss": 1.0245, + "step": 2296 + }, + { + "epoch": 0.2415765049232913, + "grad_norm": 2.258390927515721, + "learning_rate": 4.356856242272634e-06, + "loss": 1.0379, + "step": 2297 + }, + { + "epoch": 0.24168167536513863, + "grad_norm": 
2.24898790122129, + "learning_rate": 4.356297162695662e-06, + "loss": 0.9842, + "step": 2298 + }, + { + "epoch": 0.24178684580698595, + "grad_norm": 2.0293860274575515, + "learning_rate": 4.3557378761271365e-06, + "loss": 0.9971, + "step": 2299 + }, + { + "epoch": 0.24189201624883327, + "grad_norm": 2.049737362518493, + "learning_rate": 4.3551783826294225e-06, + "loss": 0.9803, + "step": 2300 + }, + { + "epoch": 0.24199718669068057, + "grad_norm": 2.82131721819364, + "learning_rate": 4.3546186822649074e-06, + "loss": 1.0371, + "step": 2301 + }, + { + "epoch": 0.2421023571325279, + "grad_norm": 2.6674301569769794, + "learning_rate": 4.3540587750960015e-06, + "loss": 1.0335, + "step": 2302 + }, + { + "epoch": 0.24220752757437522, + "grad_norm": 2.5133457003010315, + "learning_rate": 4.353498661185139e-06, + "loss": 1.0216, + "step": 2303 + }, + { + "epoch": 0.24231269801622254, + "grad_norm": 2.4664866984104425, + "learning_rate": 4.352938340594778e-06, + "loss": 0.9618, + "step": 2304 + }, + { + "epoch": 0.24241786845806987, + "grad_norm": 2.5932114027385276, + "learning_rate": 4.352377813387398e-06, + "loss": 0.9812, + "step": 2305 + }, + { + "epoch": 0.24252303889991716, + "grad_norm": 2.9653618102280928, + "learning_rate": 4.351817079625502e-06, + "loss": 1.0269, + "step": 2306 + }, + { + "epoch": 0.2426282093417645, + "grad_norm": 2.724639989437581, + "learning_rate": 4.351256139371616e-06, + "loss": 1.0426, + "step": 2307 + }, + { + "epoch": 0.2427333797836118, + "grad_norm": 2.475922956963675, + "learning_rate": 4.350694992688289e-06, + "loss": 1.0309, + "step": 2308 + }, + { + "epoch": 0.24283855022545914, + "grad_norm": 2.3209110579795906, + "learning_rate": 4.350133639638094e-06, + "loss": 0.9894, + "step": 2309 + }, + { + "epoch": 0.24294372066730646, + "grad_norm": 3.9449479325880183, + "learning_rate": 4.349572080283626e-06, + "loss": 1.0032, + "step": 2310 + }, + { + "epoch": 0.24304889110915376, + "grad_norm": 3.591306291688279, + "learning_rate": 4.349010314687503e-06, + "loss": 1.0259, + "step": 2311 + }, + { + "epoch": 0.24315406155100108, + "grad_norm": 2.805682382653168, + "learning_rate": 4.348448342912365e-06, + "loss": 1.0302, + "step": 2312 + }, + { + "epoch": 0.2432592319928484, + "grad_norm": 2.540123317376257, + "learning_rate": 4.347886165020879e-06, + "loss": 1.0104, + "step": 2313 + }, + { + "epoch": 0.24336440243469573, + "grad_norm": 2.305099301071069, + "learning_rate": 4.347323781075731e-06, + "loss": 0.9888, + "step": 2314 + }, + { + "epoch": 0.24346957287654306, + "grad_norm": 3.599970133750129, + "learning_rate": 4.34676119113963e-06, + "loss": 1.0483, + "step": 2315 + }, + { + "epoch": 0.24357474331839038, + "grad_norm": 3.770809111412246, + "learning_rate": 4.3461983952753105e-06, + "loss": 1.018, + "step": 2316 + }, + { + "epoch": 0.24367991376023768, + "grad_norm": 3.0932950050010013, + "learning_rate": 4.3456353935455285e-06, + "loss": 0.9798, + "step": 2317 + }, + { + "epoch": 0.243785084202085, + "grad_norm": 2.2871843195096364, + "learning_rate": 4.345072186013063e-06, + "loss": 1.0237, + "step": 2318 + }, + { + "epoch": 0.24389025464393232, + "grad_norm": 2.957870000494886, + "learning_rate": 4.344508772740714e-06, + "loss": 1.01, + "step": 2319 + }, + { + "epoch": 0.24399542508577965, + "grad_norm": 1.8700377021220989, + "learning_rate": 4.343945153791309e-06, + "loss": 0.9877, + "step": 2320 + }, + { + "epoch": 0.24410059552762697, + "grad_norm": 1.7017015392411625, + "learning_rate": 4.343381329227695e-06, + "loss": 0.9863, + "step": 2321 + }, 
+ { + "epoch": 0.24420576596947427, + "grad_norm": 2.0639304419621642, + "learning_rate": 4.342817299112744e-06, + "loss": 0.9882, + "step": 2322 + }, + { + "epoch": 0.2443109364113216, + "grad_norm": 3.6180446386514125, + "learning_rate": 4.342253063509348e-06, + "loss": 1.0368, + "step": 2323 + }, + { + "epoch": 0.24441610685316892, + "grad_norm": 2.843315371054092, + "learning_rate": 4.341688622480423e-06, + "loss": 1.0174, + "step": 2324 + }, + { + "epoch": 0.24452127729501624, + "grad_norm": 2.315783436822103, + "learning_rate": 4.341123976088912e-06, + "loss": 1.0636, + "step": 2325 + }, + { + "epoch": 0.24462644773686357, + "grad_norm": 2.584613916276836, + "learning_rate": 4.340559124397774e-06, + "loss": 1.0318, + "step": 2326 + }, + { + "epoch": 0.24473161817871086, + "grad_norm": 3.106997533126451, + "learning_rate": 4.339994067469996e-06, + "loss": 0.9838, + "step": 2327 + }, + { + "epoch": 0.2448367886205582, + "grad_norm": 2.931771272566417, + "learning_rate": 4.339428805368586e-06, + "loss": 1.026, + "step": 2328 + }, + { + "epoch": 0.2449419590624055, + "grad_norm": 3.097355703100136, + "learning_rate": 4.338863338156575e-06, + "loss": 1.0357, + "step": 2329 + }, + { + "epoch": 0.24504712950425284, + "grad_norm": 2.163293960868166, + "learning_rate": 4.338297665897016e-06, + "loss": 1.0129, + "step": 2330 + }, + { + "epoch": 0.24515229994610016, + "grad_norm": 2.6395195900185966, + "learning_rate": 4.3377317886529875e-06, + "loss": 1.0473, + "step": 2331 + }, + { + "epoch": 0.24525747038794746, + "grad_norm": 2.857130604203511, + "learning_rate": 4.337165706487589e-06, + "loss": 1.0317, + "step": 2332 + }, + { + "epoch": 0.24536264082979478, + "grad_norm": 2.49538696157863, + "learning_rate": 4.336599419463943e-06, + "loss": 0.9921, + "step": 2333 + }, + { + "epoch": 0.2454678112716421, + "grad_norm": 3.3688127588253645, + "learning_rate": 4.336032927645194e-06, + "loss": 1.0419, + "step": 2334 + }, + { + "epoch": 0.24557298171348943, + "grad_norm": 2.4873830553852536, + "learning_rate": 4.3354662310945105e-06, + "loss": 1.0384, + "step": 2335 + }, + { + "epoch": 0.24567815215533675, + "grad_norm": 4.367565068639842, + "learning_rate": 4.334899329875084e-06, + "loss": 1.0342, + "step": 2336 + }, + { + "epoch": 0.24578332259718405, + "grad_norm": 2.3864309309189227, + "learning_rate": 4.3343322240501276e-06, + "loss": 0.9888, + "step": 2337 + }, + { + "epoch": 0.24588849303903137, + "grad_norm": 2.2279995974847298, + "learning_rate": 4.33376491368288e-06, + "loss": 1.0035, + "step": 2338 + }, + { + "epoch": 0.2459936634808787, + "grad_norm": 2.8286281778671287, + "learning_rate": 4.333197398836598e-06, + "loss": 0.9848, + "step": 2339 + }, + { + "epoch": 0.24609883392272602, + "grad_norm": 1.9274410042281873, + "learning_rate": 4.332629679574566e-06, + "loss": 1.0054, + "step": 2340 + }, + { + "epoch": 0.24620400436457335, + "grad_norm": 2.5323024007120893, + "learning_rate": 4.332061755960089e-06, + "loss": 1.0341, + "step": 2341 + }, + { + "epoch": 0.24630917480642064, + "grad_norm": 2.8771741834278948, + "learning_rate": 4.331493628056494e-06, + "loss": 1.0287, + "step": 2342 + }, + { + "epoch": 0.24641434524826797, + "grad_norm": 3.559675628437571, + "learning_rate": 4.33092529592713e-06, + "loss": 1.0325, + "step": 2343 + }, + { + "epoch": 0.2465195156901153, + "grad_norm": 2.4388512232776023, + "learning_rate": 4.330356759635374e-06, + "loss": 1.0221, + "step": 2344 + }, + { + "epoch": 0.24662468613196262, + "grad_norm": 2.682329033109531, + "learning_rate": 
4.329788019244621e-06, + "loss": 0.9862, + "step": 2345 + }, + { + "epoch": 0.24672985657380994, + "grad_norm": 2.5463446910483367, + "learning_rate": 4.329219074818288e-06, + "loss": 1.0284, + "step": 2346 + }, + { + "epoch": 0.24683502701565724, + "grad_norm": 2.0296846553885253, + "learning_rate": 4.32864992641982e-06, + "loss": 1.0119, + "step": 2347 + }, + { + "epoch": 0.24694019745750456, + "grad_norm": 2.048540561932806, + "learning_rate": 4.328080574112678e-06, + "loss": 1.01, + "step": 2348 + }, + { + "epoch": 0.2470453678993519, + "grad_norm": 3.4200047911393745, + "learning_rate": 4.327511017960353e-06, + "loss": 1.0236, + "step": 2349 + }, + { + "epoch": 0.2471505383411992, + "grad_norm": 2.777887461388932, + "learning_rate": 4.326941258026352e-06, + "loss": 1.0413, + "step": 2350 + }, + { + "epoch": 0.24725570878304653, + "grad_norm": 2.6334314381177175, + "learning_rate": 4.326371294374207e-06, + "loss": 1.0092, + "step": 2351 + }, + { + "epoch": 0.24736087922489383, + "grad_norm": 2.880908366865062, + "learning_rate": 4.325801127067477e-06, + "loss": 1.0373, + "step": 2352 + }, + { + "epoch": 0.24746604966674116, + "grad_norm": 3.588695174592861, + "learning_rate": 4.325230756169737e-06, + "loss": 1.0001, + "step": 2353 + }, + { + "epoch": 0.24757122010858848, + "grad_norm": 3.0543565117198535, + "learning_rate": 4.324660181744589e-06, + "loss": 0.9983, + "step": 2354 + }, + { + "epoch": 0.2476763905504358, + "grad_norm": 2.650395143277798, + "learning_rate": 4.324089403855657e-06, + "loss": 1.0016, + "step": 2355 + }, + { + "epoch": 0.24778156099228313, + "grad_norm": 2.339046826958962, + "learning_rate": 4.323518422566586e-06, + "loss": 1.021, + "step": 2356 + }, + { + "epoch": 0.24788673143413043, + "grad_norm": 2.6212259533171647, + "learning_rate": 4.322947237941045e-06, + "loss": 0.9759, + "step": 2357 + }, + { + "epoch": 0.24799190187597775, + "grad_norm": 3.383293390834341, + "learning_rate": 4.322375850042726e-06, + "loss": 0.9904, + "step": 2358 + }, + { + "epoch": 0.24809707231782507, + "grad_norm": 2.670237539236923, + "learning_rate": 4.321804258935343e-06, + "loss": 1.0199, + "step": 2359 + }, + { + "epoch": 0.2482022427596724, + "grad_norm": 3.250184677256926, + "learning_rate": 4.3212324646826334e-06, + "loss": 0.9948, + "step": 2360 + }, + { + "epoch": 0.24830741320151972, + "grad_norm": 2.5906224312948916, + "learning_rate": 4.320660467348355e-06, + "loss": 1.0054, + "step": 2361 + }, + { + "epoch": 0.24841258364336702, + "grad_norm": 2.3329999607620695, + "learning_rate": 4.320088266996292e-06, + "loss": 1.0111, + "step": 2362 + }, + { + "epoch": 0.24851775408521434, + "grad_norm": 3.94765814630664, + "learning_rate": 4.3195158636902475e-06, + "loss": 1.0006, + "step": 2363 + }, + { + "epoch": 0.24862292452706167, + "grad_norm": 2.247678998715992, + "learning_rate": 4.3189432574940506e-06, + "loss": 0.9504, + "step": 2364 + }, + { + "epoch": 0.248728094968909, + "grad_norm": 2.638346815741289, + "learning_rate": 4.3183704484715496e-06, + "loss": 1.016, + "step": 2365 + }, + { + "epoch": 0.24883326541075632, + "grad_norm": 2.9361318540962094, + "learning_rate": 4.3177974366866185e-06, + "loss": 1.0447, + "step": 2366 + }, + { + "epoch": 0.2489384358526036, + "grad_norm": 3.1042757839794515, + "learning_rate": 4.317224222203152e-06, + "loss": 1.0061, + "step": 2367 + }, + { + "epoch": 0.24904360629445094, + "grad_norm": 2.2154767610485426, + "learning_rate": 4.316650805085068e-06, + "loss": 1.005, + "step": 2368 + }, + { + "epoch": 0.24914877673629826, + 
"grad_norm": 3.623072584794595, + "learning_rate": 4.316077185396307e-06, + "loss": 1.0467, + "step": 2369 + }, + { + "epoch": 0.24925394717814559, + "grad_norm": 2.33674243869589, + "learning_rate": 4.3155033632008316e-06, + "loss": 1.0021, + "step": 2370 + }, + { + "epoch": 0.2493591176199929, + "grad_norm": 2.8007703287989907, + "learning_rate": 4.314929338562629e-06, + "loss": 1.0135, + "step": 2371 + }, + { + "epoch": 0.24946428806184023, + "grad_norm": 3.2196932437208923, + "learning_rate": 4.314355111545705e-06, + "loss": 0.9289, + "step": 2372 + }, + { + "epoch": 0.24956945850368753, + "grad_norm": 3.334365342086359, + "learning_rate": 4.313780682214093e-06, + "loss": 1.0507, + "step": 2373 + }, + { + "epoch": 0.24967462894553485, + "grad_norm": 2.4745573038357915, + "learning_rate": 4.313206050631844e-06, + "loss": 0.9509, + "step": 2374 + }, + { + "epoch": 0.24977979938738218, + "grad_norm": 2.0261724785390123, + "learning_rate": 4.312631216863035e-06, + "loss": 1.0073, + "step": 2375 + }, + { + "epoch": 0.2498849698292295, + "grad_norm": 3.786354346442156, + "learning_rate": 4.312056180971764e-06, + "loss": 1.0409, + "step": 2376 + }, + { + "epoch": 0.24999014027107683, + "grad_norm": 1.8293455049956973, + "learning_rate": 4.3114809430221534e-06, + "loss": 0.983, + "step": 2377 + }, + { + "epoch": 0.2500953107129241, + "grad_norm": 2.7551244030560023, + "learning_rate": 4.3109055030783445e-06, + "loss": 1.0099, + "step": 2378 + }, + { + "epoch": 0.25020048115477145, + "grad_norm": 3.713006572607176, + "learning_rate": 4.310329861204505e-06, + "loss": 0.9658, + "step": 2379 + }, + { + "epoch": 0.2503056515966188, + "grad_norm": 3.841601007385603, + "learning_rate": 4.309754017464823e-06, + "loss": 1.0511, + "step": 2380 + }, + { + "epoch": 0.2504108220384661, + "grad_norm": 1.7189850598284027, + "learning_rate": 4.309177971923509e-06, + "loss": 1.0225, + "step": 2381 + }, + { + "epoch": 0.2505159924803134, + "grad_norm": 2.3106913101555326, + "learning_rate": 4.308601724644797e-06, + "loss": 1.0436, + "step": 2382 + }, + { + "epoch": 0.25062116292216075, + "grad_norm": 1.685769997718884, + "learning_rate": 4.308025275692943e-06, + "loss": 0.9828, + "step": 2383 + }, + { + "epoch": 0.25072633336400807, + "grad_norm": 2.605151083100969, + "learning_rate": 4.307448625132225e-06, + "loss": 0.9986, + "step": 2384 + }, + { + "epoch": 0.25083150380585534, + "grad_norm": 2.070105328779204, + "learning_rate": 4.306871773026945e-06, + "loss": 0.9713, + "step": 2385 + }, + { + "epoch": 0.25093667424770266, + "grad_norm": 2.1511866343693007, + "learning_rate": 4.306294719441426e-06, + "loss": 1.0466, + "step": 2386 + }, + { + "epoch": 0.25104184468955, + "grad_norm": 2.7021103206947936, + "learning_rate": 4.305717464440015e-06, + "loss": 1.042, + "step": 2387 + }, + { + "epoch": 0.2511470151313973, + "grad_norm": 2.264135547093338, + "learning_rate": 4.305140008087078e-06, + "loss": 1.0355, + "step": 2388 + }, + { + "epoch": 0.25125218557324464, + "grad_norm": 2.6174496430229994, + "learning_rate": 4.304562350447008e-06, + "loss": 1.0403, + "step": 2389 + }, + { + "epoch": 0.25135735601509196, + "grad_norm": 2.5030793337590103, + "learning_rate": 4.303984491584217e-06, + "loss": 0.9831, + "step": 2390 + }, + { + "epoch": 0.2514625264569393, + "grad_norm": 3.246883328280729, + "learning_rate": 4.303406431563142e-06, + "loss": 1.0792, + "step": 2391 + }, + { + "epoch": 0.2515676968987866, + "grad_norm": 2.7687583296326004, + "learning_rate": 4.30282817044824e-06, + "loss": 1.0085, + "step": 
2392 + }, + { + "epoch": 0.25167286734063393, + "grad_norm": 2.3272942288985554, + "learning_rate": 4.302249708303993e-06, + "loss": 1.0151, + "step": 2393 + }, + { + "epoch": 0.25177803778248126, + "grad_norm": 2.266231486995089, + "learning_rate": 4.301671045194902e-06, + "loss": 1.0047, + "step": 2394 + }, + { + "epoch": 0.2518832082243285, + "grad_norm": 2.754988205132692, + "learning_rate": 4.301092181185495e-06, + "loss": 1.0253, + "step": 2395 + }, + { + "epoch": 0.25198837866617585, + "grad_norm": 2.6929620972255255, + "learning_rate": 4.300513116340317e-06, + "loss": 1.0054, + "step": 2396 + }, + { + "epoch": 0.2520935491080232, + "grad_norm": 2.336085325218602, + "learning_rate": 4.2999338507239395e-06, + "loss": 0.9848, + "step": 2397 + }, + { + "epoch": 0.2521987195498705, + "grad_norm": 2.461142882427421, + "learning_rate": 4.299354384400957e-06, + "loss": 1.0059, + "step": 2398 + }, + { + "epoch": 0.2523038899917178, + "grad_norm": 2.532391792296108, + "learning_rate": 4.2987747174359805e-06, + "loss": 0.9754, + "step": 2399 + }, + { + "epoch": 0.25240906043356515, + "grad_norm": 2.621685341882318, + "learning_rate": 4.298194849893651e-06, + "loss": 1.0266, + "step": 2400 + }, + { + "epoch": 0.25251423087541247, + "grad_norm": 2.3181343970978356, + "learning_rate": 4.297614781838627e-06, + "loss": 1.0272, + "step": 2401 + }, + { + "epoch": 0.2526194013172598, + "grad_norm": 2.05334980419308, + "learning_rate": 4.297034513335591e-06, + "loss": 0.9885, + "step": 2402 + }, + { + "epoch": 0.2527245717591071, + "grad_norm": 2.364971791424021, + "learning_rate": 4.296454044449245e-06, + "loss": 1.009, + "step": 2403 + }, + { + "epoch": 0.25282974220095444, + "grad_norm": 2.0265896786829805, + "learning_rate": 4.295873375244319e-06, + "loss": 1.0018, + "step": 2404 + }, + { + "epoch": 0.25293491264280177, + "grad_norm": 4.311217157785901, + "learning_rate": 4.295292505785561e-06, + "loss": 1.0421, + "step": 2405 + }, + { + "epoch": 0.25304008308464904, + "grad_norm": 2.629094308586008, + "learning_rate": 4.294711436137742e-06, + "loss": 1.0348, + "step": 2406 + }, + { + "epoch": 0.25314525352649636, + "grad_norm": 2.680285943391119, + "learning_rate": 4.294130166365656e-06, + "loss": 1.057, + "step": 2407 + }, + { + "epoch": 0.2532504239683437, + "grad_norm": 2.5244192892893267, + "learning_rate": 4.293548696534119e-06, + "loss": 1.0008, + "step": 2408 + }, + { + "epoch": 0.253355594410191, + "grad_norm": 2.1567089358080076, + "learning_rate": 4.2929670267079695e-06, + "loss": 1.0427, + "step": 2409 + }, + { + "epoch": 0.25346076485203833, + "grad_norm": 2.4191444936394277, + "learning_rate": 4.292385156952069e-06, + "loss": 1.0596, + "step": 2410 + }, + { + "epoch": 0.25356593529388566, + "grad_norm": 2.260012880541108, + "learning_rate": 4.291803087331299e-06, + "loss": 1.0076, + "step": 2411 + }, + { + "epoch": 0.253671105735733, + "grad_norm": 1.9375892516696325, + "learning_rate": 4.291220817910566e-06, + "loss": 1.036, + "step": 2412 + }, + { + "epoch": 0.2537762761775803, + "grad_norm": 2.425680002791181, + "learning_rate": 4.290638348754797e-06, + "loss": 1.0195, + "step": 2413 + }, + { + "epoch": 0.25388144661942763, + "grad_norm": 2.1474551573197074, + "learning_rate": 4.2900556799289415e-06, + "loss": 0.9866, + "step": 2414 + }, + { + "epoch": 0.25398661706127496, + "grad_norm": 2.3436746364584877, + "learning_rate": 4.289472811497972e-06, + "loss": 1.0447, + "step": 2415 + }, + { + "epoch": 0.2540917875031222, + "grad_norm": 2.114334641282732, + "learning_rate": 
4.288889743526884e-06, + "loss": 1.0044, + "step": 2416 + }, + { + "epoch": 0.25419695794496955, + "grad_norm": 2.5018267220432846, + "learning_rate": 4.288306476080691e-06, + "loss": 1.061, + "step": 2417 + }, + { + "epoch": 0.2543021283868169, + "grad_norm": 3.079700459263164, + "learning_rate": 4.287723009224436e-06, + "loss": 1.0394, + "step": 2418 + }, + { + "epoch": 0.2544072988286642, + "grad_norm": 2.6029768657221672, + "learning_rate": 4.2871393430231775e-06, + "loss": 1.0305, + "step": 2419 + }, + { + "epoch": 0.2545124692705115, + "grad_norm": 3.4032647927169966, + "learning_rate": 4.2865554775419985e-06, + "loss": 1.0707, + "step": 2420 + }, + { + "epoch": 0.25461763971235885, + "grad_norm": 2.9725437594396964, + "learning_rate": 4.2859714128460065e-06, + "loss": 0.9889, + "step": 2421 + }, + { + "epoch": 0.25472281015420617, + "grad_norm": 1.908112429787501, + "learning_rate": 4.2853871490003265e-06, + "loss": 0.9918, + "step": 2422 + }, + { + "epoch": 0.2548279805960535, + "grad_norm": 2.4844690233952913, + "learning_rate": 4.2848026860701104e-06, + "loss": 1.0429, + "step": 2423 + }, + { + "epoch": 0.2549331510379008, + "grad_norm": 2.7736489003252034, + "learning_rate": 4.284218024120531e-06, + "loss": 1.0414, + "step": 2424 + }, + { + "epoch": 0.25503832147974814, + "grad_norm": 2.532322414340705, + "learning_rate": 4.28363316321678e-06, + "loss": 1.001, + "step": 2425 + }, + { + "epoch": 0.2551434919215954, + "grad_norm": 2.970052877513903, + "learning_rate": 4.283048103424077e-06, + "loss": 1.0803, + "step": 2426 + }, + { + "epoch": 0.25524866236344274, + "grad_norm": 3.4153519083462185, + "learning_rate": 4.282462844807659e-06, + "loss": 1.01, + "step": 2427 + }, + { + "epoch": 0.25535383280529006, + "grad_norm": 3.1053802029271513, + "learning_rate": 4.281877387432787e-06, + "loss": 1.051, + "step": 2428 + }, + { + "epoch": 0.2554590032471374, + "grad_norm": 2.6052490168349975, + "learning_rate": 4.281291731364744e-06, + "loss": 0.9926, + "step": 2429 + }, + { + "epoch": 0.2555641736889847, + "grad_norm": 2.1996478485684903, + "learning_rate": 4.280705876668836e-06, + "loss": 1.0286, + "step": 2430 + }, + { + "epoch": 0.25566934413083203, + "grad_norm": 2.4448585973957906, + "learning_rate": 4.2801198234103895e-06, + "loss": 1.0489, + "step": 2431 + }, + { + "epoch": 0.25577451457267936, + "grad_norm": 2.605626730548949, + "learning_rate": 4.279533571654754e-06, + "loss": 1.0234, + "step": 2432 + }, + { + "epoch": 0.2558796850145267, + "grad_norm": 2.2571136636808773, + "learning_rate": 4.2789471214673e-06, + "loss": 1.049, + "step": 2433 + }, + { + "epoch": 0.255984855456374, + "grad_norm": 2.9495392792916615, + "learning_rate": 4.278360472913424e-06, + "loss": 0.9787, + "step": 2434 + }, + { + "epoch": 0.25609002589822133, + "grad_norm": 2.0896220524241884, + "learning_rate": 4.277773626058542e-06, + "loss": 1.0116, + "step": 2435 + }, + { + "epoch": 0.2561951963400686, + "grad_norm": 3.163234673182296, + "learning_rate": 4.277186580968088e-06, + "loss": 1.0366, + "step": 2436 + }, + { + "epoch": 0.2563003667819159, + "grad_norm": 2.462962363222071, + "learning_rate": 4.276599337707525e-06, + "loss": 1.0296, + "step": 2437 + }, + { + "epoch": 0.25640553722376325, + "grad_norm": 2.1865954009193524, + "learning_rate": 4.276011896342336e-06, + "loss": 1.0365, + "step": 2438 + }, + { + "epoch": 0.25651070766561057, + "grad_norm": 2.3965938792682144, + "learning_rate": 4.2754242569380226e-06, + "loss": 1.0514, + "step": 2439 + }, + { + "epoch": 0.2566158781074579, + 
"grad_norm": 2.428078705061389, + "learning_rate": 4.2748364195601135e-06, + "loss": 1.0428, + "step": 2440 + }, + { + "epoch": 0.2567210485493052, + "grad_norm": 2.7590755653461203, + "learning_rate": 4.274248384274156e-06, + "loss": 0.9769, + "step": 2441 + }, + { + "epoch": 0.25682621899115254, + "grad_norm": 1.9677366099461735, + "learning_rate": 4.27366015114572e-06, + "loss": 0.985, + "step": 2442 + }, + { + "epoch": 0.25693138943299987, + "grad_norm": 2.209987282212459, + "learning_rate": 4.2730717202404e-06, + "loss": 1.0309, + "step": 2443 + }, + { + "epoch": 0.2570365598748472, + "grad_norm": 1.9968714101047118, + "learning_rate": 4.272483091623809e-06, + "loss": 1.0394, + "step": 2444 + }, + { + "epoch": 0.2571417303166945, + "grad_norm": 2.579965658298572, + "learning_rate": 4.271894265361584e-06, + "loss": 0.9979, + "step": 2445 + }, + { + "epoch": 0.2572469007585418, + "grad_norm": 1.8660957651544707, + "learning_rate": 4.2713052415193844e-06, + "loss": 1.0358, + "step": 2446 + }, + { + "epoch": 0.2573520712003891, + "grad_norm": 2.4422132950894757, + "learning_rate": 4.27071602016289e-06, + "loss": 1.0568, + "step": 2447 + }, + { + "epoch": 0.25745724164223643, + "grad_norm": 3.129384441162122, + "learning_rate": 4.270126601357804e-06, + "loss": 1.085, + "step": 2448 + }, + { + "epoch": 0.25756241208408376, + "grad_norm": 2.813236158519783, + "learning_rate": 4.269536985169851e-06, + "loss": 1.0045, + "step": 2449 + }, + { + "epoch": 0.2576675825259311, + "grad_norm": 2.4100320645639863, + "learning_rate": 4.268947171664778e-06, + "loss": 1.0047, + "step": 2450 + }, + { + "epoch": 0.2577727529677784, + "grad_norm": 2.939524262190411, + "learning_rate": 4.268357160908354e-06, + "loss": 1.0213, + "step": 2451 + }, + { + "epoch": 0.25787792340962573, + "grad_norm": 2.450057161789921, + "learning_rate": 4.267766952966369e-06, + "loss": 1.0147, + "step": 2452 + }, + { + "epoch": 0.25798309385147306, + "grad_norm": 3.105852562406945, + "learning_rate": 4.267176547904636e-06, + "loss": 0.9983, + "step": 2453 + }, + { + "epoch": 0.2580882642933204, + "grad_norm": 3.0495170539629894, + "learning_rate": 4.266585945788991e-06, + "loss": 0.9887, + "step": 2454 + }, + { + "epoch": 0.2581934347351677, + "grad_norm": 1.9555953731591447, + "learning_rate": 4.26599514668529e-06, + "loss": 1.0166, + "step": 2455 + }, + { + "epoch": 0.258298605177015, + "grad_norm": 2.345172111269998, + "learning_rate": 4.26540415065941e-06, + "loss": 1.0157, + "step": 2456 + }, + { + "epoch": 0.2584037756188623, + "grad_norm": 2.648580133103009, + "learning_rate": 4.264812957777253e-06, + "loss": 1.059, + "step": 2457 + }, + { + "epoch": 0.2585089460607096, + "grad_norm": 2.8657246045783755, + "learning_rate": 4.2642215681047425e-06, + "loss": 1.0181, + "step": 2458 + }, + { + "epoch": 0.25861411650255695, + "grad_norm": 2.593705422659073, + "learning_rate": 4.263629981707823e-06, + "loss": 1.0275, + "step": 2459 + }, + { + "epoch": 0.25871928694440427, + "grad_norm": 2.733876182487878, + "learning_rate": 4.263038198652459e-06, + "loss": 0.9784, + "step": 2460 + }, + { + "epoch": 0.2588244573862516, + "grad_norm": 3.183024692281709, + "learning_rate": 4.26244621900464e-06, + "loss": 1.0185, + "step": 2461 + }, + { + "epoch": 0.2589296278280989, + "grad_norm": 2.7344593577023715, + "learning_rate": 4.261854042830377e-06, + "loss": 1.0289, + "step": 2462 + }, + { + "epoch": 0.25903479826994624, + "grad_norm": 1.9703850254076136, + "learning_rate": 4.261261670195701e-06, + "loss": 0.9684, + "step": 2463 + }, + { 
+ "epoch": 0.25913996871179357, + "grad_norm": 2.910924468557452, + "learning_rate": 4.2606691011666675e-06, + "loss": 1.0207, + "step": 2464 + }, + { + "epoch": 0.2592451391536409, + "grad_norm": 3.5563615470552548, + "learning_rate": 4.260076335809351e-06, + "loss": 1.0237, + "step": 2465 + }, + { + "epoch": 0.2593503095954882, + "grad_norm": 3.665977688640787, + "learning_rate": 4.259483374189851e-06, + "loss": 0.9983, + "step": 2466 + }, + { + "epoch": 0.2594554800373355, + "grad_norm": 3.749669082591241, + "learning_rate": 4.258890216374286e-06, + "loss": 1.0253, + "step": 2467 + }, + { + "epoch": 0.2595606504791828, + "grad_norm": 2.2413255969070303, + "learning_rate": 4.258296862428799e-06, + "loss": 1.0029, + "step": 2468 + }, + { + "epoch": 0.25966582092103013, + "grad_norm": 2.267636209673735, + "learning_rate": 4.257703312419553e-06, + "loss": 1.0052, + "step": 2469 + }, + { + "epoch": 0.25977099136287746, + "grad_norm": 2.2484357476994177, + "learning_rate": 4.257109566412733e-06, + "loss": 1.045, + "step": 2470 + }, + { + "epoch": 0.2598761618047248, + "grad_norm": 2.933825598950081, + "learning_rate": 4.256515624474547e-06, + "loss": 1.0228, + "step": 2471 + }, + { + "epoch": 0.2599813322465721, + "grad_norm": 2.404569611194018, + "learning_rate": 4.255921486671223e-06, + "loss": 1.0117, + "step": 2472 + }, + { + "epoch": 0.26008650268841943, + "grad_norm": 2.6554997150272595, + "learning_rate": 4.255327153069014e-06, + "loss": 1.0379, + "step": 2473 + }, + { + "epoch": 0.26019167313026675, + "grad_norm": 2.912876480944375, + "learning_rate": 4.25473262373419e-06, + "loss": 1.0201, + "step": 2474 + }, + { + "epoch": 0.2602968435721141, + "grad_norm": 2.4808723598184925, + "learning_rate": 4.254137898733048e-06, + "loss": 1.0086, + "step": 2475 + }, + { + "epoch": 0.2604020140139614, + "grad_norm": 3.0077031830519627, + "learning_rate": 4.253542978131904e-06, + "loss": 1.0183, + "step": 2476 + }, + { + "epoch": 0.26050718445580867, + "grad_norm": 2.119327566527547, + "learning_rate": 4.252947861997096e-06, + "loss": 1.0165, + "step": 2477 + }, + { + "epoch": 0.260612354897656, + "grad_norm": 2.9506512336323585, + "learning_rate": 4.2523525503949835e-06, + "loss": 1.0186, + "step": 2478 + }, + { + "epoch": 0.2607175253395033, + "grad_norm": 3.4059461780090414, + "learning_rate": 4.251757043391949e-06, + "loss": 1.0065, + "step": 2479 + }, + { + "epoch": 0.26082269578135064, + "grad_norm": 3.159826449070252, + "learning_rate": 4.251161341054396e-06, + "loss": 1.0176, + "step": 2480 + }, + { + "epoch": 0.26092786622319797, + "grad_norm": 3.0165983438656765, + "learning_rate": 4.25056544344875e-06, + "loss": 1.0177, + "step": 2481 + }, + { + "epoch": 0.2610330366650453, + "grad_norm": 2.909172263879279, + "learning_rate": 4.249969350641459e-06, + "loss": 1.0157, + "step": 2482 + }, + { + "epoch": 0.2611382071068926, + "grad_norm": 2.629076914941897, + "learning_rate": 4.2493730626989905e-06, + "loss": 1.0504, + "step": 2483 + }, + { + "epoch": 0.26124337754873994, + "grad_norm": 2.790385279161908, + "learning_rate": 4.248776579687835e-06, + "loss": 1.0102, + "step": 2484 + }, + { + "epoch": 0.26134854799058727, + "grad_norm": 3.4197241373079224, + "learning_rate": 4.248179901674508e-06, + "loss": 1.0153, + "step": 2485 + }, + { + "epoch": 0.2614537184324346, + "grad_norm": 2.7065753512349597, + "learning_rate": 4.247583028725539e-06, + "loss": 0.9738, + "step": 2486 + }, + { + "epoch": 0.26155888887428186, + "grad_norm": 2.0539337784982474, + "learning_rate": 
4.246985960907488e-06, + "loss": 1.0371, + "step": 2487 + }, + { + "epoch": 0.2616640593161292, + "grad_norm": 3.5039520400652115, + "learning_rate": 4.2463886982869316e-06, + "loss": 1.0311, + "step": 2488 + }, + { + "epoch": 0.2617692297579765, + "grad_norm": 2.260518449174576, + "learning_rate": 4.245791240930469e-06, + "loss": 1.0418, + "step": 2489 + }, + { + "epoch": 0.26187440019982383, + "grad_norm": 2.593178309237441, + "learning_rate": 4.245193588904721e-06, + "loss": 0.9884, + "step": 2490 + }, + { + "epoch": 0.26197957064167116, + "grad_norm": 2.5646604102114807, + "learning_rate": 4.244595742276329e-06, + "loss": 1.0198, + "step": 2491 + }, + { + "epoch": 0.2620847410835185, + "grad_norm": 2.1573764067407284, + "learning_rate": 4.243997701111961e-06, + "loss": 1.0283, + "step": 2492 + }, + { + "epoch": 0.2621899115253658, + "grad_norm": 2.706034557337528, + "learning_rate": 4.243399465478302e-06, + "loss": 0.9915, + "step": 2493 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 2.8335990248067704, + "learning_rate": 4.242801035442059e-06, + "loss": 0.9332, + "step": 2494 + }, + { + "epoch": 0.26240025240906045, + "grad_norm": 2.633396099123692, + "learning_rate": 4.242202411069961e-06, + "loss": 0.9786, + "step": 2495 + }, + { + "epoch": 0.2625054228509078, + "grad_norm": 3.360271499168492, + "learning_rate": 4.241603592428761e-06, + "loss": 1.0477, + "step": 2496 + }, + { + "epoch": 0.26261059329275505, + "grad_norm": 2.148918489099196, + "learning_rate": 4.241004579585231e-06, + "loss": 1.0063, + "step": 2497 + }, + { + "epoch": 0.26271576373460237, + "grad_norm": 3.2367845205717, + "learning_rate": 4.240405372606165e-06, + "loss": 1.0015, + "step": 2498 + }, + { + "epoch": 0.2628209341764497, + "grad_norm": 2.7895205043470988, + "learning_rate": 4.239805971558381e-06, + "loss": 1.0499, + "step": 2499 + }, + { + "epoch": 0.262926104618297, + "grad_norm": 1.9537882626779242, + "learning_rate": 4.239206376508716e-06, + "loss": 1.0381, + "step": 2500 + }, + { + "epoch": 0.26303127506014434, + "grad_norm": 2.784638128216924, + "learning_rate": 4.2386065875240295e-06, + "loss": 1.0069, + "step": 2501 + }, + { + "epoch": 0.26313644550199167, + "grad_norm": 2.9037401874074096, + "learning_rate": 4.238006604671202e-06, + "loss": 0.997, + "step": 2502 + }, + { + "epoch": 0.263241615943839, + "grad_norm": 3.7907319889378095, + "learning_rate": 4.2374064280171375e-06, + "loss": 1.0153, + "step": 2503 + }, + { + "epoch": 0.2633467863856863, + "grad_norm": 2.42146316099506, + "learning_rate": 4.2368060576287586e-06, + "loss": 0.9756, + "step": 2504 + }, + { + "epoch": 0.26345195682753364, + "grad_norm": 3.3487340200239943, + "learning_rate": 4.236205493573013e-06, + "loss": 1.0098, + "step": 2505 + }, + { + "epoch": 0.26355712726938096, + "grad_norm": 2.3850429416001337, + "learning_rate": 4.235604735916867e-06, + "loss": 1.0055, + "step": 2506 + }, + { + "epoch": 0.26366229771122823, + "grad_norm": 2.849867107035556, + "learning_rate": 4.235003784727311e-06, + "loss": 1.0175, + "step": 2507 + }, + { + "epoch": 0.26376746815307556, + "grad_norm": 3.019032768395256, + "learning_rate": 4.234402640071355e-06, + "loss": 1.0389, + "step": 2508 + }, + { + "epoch": 0.2638726385949229, + "grad_norm": 1.6410605133020546, + "learning_rate": 4.23380130201603e-06, + "loss": 1.0201, + "step": 2509 + }, + { + "epoch": 0.2639778090367702, + "grad_norm": 2.8300981371727656, + "learning_rate": 4.233199770628392e-06, + "loss": 1.0188, + "step": 2510 + }, + { + "epoch": 0.26408297947861753, + 
"grad_norm": 2.5403242188674, + "learning_rate": 4.2325980459755155e-06, + "loss": 1.0789, + "step": 2511 + }, + { + "epoch": 0.26418814992046485, + "grad_norm": 2.6388880305238147, + "learning_rate": 4.231996128124498e-06, + "loss": 0.9696, + "step": 2512 + }, + { + "epoch": 0.2642933203623122, + "grad_norm": 3.679088388538602, + "learning_rate": 4.231394017142458e-06, + "loss": 1.0035, + "step": 2513 + }, + { + "epoch": 0.2643984908041595, + "grad_norm": 2.153085146074569, + "learning_rate": 4.230791713096534e-06, + "loss": 0.9416, + "step": 2514 + }, + { + "epoch": 0.2645036612460068, + "grad_norm": 2.9035280609022696, + "learning_rate": 4.2301892160538895e-06, + "loss": 1.0398, + "step": 2515 + }, + { + "epoch": 0.26460883168785415, + "grad_norm": 3.382690191668748, + "learning_rate": 4.229586526081707e-06, + "loss": 1.0457, + "step": 2516 + }, + { + "epoch": 0.2647140021297014, + "grad_norm": 2.7928016735192167, + "learning_rate": 4.22898364324719e-06, + "loss": 0.9843, + "step": 2517 + }, + { + "epoch": 0.26481917257154874, + "grad_norm": 1.3854879435295921, + "learning_rate": 4.228380567617566e-06, + "loss": 1.06, + "step": 2518 + }, + { + "epoch": 0.26492434301339607, + "grad_norm": 2.3894512440051945, + "learning_rate": 4.227777299260083e-06, + "loss": 1.0271, + "step": 2519 + }, + { + "epoch": 0.2650295134552434, + "grad_norm": 2.409763930345137, + "learning_rate": 4.227173838242008e-06, + "loss": 1.0086, + "step": 2520 + }, + { + "epoch": 0.2651346838970907, + "grad_norm": 2.300365291967152, + "learning_rate": 4.2265701846306336e-06, + "loss": 1.0139, + "step": 2521 + }, + { + "epoch": 0.26523985433893804, + "grad_norm": 2.9413706238859936, + "learning_rate": 4.225966338493272e-06, + "loss": 1.0238, + "step": 2522 + }, + { + "epoch": 0.26534502478078537, + "grad_norm": 3.3804230105479385, + "learning_rate": 4.225362299897254e-06, + "loss": 1.0536, + "step": 2523 + }, + { + "epoch": 0.2654501952226327, + "grad_norm": 2.708985307345623, + "learning_rate": 4.2247580689099376e-06, + "loss": 0.9965, + "step": 2524 + }, + { + "epoch": 0.26555536566448, + "grad_norm": 2.8007246157283494, + "learning_rate": 4.224153645598698e-06, + "loss": 1.033, + "step": 2525 + }, + { + "epoch": 0.26566053610632734, + "grad_norm": 2.552503024762742, + "learning_rate": 4.2235490300309324e-06, + "loss": 1.0094, + "step": 2526 + }, + { + "epoch": 0.26576570654817466, + "grad_norm": 2.761858798258985, + "learning_rate": 4.222944222274062e-06, + "loss": 1.0003, + "step": 2527 + }, + { + "epoch": 0.26587087699002193, + "grad_norm": 2.5736511815926844, + "learning_rate": 4.222339222395526e-06, + "loss": 0.9731, + "step": 2528 + }, + { + "epoch": 0.26597604743186926, + "grad_norm": 2.6976780654834744, + "learning_rate": 4.221734030462787e-06, + "loss": 1.0515, + "step": 2529 + }, + { + "epoch": 0.2660812178737166, + "grad_norm": 3.4009547412639187, + "learning_rate": 4.22112864654333e-06, + "loss": 1.02, + "step": 2530 + }, + { + "epoch": 0.2661863883155639, + "grad_norm": 2.060001991686356, + "learning_rate": 4.220523070704657e-06, + "loss": 0.967, + "step": 2531 + }, + { + "epoch": 0.26629155875741123, + "grad_norm": 3.326975711226428, + "learning_rate": 4.219917303014297e-06, + "loss": 1.0475, + "step": 2532 + }, + { + "epoch": 0.26639672919925855, + "grad_norm": 3.6695180566563743, + "learning_rate": 4.219311343539797e-06, + "loss": 1.0338, + "step": 2533 + }, + { + "epoch": 0.2665018996411059, + "grad_norm": 3.402820484725127, + "learning_rate": 4.218705192348725e-06, + "loss": 1.0143, + "step": 2534 + 
}, + { + "epoch": 0.2666070700829532, + "grad_norm": 2.611669788607023, + "learning_rate": 4.2180988495086745e-06, + "loss": 1.0101, + "step": 2535 + }, + { + "epoch": 0.2667122405248005, + "grad_norm": 2.1839819006885364, + "learning_rate": 4.217492315087255e-06, + "loss": 1.0269, + "step": 2536 + }, + { + "epoch": 0.26681741096664785, + "grad_norm": 2.6726735252473715, + "learning_rate": 4.2168855891520995e-06, + "loss": 1.0632, + "step": 2537 + }, + { + "epoch": 0.2669225814084951, + "grad_norm": 2.8199626949758305, + "learning_rate": 4.216278671770865e-06, + "loss": 1.0688, + "step": 2538 + }, + { + "epoch": 0.26702775185034244, + "grad_norm": 3.170146232476931, + "learning_rate": 4.2156715630112245e-06, + "loss": 0.9838, + "step": 2539 + }, + { + "epoch": 0.26713292229218977, + "grad_norm": 3.0560237745595513, + "learning_rate": 4.215064262940878e-06, + "loss": 1.0116, + "step": 2540 + }, + { + "epoch": 0.2672380927340371, + "grad_norm": 2.510504357329309, + "learning_rate": 4.214456771627542e-06, + "loss": 0.9855, + "step": 2541 + }, + { + "epoch": 0.2673432631758844, + "grad_norm": 2.5311289381637163, + "learning_rate": 4.213849089138959e-06, + "loss": 1.0087, + "step": 2542 + }, + { + "epoch": 0.26744843361773174, + "grad_norm": 2.244207627746343, + "learning_rate": 4.213241215542888e-06, + "loss": 0.9875, + "step": 2543 + }, + { + "epoch": 0.26755360405957906, + "grad_norm": 1.9425305151864143, + "learning_rate": 4.2126331509071125e-06, + "loss": 0.9958, + "step": 2544 + }, + { + "epoch": 0.2676587745014264, + "grad_norm": 2.895384366283835, + "learning_rate": 4.212024895299437e-06, + "loss": 0.9963, + "step": 2545 + }, + { + "epoch": 0.2677639449432737, + "grad_norm": 2.6493035965793443, + "learning_rate": 4.211416448787686e-06, + "loss": 1.0128, + "step": 2546 + }, + { + "epoch": 0.26786911538512104, + "grad_norm": 2.303330006776538, + "learning_rate": 4.210807811439707e-06, + "loss": 1.0284, + "step": 2547 + }, + { + "epoch": 0.2679742858269683, + "grad_norm": 2.6701335223547833, + "learning_rate": 4.210198983323366e-06, + "loss": 1.0172, + "step": 2548 + }, + { + "epoch": 0.26807945626881563, + "grad_norm": 2.548255245355431, + "learning_rate": 4.209589964506553e-06, + "loss": 1.0094, + "step": 2549 + }, + { + "epoch": 0.26818462671066295, + "grad_norm": 2.5662458724791417, + "learning_rate": 4.2089807550571786e-06, + "loss": 1.0284, + "step": 2550 + }, + { + "epoch": 0.2682897971525103, + "grad_norm": 2.7555215764610255, + "learning_rate": 4.208371355043174e-06, + "loss": 0.9936, + "step": 2551 + }, + { + "epoch": 0.2683949675943576, + "grad_norm": 1.6391645215285977, + "learning_rate": 4.207761764532493e-06, + "loss": 1.0162, + "step": 2552 + }, + { + "epoch": 0.2685001380362049, + "grad_norm": 2.861451794419541, + "learning_rate": 4.207151983593109e-06, + "loss": 1.0493, + "step": 2553 + }, + { + "epoch": 0.26860530847805225, + "grad_norm": 3.000252612225211, + "learning_rate": 4.206542012293016e-06, + "loss": 0.9965, + "step": 2554 + }, + { + "epoch": 0.2687104789198996, + "grad_norm": 3.0326862394669813, + "learning_rate": 4.205931850700232e-06, + "loss": 1.0026, + "step": 2555 + }, + { + "epoch": 0.2688156493617469, + "grad_norm": 2.6432936556834026, + "learning_rate": 4.205321498882795e-06, + "loss": 1.0279, + "step": 2556 + }, + { + "epoch": 0.2689208198035942, + "grad_norm": 2.9469947026091146, + "learning_rate": 4.204710956908763e-06, + "loss": 0.9982, + "step": 2557 + }, + { + "epoch": 0.2690259902454415, + "grad_norm": 3.0253527353338447, + "learning_rate": 
4.204100224846217e-06, + "loss": 1.0196, + "step": 2558 + }, + { + "epoch": 0.2691311606872888, + "grad_norm": 2.8517981562593753, + "learning_rate": 4.203489302763258e-06, + "loss": 1.0275, + "step": 2559 + }, + { + "epoch": 0.26923633112913614, + "grad_norm": 2.4348277733854418, + "learning_rate": 4.202878190728009e-06, + "loss": 0.9938, + "step": 2560 + }, + { + "epoch": 0.26934150157098347, + "grad_norm": 3.561918888528251, + "learning_rate": 4.202266888808613e-06, + "loss": 1.041, + "step": 2561 + }, + { + "epoch": 0.2694466720128308, + "grad_norm": 2.0281988514230394, + "learning_rate": 4.201655397073234e-06, + "loss": 1.0193, + "step": 2562 + }, + { + "epoch": 0.2695518424546781, + "grad_norm": 2.6293480142232317, + "learning_rate": 4.2010437155900605e-06, + "loss": 1.0125, + "step": 2563 + }, + { + "epoch": 0.26965701289652544, + "grad_norm": 3.250687419475308, + "learning_rate": 4.200431844427299e-06, + "loss": 0.993, + "step": 2564 + }, + { + "epoch": 0.26976218333837276, + "grad_norm": 2.42448998982233, + "learning_rate": 4.199819783653177e-06, + "loss": 1.0116, + "step": 2565 + }, + { + "epoch": 0.2698673537802201, + "grad_norm": 2.6337946712929936, + "learning_rate": 4.199207533335944e-06, + "loss": 0.9941, + "step": 2566 + }, + { + "epoch": 0.2699725242220674, + "grad_norm": 2.710054204566471, + "learning_rate": 4.198595093543871e-06, + "loss": 0.9949, + "step": 2567 + }, + { + "epoch": 0.2700776946639147, + "grad_norm": 2.697631288571504, + "learning_rate": 4.197982464345251e-06, + "loss": 0.9725, + "step": 2568 + }, + { + "epoch": 0.270182865105762, + "grad_norm": 2.5276904086562926, + "learning_rate": 4.197369645808394e-06, + "loss": 1.0171, + "step": 2569 + }, + { + "epoch": 0.27028803554760933, + "grad_norm": 2.46942882217501, + "learning_rate": 4.196756638001638e-06, + "loss": 1.0111, + "step": 2570 + }, + { + "epoch": 0.27039320598945665, + "grad_norm": 2.657116364982858, + "learning_rate": 4.196143440993335e-06, + "loss": 1.0135, + "step": 2571 + }, + { + "epoch": 0.270498376431304, + "grad_norm": 2.173209952959072, + "learning_rate": 4.195530054851863e-06, + "loss": 0.9677, + "step": 2572 + }, + { + "epoch": 0.2706035468731513, + "grad_norm": 2.22871510749567, + "learning_rate": 4.1949164796456174e-06, + "loss": 1.0164, + "step": 2573 + }, + { + "epoch": 0.2707087173149986, + "grad_norm": 1.903168210740841, + "learning_rate": 4.194302715443018e-06, + "loss": 0.9581, + "step": 2574 + }, + { + "epoch": 0.27081388775684595, + "grad_norm": 1.8872564936631262, + "learning_rate": 4.193688762312504e-06, + "loss": 0.9807, + "step": 2575 + }, + { + "epoch": 0.2709190581986933, + "grad_norm": 2.456914305309123, + "learning_rate": 4.193074620322536e-06, + "loss": 1.0195, + "step": 2576 + }, + { + "epoch": 0.2710242286405406, + "grad_norm": 3.185302938492518, + "learning_rate": 4.192460289541596e-06, + "loss": 0.9971, + "step": 2577 + }, + { + "epoch": 0.27112939908238787, + "grad_norm": 3.314584198817348, + "learning_rate": 4.191845770038186e-06, + "loss": 0.9779, + "step": 2578 + }, + { + "epoch": 0.2712345695242352, + "grad_norm": 2.876042479155621, + "learning_rate": 4.19123106188083e-06, + "loss": 0.9777, + "step": 2579 + }, + { + "epoch": 0.2713397399660825, + "grad_norm": 2.446043436959705, + "learning_rate": 4.1906161651380725e-06, + "loss": 1.0048, + "step": 2580 + }, + { + "epoch": 0.27144491040792984, + "grad_norm": 2.376061110547315, + "learning_rate": 4.19000107987848e-06, + "loss": 1.0115, + "step": 2581 + }, + { + "epoch": 0.27155008084977716, + "grad_norm": 
2.681106684018155, + "learning_rate": 4.189385806170637e-06, + "loss": 1.027, + "step": 2582 + }, + { + "epoch": 0.2716552512916245, + "grad_norm": 3.098522232486938, + "learning_rate": 4.188770344083155e-06, + "loss": 0.9921, + "step": 2583 + }, + { + "epoch": 0.2717604217334718, + "grad_norm": 2.90935844056823, + "learning_rate": 4.188154693684659e-06, + "loss": 1.0271, + "step": 2584 + }, + { + "epoch": 0.27186559217531914, + "grad_norm": 3.1110671923281936, + "learning_rate": 4.187538855043802e-06, + "loss": 1.0198, + "step": 2585 + }, + { + "epoch": 0.27197076261716646, + "grad_norm": 2.682316070595872, + "learning_rate": 4.186922828229254e-06, + "loss": 1.0307, + "step": 2586 + }, + { + "epoch": 0.2720759330590138, + "grad_norm": 3.015097650865152, + "learning_rate": 4.186306613309704e-06, + "loss": 1.0326, + "step": 2587 + }, + { + "epoch": 0.2721811035008611, + "grad_norm": 2.366488727509157, + "learning_rate": 4.185690210353869e-06, + "loss": 1.0285, + "step": 2588 + }, + { + "epoch": 0.2722862739427084, + "grad_norm": 2.6756347836291523, + "learning_rate": 4.185073619430479e-06, + "loss": 1.0118, + "step": 2589 + }, + { + "epoch": 0.2723914443845557, + "grad_norm": 2.001448901496582, + "learning_rate": 4.184456840608291e-06, + "loss": 1.0067, + "step": 2590 + }, + { + "epoch": 0.272496614826403, + "grad_norm": 2.425491517107179, + "learning_rate": 4.183839873956081e-06, + "loss": 0.9963, + "step": 2591 + }, + { + "epoch": 0.27260178526825035, + "grad_norm": 3.4036870716254417, + "learning_rate": 4.183222719542643e-06, + "loss": 0.9999, + "step": 2592 + }, + { + "epoch": 0.2727069557100977, + "grad_norm": 2.720894946053787, + "learning_rate": 4.182605377436797e-06, + "loss": 1.0125, + "step": 2593 + }, + { + "epoch": 0.272812126151945, + "grad_norm": 2.6441136268156793, + "learning_rate": 4.18198784770738e-06, + "loss": 1.0377, + "step": 2594 + }, + { + "epoch": 0.2729172965937923, + "grad_norm": 3.0457102136253273, + "learning_rate": 4.1813701304232515e-06, + "loss": 1.0487, + "step": 2595 + }, + { + "epoch": 0.27302246703563965, + "grad_norm": 2.559547849904084, + "learning_rate": 4.1807522256532925e-06, + "loss": 0.9605, + "step": 2596 + }, + { + "epoch": 0.273127637477487, + "grad_norm": 1.8862838127732708, + "learning_rate": 4.1801341334664035e-06, + "loss": 0.9979, + "step": 2597 + }, + { + "epoch": 0.2732328079193343, + "grad_norm": 3.6119148619521337, + "learning_rate": 4.179515853931507e-06, + "loss": 1.0307, + "step": 2598 + }, + { + "epoch": 0.27333797836118157, + "grad_norm": 3.286509646757304, + "learning_rate": 4.178897387117547e-06, + "loss": 0.9789, + "step": 2599 + }, + { + "epoch": 0.2734431488030289, + "grad_norm": 2.7213519725866573, + "learning_rate": 4.178278733093485e-06, + "loss": 1.0111, + "step": 2600 + }, + { + "epoch": 0.2735483192448762, + "grad_norm": 2.352945473282292, + "learning_rate": 4.177659891928307e-06, + "loss": 1.0025, + "step": 2601 + }, + { + "epoch": 0.27365348968672354, + "grad_norm": 2.5229083023446015, + "learning_rate": 4.1770408636910185e-06, + "loss": 1.0267, + "step": 2602 + }, + { + "epoch": 0.27375866012857086, + "grad_norm": 2.818848626261983, + "learning_rate": 4.176421648450646e-06, + "loss": 0.9997, + "step": 2603 + }, + { + "epoch": 0.2738638305704182, + "grad_norm": 2.737597916134103, + "learning_rate": 4.175802246276237e-06, + "loss": 1.0182, + "step": 2604 + }, + { + "epoch": 0.2739690010122655, + "grad_norm": 2.3272831659875393, + "learning_rate": 4.1751826572368596e-06, + "loss": 1.0206, + "step": 2605 + }, + { + 
"epoch": 0.27407417145411284, + "grad_norm": 3.029868785399325, + "learning_rate": 4.174562881401602e-06, + "loss": 1.0163, + "step": 2606 + }, + { + "epoch": 0.27417934189596016, + "grad_norm": 3.336473793800413, + "learning_rate": 4.173942918839576e-06, + "loss": 0.976, + "step": 2607 + }, + { + "epoch": 0.2742845123378075, + "grad_norm": 2.6514956429533942, + "learning_rate": 4.17332276961991e-06, + "loss": 1.0177, + "step": 2608 + }, + { + "epoch": 0.27438968277965475, + "grad_norm": 2.679420445962811, + "learning_rate": 4.172702433811756e-06, + "loss": 1.0326, + "step": 2609 + }, + { + "epoch": 0.2744948532215021, + "grad_norm": 3.6351143708377616, + "learning_rate": 4.172081911484287e-06, + "loss": 1.005, + "step": 2610 + }, + { + "epoch": 0.2746000236633494, + "grad_norm": 2.9056209865867184, + "learning_rate": 4.171461202706696e-06, + "loss": 1.0271, + "step": 2611 + }, + { + "epoch": 0.2747051941051967, + "grad_norm": 2.417998566001529, + "learning_rate": 4.170840307548196e-06, + "loss": 1.02, + "step": 2612 + }, + { + "epoch": 0.27481036454704405, + "grad_norm": 2.9211736184925505, + "learning_rate": 4.170219226078023e-06, + "loss": 1.0503, + "step": 2613 + }, + { + "epoch": 0.2749155349888914, + "grad_norm": 2.418767483745108, + "learning_rate": 4.169597958365431e-06, + "loss": 1.0352, + "step": 2614 + }, + { + "epoch": 0.2750207054307387, + "grad_norm": 1.9237097504393978, + "learning_rate": 4.1689765044796965e-06, + "loss": 1.0098, + "step": 2615 + }, + { + "epoch": 0.275125875872586, + "grad_norm": 2.6838641339374902, + "learning_rate": 4.168354864490117e-06, + "loss": 1.0082, + "step": 2616 + }, + { + "epoch": 0.27523104631443335, + "grad_norm": 2.577137912600681, + "learning_rate": 4.16773303846601e-06, + "loss": 1.0114, + "step": 2617 + }, + { + "epoch": 0.27533621675628067, + "grad_norm": 3.0299539149461783, + "learning_rate": 4.167111026476714e-06, + "loss": 1.0265, + "step": 2618 + }, + { + "epoch": 0.27544138719812794, + "grad_norm": 2.3354635781135595, + "learning_rate": 4.166488828591587e-06, + "loss": 1.0034, + "step": 2619 + }, + { + "epoch": 0.27554655763997526, + "grad_norm": 3.2046716724463673, + "learning_rate": 4.1658664448800105e-06, + "loss": 1.0174, + "step": 2620 + }, + { + "epoch": 0.2756517280818226, + "grad_norm": 2.4271263749473597, + "learning_rate": 4.165243875411385e-06, + "loss": 1.038, + "step": 2621 + }, + { + "epoch": 0.2757568985236699, + "grad_norm": 2.946133587548101, + "learning_rate": 4.16462112025513e-06, + "loss": 1.0256, + "step": 2622 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.4337734453214823, + "learning_rate": 4.163998179480689e-06, + "loss": 1.0307, + "step": 2623 + }, + { + "epoch": 0.27596723940736456, + "grad_norm": 2.551909935384673, + "learning_rate": 4.163375053157526e-06, + "loss": 1.0084, + "step": 2624 + }, + { + "epoch": 0.2760724098492119, + "grad_norm": 3.4353272254389964, + "learning_rate": 4.162751741355122e-06, + "loss": 1.0516, + "step": 2625 + }, + { + "epoch": 0.2761775802910592, + "grad_norm": 2.070361498390321, + "learning_rate": 4.1621282441429824e-06, + "loss": 0.9945, + "step": 2626 + }, + { + "epoch": 0.27628275073290653, + "grad_norm": 2.6297532263169803, + "learning_rate": 4.161504561590632e-06, + "loss": 1.0422, + "step": 2627 + }, + { + "epoch": 0.27638792117475386, + "grad_norm": 2.225063604883171, + "learning_rate": 4.1608806937676156e-06, + "loss": 1.0377, + "step": 2628 + }, + { + "epoch": 0.2764930916166011, + "grad_norm": 2.1970819975722082, + "learning_rate": 4.160256640743499e-06, + 
"loss": 1.0056, + "step": 2629 + }, + { + "epoch": 0.27659826205844845, + "grad_norm": 1.8843326423778457, + "learning_rate": 4.159632402587871e-06, + "loss": 1.0027, + "step": 2630 + }, + { + "epoch": 0.2767034325002958, + "grad_norm": 2.384132121607812, + "learning_rate": 4.159007979370337e-06, + "loss": 0.9889, + "step": 2631 + }, + { + "epoch": 0.2768086029421431, + "grad_norm": 2.3542048671824496, + "learning_rate": 4.158383371160526e-06, + "loss": 0.9994, + "step": 2632 + }, + { + "epoch": 0.2769137733839904, + "grad_norm": 1.9109594616784749, + "learning_rate": 4.157758578028086e-06, + "loss": 0.9851, + "step": 2633 + }, + { + "epoch": 0.27701894382583775, + "grad_norm": 2.719043073466606, + "learning_rate": 4.1571336000426865e-06, + "loss": 0.9938, + "step": 2634 + }, + { + "epoch": 0.2771241142676851, + "grad_norm": 2.3398878575871818, + "learning_rate": 4.156508437274017e-06, + "loss": 0.9909, + "step": 2635 + }, + { + "epoch": 0.2772292847095324, + "grad_norm": 2.6076473869071077, + "learning_rate": 4.15588308979179e-06, + "loss": 1.0094, + "step": 2636 + }, + { + "epoch": 0.2773344551513797, + "grad_norm": 2.9186482297651373, + "learning_rate": 4.155257557665734e-06, + "loss": 1.0107, + "step": 2637 + }, + { + "epoch": 0.27743962559322705, + "grad_norm": 2.924548529106016, + "learning_rate": 4.154631840965603e-06, + "loss": 1.0273, + "step": 2638 + }, + { + "epoch": 0.2775447960350743, + "grad_norm": 2.79169988893619, + "learning_rate": 4.154005939761167e-06, + "loss": 1.045, + "step": 2639 + }, + { + "epoch": 0.27764996647692164, + "grad_norm": 2.5444212624663476, + "learning_rate": 4.15337985412222e-06, + "loss": 0.9968, + "step": 2640 + }, + { + "epoch": 0.27775513691876896, + "grad_norm": 2.770240170620273, + "learning_rate": 4.152753584118576e-06, + "loss": 1.0392, + "step": 2641 + }, + { + "epoch": 0.2778603073606163, + "grad_norm": 3.112319778752494, + "learning_rate": 4.152127129820067e-06, + "loss": 1.0249, + "step": 2642 + }, + { + "epoch": 0.2779654778024636, + "grad_norm": 2.310464927042958, + "learning_rate": 4.151500491296551e-06, + "loss": 0.9965, + "step": 2643 + }, + { + "epoch": 0.27807064824431094, + "grad_norm": 3.2829488475899558, + "learning_rate": 4.150873668617899e-06, + "loss": 0.9908, + "step": 2644 + }, + { + "epoch": 0.27817581868615826, + "grad_norm": 2.176507323056151, + "learning_rate": 4.150246661854009e-06, + "loss": 0.9991, + "step": 2645 + }, + { + "epoch": 0.2782809891280056, + "grad_norm": 1.5655734274913256, + "learning_rate": 4.149619471074797e-06, + "loss": 0.9418, + "step": 2646 + }, + { + "epoch": 0.2783861595698529, + "grad_norm": 2.108505326156569, + "learning_rate": 4.148992096350199e-06, + "loss": 1.0599, + "step": 2647 + }, + { + "epoch": 0.27849133001170023, + "grad_norm": 3.3136337973670202, + "learning_rate": 4.1483645377501726e-06, + "loss": 1.0149, + "step": 2648 + }, + { + "epoch": 0.27859650045354756, + "grad_norm": 2.54494923113061, + "learning_rate": 4.147736795344695e-06, + "loss": 0.9897, + "step": 2649 + }, + { + "epoch": 0.2787016708953948, + "grad_norm": 3.225755330401636, + "learning_rate": 4.147108869203765e-06, + "loss": 1.0042, + "step": 2650 + }, + { + "epoch": 0.27880684133724215, + "grad_norm": 2.7331184633216123, + "learning_rate": 4.146480759397401e-06, + "loss": 1.0194, + "step": 2651 + }, + { + "epoch": 0.2789120117790895, + "grad_norm": 3.0050939408575794, + "learning_rate": 4.145852465995642e-06, + "loss": 0.9754, + "step": 2652 + }, + { + "epoch": 0.2790171822209368, + "grad_norm": 1.8464056831154667, + 
"learning_rate": 4.145223989068547e-06, + "loss": 0.9801, + "step": 2653 + }, + { + "epoch": 0.2791223526627841, + "grad_norm": 3.288289011526489, + "learning_rate": 4.1445953286861976e-06, + "loss": 1.0099, + "step": 2654 + }, + { + "epoch": 0.27922752310463145, + "grad_norm": 2.9001394289482763, + "learning_rate": 4.143966484918692e-06, + "loss": 1.0246, + "step": 2655 + }, + { + "epoch": 0.27933269354647877, + "grad_norm": 2.7075837385137715, + "learning_rate": 4.143337457836154e-06, + "loss": 1.0293, + "step": 2656 + }, + { + "epoch": 0.2794378639883261, + "grad_norm": 3.4531421732250114, + "learning_rate": 4.142708247508723e-06, + "loss": 1.015, + "step": 2657 + }, + { + "epoch": 0.2795430344301734, + "grad_norm": 2.473925702976424, + "learning_rate": 4.142078854006561e-06, + "loss": 1.0234, + "step": 2658 + }, + { + "epoch": 0.27964820487202074, + "grad_norm": 2.533831100531155, + "learning_rate": 4.141449277399852e-06, + "loss": 1.04, + "step": 2659 + }, + { + "epoch": 0.279753375313868, + "grad_norm": 2.9141485074580458, + "learning_rate": 4.140819517758796e-06, + "loss": 1.0459, + "step": 2660 + }, + { + "epoch": 0.27985854575571534, + "grad_norm": 1.6275648359890877, + "learning_rate": 4.1401895751536185e-06, + "loss": 1.0112, + "step": 2661 + }, + { + "epoch": 0.27996371619756266, + "grad_norm": 2.3074858640345064, + "learning_rate": 4.139559449654561e-06, + "loss": 1.0349, + "step": 2662 + }, + { + "epoch": 0.28006888663941, + "grad_norm": 2.266029848888559, + "learning_rate": 4.138929141331888e-06, + "loss": 1.0054, + "step": 2663 + }, + { + "epoch": 0.2801740570812573, + "grad_norm": 2.30011851501213, + "learning_rate": 4.138298650255885e-06, + "loss": 1.0183, + "step": 2664 + }, + { + "epoch": 0.28027922752310463, + "grad_norm": 2.0840813611244378, + "learning_rate": 4.137667976496855e-06, + "loss": 1.0502, + "step": 2665 + }, + { + "epoch": 0.28038439796495196, + "grad_norm": 2.1068525223894343, + "learning_rate": 4.137037120125125e-06, + "loss": 0.9381, + "step": 2666 + }, + { + "epoch": 0.2804895684067993, + "grad_norm": 2.7828354238745967, + "learning_rate": 4.136406081211038e-06, + "loss": 1.019, + "step": 2667 + }, + { + "epoch": 0.2805947388486466, + "grad_norm": 2.6442032647808733, + "learning_rate": 4.135774859824962e-06, + "loss": 1.0206, + "step": 2668 + }, + { + "epoch": 0.28069990929049393, + "grad_norm": 2.5457929175843677, + "learning_rate": 4.13514345603728e-06, + "loss": 0.9947, + "step": 2669 + }, + { + "epoch": 0.2808050797323412, + "grad_norm": 3.348381030141236, + "learning_rate": 4.134511869918402e-06, + "loss": 1.0436, + "step": 2670 + }, + { + "epoch": 0.2809102501741885, + "grad_norm": 1.8730510261518738, + "learning_rate": 4.133880101538753e-06, + "loss": 0.9471, + "step": 2671 + }, + { + "epoch": 0.28101542061603585, + "grad_norm": 2.39799470254033, + "learning_rate": 4.13324815096878e-06, + "loss": 1.0186, + "step": 2672 + }, + { + "epoch": 0.2811205910578832, + "grad_norm": 2.626761498640316, + "learning_rate": 4.132616018278951e-06, + "loss": 0.9936, + "step": 2673 + }, + { + "epoch": 0.2812257614997305, + "grad_norm": 2.7240523919341504, + "learning_rate": 4.1319837035397525e-06, + "loss": 0.9984, + "step": 2674 + }, + { + "epoch": 0.2813309319415778, + "grad_norm": 2.856921462750168, + "learning_rate": 4.131351206821694e-06, + "loss": 1.0513, + "step": 2675 + }, + { + "epoch": 0.28143610238342515, + "grad_norm": 1.9943496303192936, + "learning_rate": 4.130718528195303e-06, + "loss": 1.0222, + "step": 2676 + }, + { + "epoch": 
0.28154127282527247, + "grad_norm": 4.2289456038778, + "learning_rate": 4.130085667731129e-06, + "loss": 1.0211, + "step": 2677 + }, + { + "epoch": 0.2816464432671198, + "grad_norm": 2.9894225702147543, + "learning_rate": 4.129452625499739e-06, + "loss": 1.0277, + "step": 2678 + }, + { + "epoch": 0.2817516137089671, + "grad_norm": 1.992493935234122, + "learning_rate": 4.128819401571724e-06, + "loss": 1.0171, + "step": 2679 + }, + { + "epoch": 0.2818567841508144, + "grad_norm": 2.652762924314855, + "learning_rate": 4.128185996017692e-06, + "loss": 1.0551, + "step": 2680 + }, + { + "epoch": 0.2819619545926617, + "grad_norm": 2.604810015626862, + "learning_rate": 4.127552408908274e-06, + "loss": 1.0353, + "step": 2681 + }, + { + "epoch": 0.28206712503450904, + "grad_norm": 2.7095942486504505, + "learning_rate": 4.126918640314118e-06, + "loss": 1.0172, + "step": 2682 + }, + { + "epoch": 0.28217229547635636, + "grad_norm": 3.2119845572549863, + "learning_rate": 4.1262846903058966e-06, + "loss": 0.9824, + "step": 2683 + }, + { + "epoch": 0.2822774659182037, + "grad_norm": 2.4305337457552003, + "learning_rate": 4.1256505589542976e-06, + "loss": 1.0247, + "step": 2684 + }, + { + "epoch": 0.282382636360051, + "grad_norm": 2.0472831189235277, + "learning_rate": 4.125016246330034e-06, + "loss": 1.0256, + "step": 2685 + }, + { + "epoch": 0.28248780680189833, + "grad_norm": 2.1373038919099687, + "learning_rate": 4.124381752503834e-06, + "loss": 1.0105, + "step": 2686 + }, + { + "epoch": 0.28259297724374566, + "grad_norm": 1.5615996486183839, + "learning_rate": 4.123747077546451e-06, + "loss": 1.0378, + "step": 2687 + }, + { + "epoch": 0.282698147685593, + "grad_norm": 1.8388087927806853, + "learning_rate": 4.123112221528654e-06, + "loss": 1.003, + "step": 2688 + }, + { + "epoch": 0.2828033181274403, + "grad_norm": 3.487408850014367, + "learning_rate": 4.122477184521237e-06, + "loss": 1.0202, + "step": 2689 + }, + { + "epoch": 0.2829084885692876, + "grad_norm": 2.666547606564312, + "learning_rate": 4.121841966595009e-06, + "loss": 1.042, + "step": 2690 + }, + { + "epoch": 0.2830136590111349, + "grad_norm": 2.99015276586862, + "learning_rate": 4.121206567820803e-06, + "loss": 1.0559, + "step": 2691 + }, + { + "epoch": 0.2831188294529822, + "grad_norm": 2.866974473703025, + "learning_rate": 4.120570988269472e-06, + "loss": 0.9848, + "step": 2692 + }, + { + "epoch": 0.28322399989482955, + "grad_norm": 2.50728052143124, + "learning_rate": 4.119935228011885e-06, + "loss": 1.0286, + "step": 2693 + }, + { + "epoch": 0.28332917033667687, + "grad_norm": 2.3872886212034095, + "learning_rate": 4.119299287118937e-06, + "loss": 1.0258, + "step": 2694 + }, + { + "epoch": 0.2834343407785242, + "grad_norm": 1.7175832511041358, + "learning_rate": 4.118663165661538e-06, + "loss": 1.0324, + "step": 2695 + }, + { + "epoch": 0.2835395112203715, + "grad_norm": 2.657674607317461, + "learning_rate": 4.118026863710623e-06, + "loss": 1.0573, + "step": 2696 + }, + { + "epoch": 0.28364468166221884, + "grad_norm": 2.5589768139758755, + "learning_rate": 4.117390381337144e-06, + "loss": 0.9864, + "step": 2697 + }, + { + "epoch": 0.28374985210406617, + "grad_norm": 3.1191553451509093, + "learning_rate": 4.116753718612072e-06, + "loss": 0.9765, + "step": 2698 + }, + { + "epoch": 0.2838550225459135, + "grad_norm": 3.6343579637339842, + "learning_rate": 4.116116875606402e-06, + "loss": 1.0286, + "step": 2699 + }, + { + "epoch": 0.28396019298776076, + "grad_norm": 2.884280278934574, + "learning_rate": 4.1154798523911446e-06, + "loss": 
1.0308, + "step": 2700 + }, + { + "epoch": 0.2840653634296081, + "grad_norm": 2.2973711289546603, + "learning_rate": 4.114842649037335e-06, + "loss": 0.965, + "step": 2701 + }, + { + "epoch": 0.2841705338714554, + "grad_norm": 2.870025826884082, + "learning_rate": 4.114205265616026e-06, + "loss": 0.9826, + "step": 2702 + }, + { + "epoch": 0.28427570431330274, + "grad_norm": 2.8685291174085332, + "learning_rate": 4.1135677021982885e-06, + "loss": 1.0087, + "step": 2703 + }, + { + "epoch": 0.28438087475515006, + "grad_norm": 3.000969023465314, + "learning_rate": 4.11292995885522e-06, + "loss": 1.0142, + "step": 2704 + }, + { + "epoch": 0.2844860451969974, + "grad_norm": 2.382052707066313, + "learning_rate": 4.11229203565793e-06, + "loss": 0.9957, + "step": 2705 + }, + { + "epoch": 0.2845912156388447, + "grad_norm": 2.5092385915195723, + "learning_rate": 4.111653932677553e-06, + "loss": 0.97, + "step": 2706 + }, + { + "epoch": 0.28469638608069203, + "grad_norm": 2.83339755995173, + "learning_rate": 4.111015649985243e-06, + "loss": 0.9873, + "step": 2707 + }, + { + "epoch": 0.28480155652253936, + "grad_norm": 2.7074863691662046, + "learning_rate": 4.110377187652174e-06, + "loss": 0.987, + "step": 2708 + }, + { + "epoch": 0.2849067269643867, + "grad_norm": 1.9740060538086266, + "learning_rate": 4.109738545749538e-06, + "loss": 0.9809, + "step": 2709 + }, + { + "epoch": 0.285011897406234, + "grad_norm": 3.3206180620517114, + "learning_rate": 4.1090997243485494e-06, + "loss": 1.0318, + "step": 2710 + }, + { + "epoch": 0.2851170678480813, + "grad_norm": 3.4193647547163435, + "learning_rate": 4.108460723520441e-06, + "loss": 1.0323, + "step": 2711 + }, + { + "epoch": 0.2852222382899286, + "grad_norm": 2.461999394482132, + "learning_rate": 4.107821543336468e-06, + "loss": 0.969, + "step": 2712 + }, + { + "epoch": 0.2853274087317759, + "grad_norm": 2.033813622679507, + "learning_rate": 4.107182183867903e-06, + "loss": 0.9876, + "step": 2713 + }, + { + "epoch": 0.28543257917362325, + "grad_norm": 2.7246823900429673, + "learning_rate": 4.106542645186039e-06, + "loss": 0.9927, + "step": 2714 + }, + { + "epoch": 0.28553774961547057, + "grad_norm": 2.869297378241626, + "learning_rate": 4.10590292736219e-06, + "loss": 1.0259, + "step": 2715 + }, + { + "epoch": 0.2856429200573179, + "grad_norm": 2.521121687829408, + "learning_rate": 4.105263030467689e-06, + "loss": 1.0261, + "step": 2716 + }, + { + "epoch": 0.2857480904991652, + "grad_norm": 2.599567696407617, + "learning_rate": 4.10462295457389e-06, + "loss": 1.0017, + "step": 2717 + }, + { + "epoch": 0.28585326094101254, + "grad_norm": 2.024220267749111, + "learning_rate": 4.103982699752167e-06, + "loss": 0.9992, + "step": 2718 + }, + { + "epoch": 0.28595843138285987, + "grad_norm": 2.6696411790056565, + "learning_rate": 4.103342266073913e-06, + "loss": 1.0149, + "step": 2719 + }, + { + "epoch": 0.2860636018247072, + "grad_norm": 1.8522315223803922, + "learning_rate": 4.102701653610541e-06, + "loss": 1.0337, + "step": 2720 + }, + { + "epoch": 0.28616877226655446, + "grad_norm": 2.7949963553772075, + "learning_rate": 4.102060862433484e-06, + "loss": 0.9954, + "step": 2721 + }, + { + "epoch": 0.2862739427084018, + "grad_norm": 2.229728603318455, + "learning_rate": 4.101419892614195e-06, + "loss": 1.0195, + "step": 2722 + }, + { + "epoch": 0.2863791131502491, + "grad_norm": 3.3540545324036914, + "learning_rate": 4.10077874422415e-06, + "loss": 0.9692, + "step": 2723 + }, + { + "epoch": 0.28648428359209643, + "grad_norm": 2.7750001031832183, + 
"learning_rate": 4.100137417334838e-06, + "loss": 1.0112, + "step": 2724 + }, + { + "epoch": 0.28658945403394376, + "grad_norm": 2.8776802158198462, + "learning_rate": 4.099495912017773e-06, + "loss": 1.0133, + "step": 2725 + }, + { + "epoch": 0.2866946244757911, + "grad_norm": 2.728837469705898, + "learning_rate": 4.09885422834449e-06, + "loss": 1.0062, + "step": 2726 + }, + { + "epoch": 0.2867997949176384, + "grad_norm": 3.151019351944655, + "learning_rate": 4.0982123663865394e-06, + "loss": 1.0328, + "step": 2727 + }, + { + "epoch": 0.28690496535948573, + "grad_norm": 2.278189959064822, + "learning_rate": 4.097570326215495e-06, + "loss": 0.9912, + "step": 2728 + }, + { + "epoch": 0.28701013580133306, + "grad_norm": 3.0741949886346394, + "learning_rate": 4.096928107902949e-06, + "loss": 1.0505, + "step": 2729 + }, + { + "epoch": 0.2871153062431804, + "grad_norm": 2.2526302976277495, + "learning_rate": 4.096285711520513e-06, + "loss": 1.0154, + "step": 2730 + }, + { + "epoch": 0.28722047668502765, + "grad_norm": 2.4076837142276974, + "learning_rate": 4.09564313713982e-06, + "loss": 1.0188, + "step": 2731 + }, + { + "epoch": 0.287325647126875, + "grad_norm": 1.8879117425120007, + "learning_rate": 4.095000384832522e-06, + "loss": 1.0139, + "step": 2732 + }, + { + "epoch": 0.2874308175687223, + "grad_norm": 3.1660647193201643, + "learning_rate": 4.09435745467029e-06, + "loss": 0.9862, + "step": 2733 + }, + { + "epoch": 0.2875359880105696, + "grad_norm": 2.4093841491214922, + "learning_rate": 4.0937143467248176e-06, + "loss": 1.038, + "step": 2734 + }, + { + "epoch": 0.28764115845241695, + "grad_norm": 2.3022115233534937, + "learning_rate": 4.093071061067815e-06, + "loss": 1.0139, + "step": 2735 + }, + { + "epoch": 0.28774632889426427, + "grad_norm": 2.285699802794061, + "learning_rate": 4.092427597771013e-06, + "loss": 0.9595, + "step": 2736 + }, + { + "epoch": 0.2878514993361116, + "grad_norm": 2.7549603896627004, + "learning_rate": 4.091783956906164e-06, + "loss": 1.0187, + "step": 2737 + }, + { + "epoch": 0.2879566697779589, + "grad_norm": 3.483788671598276, + "learning_rate": 4.091140138545037e-06, + "loss": 1.0093, + "step": 2738 + }, + { + "epoch": 0.28806184021980624, + "grad_norm": 3.2320477916369437, + "learning_rate": 4.090496142759425e-06, + "loss": 1.0223, + "step": 2739 + }, + { + "epoch": 0.28816701066165357, + "grad_norm": 2.4540897960845, + "learning_rate": 4.089851969621138e-06, + "loss": 1.0203, + "step": 2740 + }, + { + "epoch": 0.28827218110350084, + "grad_norm": 3.241726113530304, + "learning_rate": 4.089207619202006e-06, + "loss": 1.015, + "step": 2741 + }, + { + "epoch": 0.28837735154534816, + "grad_norm": 1.8502699365222972, + "learning_rate": 4.088563091573879e-06, + "loss": 1.0269, + "step": 2742 + }, + { + "epoch": 0.2884825219871955, + "grad_norm": 2.5427369122444423, + "learning_rate": 4.087918386808627e-06, + "loss": 1.0118, + "step": 2743 + }, + { + "epoch": 0.2885876924290428, + "grad_norm": 2.301419718364066, + "learning_rate": 4.087273504978139e-06, + "loss": 1.0222, + "step": 2744 + }, + { + "epoch": 0.28869286287089013, + "grad_norm": 2.917540225698338, + "learning_rate": 4.086628446154325e-06, + "loss": 1.0672, + "step": 2745 + }, + { + "epoch": 0.28879803331273746, + "grad_norm": 3.578547174167657, + "learning_rate": 4.085983210409114e-06, + "loss": 0.9953, + "step": 2746 + }, + { + "epoch": 0.2889032037545848, + "grad_norm": 3.3534719860895152, + "learning_rate": 4.085337797814455e-06, + "loss": 0.9982, + "step": 2747 + }, + { + "epoch": 
0.2890083741964321, + "grad_norm": 2.676661209030138, + "learning_rate": 4.084692208442316e-06, + "loss": 1.0266, + "step": 2748 + }, + { + "epoch": 0.28911354463827943, + "grad_norm": 3.20816985126381, + "learning_rate": 4.084046442364686e-06, + "loss": 1.0083, + "step": 2749 + }, + { + "epoch": 0.28921871508012675, + "grad_norm": 4.219244894983786, + "learning_rate": 4.0834004996535706e-06, + "loss": 1.0561, + "step": 2750 + }, + { + "epoch": 0.289323885521974, + "grad_norm": 2.1890256985234298, + "learning_rate": 4.082754380381001e-06, + "loss": 0.9447, + "step": 2751 + }, + { + "epoch": 0.28942905596382135, + "grad_norm": 2.8643683405510396, + "learning_rate": 4.082108084619021e-06, + "loss": 0.9889, + "step": 2752 + }, + { + "epoch": 0.28953422640566867, + "grad_norm": 2.51789630360384, + "learning_rate": 4.0814616124397015e-06, + "loss": 0.991, + "step": 2753 + }, + { + "epoch": 0.289639396847516, + "grad_norm": 2.0882264359630436, + "learning_rate": 4.080814963915125e-06, + "loss": 0.9913, + "step": 2754 + }, + { + "epoch": 0.2897445672893633, + "grad_norm": 2.214555512262277, + "learning_rate": 4.0801681391174005e-06, + "loss": 0.9323, + "step": 2755 + }, + { + "epoch": 0.28984973773121064, + "grad_norm": 2.4270616412613424, + "learning_rate": 4.079521138118654e-06, + "loss": 1.0091, + "step": 2756 + }, + { + "epoch": 0.28995490817305797, + "grad_norm": 2.5107109147244344, + "learning_rate": 4.07887396099103e-06, + "loss": 1.0413, + "step": 2757 + }, + { + "epoch": 0.2900600786149053, + "grad_norm": 2.599915906438244, + "learning_rate": 4.078226607806694e-06, + "loss": 0.9893, + "step": 2758 + }, + { + "epoch": 0.2901652490567526, + "grad_norm": 2.2767518525790287, + "learning_rate": 4.077579078637831e-06, + "loss": 1.0241, + "step": 2759 + }, + { + "epoch": 0.29027041949859994, + "grad_norm": 2.794647276010669, + "learning_rate": 4.076931373556646e-06, + "loss": 1.037, + "step": 2760 + }, + { + "epoch": 0.2903755899404472, + "grad_norm": 3.253264621576212, + "learning_rate": 4.076283492635362e-06, + "loss": 1.0122, + "step": 2761 + }, + { + "epoch": 0.29048076038229453, + "grad_norm": 2.499980883018592, + "learning_rate": 4.075635435946225e-06, + "loss": 1.0027, + "step": 2762 + }, + { + "epoch": 0.29058593082414186, + "grad_norm": 2.972899947742262, + "learning_rate": 4.074987203561497e-06, + "loss": 0.9985, + "step": 2763 + }, + { + "epoch": 0.2906911012659892, + "grad_norm": 1.9737479143234022, + "learning_rate": 4.074338795553459e-06, + "loss": 1.0032, + "step": 2764 + }, + { + "epoch": 0.2907962717078365, + "grad_norm": 2.584186588539805, + "learning_rate": 4.073690211994417e-06, + "loss": 1.0373, + "step": 2765 + }, + { + "epoch": 0.29090144214968383, + "grad_norm": 3.1651907150502154, + "learning_rate": 4.0730414529566905e-06, + "loss": 0.9922, + "step": 2766 + }, + { + "epoch": 0.29100661259153116, + "grad_norm": 2.6878955915114253, + "learning_rate": 4.072392518512623e-06, + "loss": 1.0367, + "step": 2767 + }, + { + "epoch": 0.2911117830333785, + "grad_norm": 2.489228399908619, + "learning_rate": 4.071743408734574e-06, + "loss": 1.0302, + "step": 2768 + }, + { + "epoch": 0.2912169534752258, + "grad_norm": 3.060067782653973, + "learning_rate": 4.071094123694926e-06, + "loss": 0.9873, + "step": 2769 + }, + { + "epoch": 0.29132212391707313, + "grad_norm": 2.9513075554961388, + "learning_rate": 4.070444663466079e-06, + "loss": 1.0038, + "step": 2770 + }, + { + "epoch": 0.29142729435892045, + "grad_norm": 2.495540383919349, + "learning_rate": 4.069795028120452e-06, + "loss": 
0.9954, + "step": 2771 + }, + { + "epoch": 0.2915324648007677, + "grad_norm": 4.0277271187757, + "learning_rate": 4.069145217730484e-06, + "loss": 0.9957, + "step": 2772 + }, + { + "epoch": 0.29163763524261505, + "grad_norm": 2.511687233295335, + "learning_rate": 4.068495232368635e-06, + "loss": 0.9685, + "step": 2773 + }, + { + "epoch": 0.29174280568446237, + "grad_norm": 2.1591231007772085, + "learning_rate": 4.067845072107384e-06, + "loss": 1.0597, + "step": 2774 + }, + { + "epoch": 0.2918479761263097, + "grad_norm": 2.4755744056169635, + "learning_rate": 4.0671947370192264e-06, + "loss": 1.0009, + "step": 2775 + }, + { + "epoch": 0.291953146568157, + "grad_norm": 1.7500400514158185, + "learning_rate": 4.066544227176683e-06, + "loss": 1.0042, + "step": 2776 + }, + { + "epoch": 0.29205831701000434, + "grad_norm": 2.7751869337926225, + "learning_rate": 4.065893542652288e-06, + "loss": 0.9955, + "step": 2777 + }, + { + "epoch": 0.29216348745185167, + "grad_norm": 2.0427141686414445, + "learning_rate": 4.065242683518599e-06, + "loss": 0.9956, + "step": 2778 + }, + { + "epoch": 0.292268657893699, + "grad_norm": 2.582374430746467, + "learning_rate": 4.0645916498481905e-06, + "loss": 0.9875, + "step": 2779 + }, + { + "epoch": 0.2923738283355463, + "grad_norm": 2.1183619829247573, + "learning_rate": 4.06394044171366e-06, + "loss": 0.9543, + "step": 2780 + }, + { + "epoch": 0.29247899877739364, + "grad_norm": 2.288872193618224, + "learning_rate": 4.063289059187621e-06, + "loss": 1.0294, + "step": 2781 + }, + { + "epoch": 0.2925841692192409, + "grad_norm": 2.9861250226649356, + "learning_rate": 4.062637502342708e-06, + "loss": 1.0364, + "step": 2782 + }, + { + "epoch": 0.29268933966108823, + "grad_norm": 2.263444569755448, + "learning_rate": 4.061985771251573e-06, + "loss": 1.0157, + "step": 2783 + }, + { + "epoch": 0.29279451010293556, + "grad_norm": 3.25482041267685, + "learning_rate": 4.061333865986892e-06, + "loss": 1.0169, + "step": 2784 + }, + { + "epoch": 0.2928996805447829, + "grad_norm": 2.993652291933884, + "learning_rate": 4.060681786621357e-06, + "loss": 1.0432, + "step": 2785 + }, + { + "epoch": 0.2930048509866302, + "grad_norm": 2.5325196207248517, + "learning_rate": 4.060029533227678e-06, + "loss": 0.9786, + "step": 2786 + }, + { + "epoch": 0.29311002142847753, + "grad_norm": 3.072755160863761, + "learning_rate": 4.059377105878586e-06, + "loss": 1.0081, + "step": 2787 + }, + { + "epoch": 0.29321519187032485, + "grad_norm": 2.9059852060216436, + "learning_rate": 4.058724504646834e-06, + "loss": 0.999, + "step": 2788 + }, + { + "epoch": 0.2933203623121722, + "grad_norm": 2.1664281660182154, + "learning_rate": 4.058071729605191e-06, + "loss": 1.0101, + "step": 2789 + }, + { + "epoch": 0.2934255327540195, + "grad_norm": 2.6755938701735014, + "learning_rate": 4.057418780826448e-06, + "loss": 0.9915, + "step": 2790 + }, + { + "epoch": 0.2935307031958668, + "grad_norm": 1.7269644244858031, + "learning_rate": 4.05676565838341e-06, + "loss": 1.0071, + "step": 2791 + }, + { + "epoch": 0.2936358736377141, + "grad_norm": 2.74362018348315, + "learning_rate": 4.0561123623489096e-06, + "loss": 1.0314, + "step": 2792 + }, + { + "epoch": 0.2937410440795614, + "grad_norm": 2.340557452024337, + "learning_rate": 4.0554588927957925e-06, + "loss": 0.9678, + "step": 2793 + }, + { + "epoch": 0.29384621452140874, + "grad_norm": 2.066040916919864, + "learning_rate": 4.054805249796925e-06, + "loss": 1.0263, + "step": 2794 + }, + { + "epoch": 0.29395138496325607, + "grad_norm": 2.8805200149811956, + 
"learning_rate": 4.054151433425194e-06, + "loss": 1.0603, + "step": 2795 + }, + { + "epoch": 0.2940565554051034, + "grad_norm": 2.434123603671788, + "learning_rate": 4.053497443753505e-06, + "loss": 1.0042, + "step": 2796 + }, + { + "epoch": 0.2941617258469507, + "grad_norm": 3.618193905329278, + "learning_rate": 4.052843280854783e-06, + "loss": 0.9702, + "step": 2797 + }, + { + "epoch": 0.29426689628879804, + "grad_norm": 3.1682364893441193, + "learning_rate": 4.052188944801972e-06, + "loss": 1.0143, + "step": 2798 + }, + { + "epoch": 0.29437206673064537, + "grad_norm": 2.3744837915482484, + "learning_rate": 4.051534435668038e-06, + "loss": 0.9982, + "step": 2799 + }, + { + "epoch": 0.2944772371724927, + "grad_norm": 2.1386699249557872, + "learning_rate": 4.050879753525959e-06, + "loss": 1.0242, + "step": 2800 + }, + { + "epoch": 0.29458240761434, + "grad_norm": 2.6446899519304314, + "learning_rate": 4.050224898448741e-06, + "loss": 0.9404, + "step": 2801 + }, + { + "epoch": 0.2946875780561873, + "grad_norm": 2.381644444210151, + "learning_rate": 4.049569870509404e-06, + "loss": 1.0128, + "step": 2802 + }, + { + "epoch": 0.2947927484980346, + "grad_norm": 2.0924851058105287, + "learning_rate": 4.048914669780989e-06, + "loss": 1.0263, + "step": 2803 + }, + { + "epoch": 0.29489791893988193, + "grad_norm": 2.8128059213176666, + "learning_rate": 4.048259296336556e-06, + "loss": 1.0109, + "step": 2804 + }, + { + "epoch": 0.29500308938172926, + "grad_norm": 2.230026087010622, + "learning_rate": 4.047603750249184e-06, + "loss": 0.9928, + "step": 2805 + }, + { + "epoch": 0.2951082598235766, + "grad_norm": 3.651123695031622, + "learning_rate": 4.0469480315919714e-06, + "loss": 1.0337, + "step": 2806 + }, + { + "epoch": 0.2952134302654239, + "grad_norm": 3.794948500887568, + "learning_rate": 4.0462921404380376e-06, + "loss": 1.0177, + "step": 2807 + }, + { + "epoch": 0.29531860070727123, + "grad_norm": 2.717172683131933, + "learning_rate": 4.045636076860517e-06, + "loss": 0.9645, + "step": 2808 + }, + { + "epoch": 0.29542377114911855, + "grad_norm": 2.8432922404185557, + "learning_rate": 4.044979840932567e-06, + "loss": 1.0147, + "step": 2809 + }, + { + "epoch": 0.2955289415909659, + "grad_norm": 2.466690276825208, + "learning_rate": 4.044323432727363e-06, + "loss": 1.04, + "step": 2810 + }, + { + "epoch": 0.2956341120328132, + "grad_norm": 1.898880562225239, + "learning_rate": 4.0436668523180985e-06, + "loss": 0.9796, + "step": 2811 + }, + { + "epoch": 0.29573928247466047, + "grad_norm": 2.2401687192743265, + "learning_rate": 4.04301009977799e-06, + "loss": 1.0075, + "step": 2812 + }, + { + "epoch": 0.2958444529165078, + "grad_norm": 2.779156433583653, + "learning_rate": 4.042353175180268e-06, + "loss": 1.0163, + "step": 2813 + }, + { + "epoch": 0.2959496233583551, + "grad_norm": 2.6233755857676617, + "learning_rate": 4.041696078598185e-06, + "loss": 1.0308, + "step": 2814 + }, + { + "epoch": 0.29605479380020244, + "grad_norm": 2.89215727215323, + "learning_rate": 4.041038810105014e-06, + "loss": 1.039, + "step": 2815 + }, + { + "epoch": 0.29615996424204977, + "grad_norm": 2.2937244141847275, + "learning_rate": 4.040381369774045e-06, + "loss": 0.9985, + "step": 2816 + }, + { + "epoch": 0.2962651346838971, + "grad_norm": 2.2753449571613507, + "learning_rate": 4.039723757678585e-06, + "loss": 0.9842, + "step": 2817 + }, + { + "epoch": 0.2963703051257444, + "grad_norm": 2.825575794098944, + "learning_rate": 4.0390659738919665e-06, + "loss": 1.0148, + "step": 2818 + }, + { + "epoch": 
0.29647547556759174, + "grad_norm": 2.1237158283334088, + "learning_rate": 4.0384080184875355e-06, + "loss": 0.9896, + "step": 2819 + }, + { + "epoch": 0.29658064600943906, + "grad_norm": 2.8266979036329123, + "learning_rate": 4.037749891538661e-06, + "loss": 1.0097, + "step": 2820 + }, + { + "epoch": 0.2966858164512864, + "grad_norm": 2.2004271894165557, + "learning_rate": 4.037091593118726e-06, + "loss": 1.0418, + "step": 2821 + }, + { + "epoch": 0.2967909868931337, + "grad_norm": 2.0681432005760914, + "learning_rate": 4.036433123301139e-06, + "loss": 0.9897, + "step": 2822 + }, + { + "epoch": 0.296896157334981, + "grad_norm": 2.8211256820827404, + "learning_rate": 4.035774482159323e-06, + "loss": 1.0378, + "step": 2823 + }, + { + "epoch": 0.2970013277768283, + "grad_norm": 2.6606426976386675, + "learning_rate": 4.035115669766721e-06, + "loss": 1.0247, + "step": 2824 + }, + { + "epoch": 0.29710649821867563, + "grad_norm": 2.5354918405939366, + "learning_rate": 4.034456686196798e-06, + "loss": 0.9795, + "step": 2825 + }, + { + "epoch": 0.29721166866052295, + "grad_norm": 3.4840884239739487, + "learning_rate": 4.033797531523034e-06, + "loss": 0.9938, + "step": 2826 + }, + { + "epoch": 0.2973168391023703, + "grad_norm": 2.9035094968158135, + "learning_rate": 4.033138205818931e-06, + "loss": 0.9971, + "step": 2827 + }, + { + "epoch": 0.2974220095442176, + "grad_norm": 2.388147959432155, + "learning_rate": 4.032478709158007e-06, + "loss": 0.9786, + "step": 2828 + }, + { + "epoch": 0.2975271799860649, + "grad_norm": 2.5043218419305657, + "learning_rate": 4.0318190416138024e-06, + "loss": 1.013, + "step": 2829 + }, + { + "epoch": 0.29763235042791225, + "grad_norm": 2.630395705298903, + "learning_rate": 4.031159203259876e-06, + "loss": 0.9978, + "step": 2830 + }, + { + "epoch": 0.2977375208697596, + "grad_norm": 3.048566956011636, + "learning_rate": 4.030499194169803e-06, + "loss": 1.022, + "step": 2831 + }, + { + "epoch": 0.2978426913116069, + "grad_norm": 2.5379425325683656, + "learning_rate": 4.029839014417181e-06, + "loss": 1.0029, + "step": 2832 + }, + { + "epoch": 0.29794786175345417, + "grad_norm": 2.6960366196246857, + "learning_rate": 4.029178664075625e-06, + "loss": 1.0183, + "step": 2833 + }, + { + "epoch": 0.2980530321953015, + "grad_norm": 3.224224287628243, + "learning_rate": 4.028518143218768e-06, + "loss": 1.073, + "step": 2834 + }, + { + "epoch": 0.2981582026371488, + "grad_norm": 2.429803063051489, + "learning_rate": 4.027857451920264e-06, + "loss": 0.9988, + "step": 2835 + }, + { + "epoch": 0.29826337307899614, + "grad_norm": 3.126241311853215, + "learning_rate": 4.027196590253786e-06, + "loss": 1.0351, + "step": 2836 + }, + { + "epoch": 0.29836854352084347, + "grad_norm": 2.3886966033688557, + "learning_rate": 4.026535558293024e-06, + "loss": 1.0146, + "step": 2837 + }, + { + "epoch": 0.2984737139626908, + "grad_norm": 2.1691315598429544, + "learning_rate": 4.025874356111689e-06, + "loss": 0.9821, + "step": 2838 + }, + { + "epoch": 0.2985788844045381, + "grad_norm": 3.2536165239458774, + "learning_rate": 4.025212983783511e-06, + "loss": 1.0239, + "step": 2839 + }, + { + "epoch": 0.29868405484638544, + "grad_norm": 2.361345102232033, + "learning_rate": 4.024551441382235e-06, + "loss": 1.0125, + "step": 2840 + }, + { + "epoch": 0.29878922528823276, + "grad_norm": 2.797329995864098, + "learning_rate": 4.023889728981631e-06, + "loss": 1.0278, + "step": 2841 + }, + { + "epoch": 0.2988943957300801, + "grad_norm": 2.053294094600003, + "learning_rate": 4.023227846655484e-06, + 
"loss": 0.9636, + "step": 2842 + }, + { + "epoch": 0.29899956617192736, + "grad_norm": 2.0068541001608, + "learning_rate": 4.0225657944776e-06, + "loss": 0.9924, + "step": 2843 + }, + { + "epoch": 0.2991047366137747, + "grad_norm": 3.0489588147562947, + "learning_rate": 4.021903572521802e-06, + "loss": 1.059, + "step": 2844 + }, + { + "epoch": 0.299209907055622, + "grad_norm": 2.416659465285543, + "learning_rate": 4.021241180861933e-06, + "loss": 1.0491, + "step": 2845 + }, + { + "epoch": 0.29931507749746933, + "grad_norm": 1.8189123954689044, + "learning_rate": 4.0205786195718545e-06, + "loss": 0.997, + "step": 2846 + }, + { + "epoch": 0.29942024793931665, + "grad_norm": 2.2393968626632126, + "learning_rate": 4.0199158887254484e-06, + "loss": 1.0253, + "step": 2847 + }, + { + "epoch": 0.299525418381164, + "grad_norm": 2.7113732338095438, + "learning_rate": 4.019252988396613e-06, + "loss": 0.9651, + "step": 2848 + }, + { + "epoch": 0.2996305888230113, + "grad_norm": 3.219458618141807, + "learning_rate": 4.018589918659267e-06, + "loss": 1.047, + "step": 2849 + }, + { + "epoch": 0.2997357592648586, + "grad_norm": 2.485215246867124, + "learning_rate": 4.01792667958735e-06, + "loss": 0.995, + "step": 2850 + }, + { + "epoch": 0.29984092970670595, + "grad_norm": 2.5877817382570276, + "learning_rate": 4.0172632712548145e-06, + "loss": 0.9823, + "step": 2851 + }, + { + "epoch": 0.2999461001485533, + "grad_norm": 2.8920656586696043, + "learning_rate": 4.016599693735639e-06, + "loss": 1.0172, + "step": 2852 + }, + { + "epoch": 0.30005127059040054, + "grad_norm": 2.1377145970196025, + "learning_rate": 4.015935947103816e-06, + "loss": 0.9841, + "step": 2853 + }, + { + "epoch": 0.30015644103224787, + "grad_norm": 2.357987310739265, + "learning_rate": 4.015272031433358e-06, + "loss": 0.9969, + "step": 2854 + }, + { + "epoch": 0.3002616114740952, + "grad_norm": 2.6065064666162927, + "learning_rate": 4.014607946798298e-06, + "loss": 0.9742, + "step": 2855 + }, + { + "epoch": 0.3003667819159425, + "grad_norm": 2.5201558670387443, + "learning_rate": 4.013943693272686e-06, + "loss": 0.9801, + "step": 2856 + }, + { + "epoch": 0.30047195235778984, + "grad_norm": 3.0654655143598166, + "learning_rate": 4.013279270930592e-06, + "loss": 1.0204, + "step": 2857 + }, + { + "epoch": 0.30057712279963716, + "grad_norm": 4.142146640718669, + "learning_rate": 4.012614679846103e-06, + "loss": 1.0169, + "step": 2858 + }, + { + "epoch": 0.3006822932414845, + "grad_norm": 1.7013966226721964, + "learning_rate": 4.011949920093327e-06, + "loss": 1.0133, + "step": 2859 + }, + { + "epoch": 0.3007874636833318, + "grad_norm": 2.3493733773934036, + "learning_rate": 4.0112849917463905e-06, + "loss": 1.0396, + "step": 2860 + }, + { + "epoch": 0.30089263412517914, + "grad_norm": 2.9932760739027167, + "learning_rate": 4.010619894879436e-06, + "loss": 1.0482, + "step": 2861 + }, + { + "epoch": 0.30099780456702646, + "grad_norm": 3.4913376390190747, + "learning_rate": 4.009954629566629e-06, + "loss": 1.0478, + "step": 2862 + }, + { + "epoch": 0.30110297500887373, + "grad_norm": 2.4522472564185223, + "learning_rate": 4.0092891958821515e-06, + "loss": 1.0164, + "step": 2863 + }, + { + "epoch": 0.30120814545072105, + "grad_norm": 2.7224329510455996, + "learning_rate": 4.0086235939002024e-06, + "loss": 1.0279, + "step": 2864 + }, + { + "epoch": 0.3013133158925684, + "grad_norm": 2.7001843459526396, + "learning_rate": 4.007957823695005e-06, + "loss": 1.0319, + "step": 2865 + }, + { + "epoch": 0.3014184863344157, + "grad_norm": 
2.9328100471570813, + "learning_rate": 4.007291885340796e-06, + "loss": 0.9669, + "step": 2866 + }, + { + "epoch": 0.301523656776263, + "grad_norm": 2.8953214503968705, + "learning_rate": 4.006625778911831e-06, + "loss": 0.9993, + "step": 2867 + }, + { + "epoch": 0.30162882721811035, + "grad_norm": 3.0833861909775333, + "learning_rate": 4.005959504482389e-06, + "loss": 0.9904, + "step": 2868 + }, + { + "epoch": 0.3017339976599577, + "grad_norm": 2.706249563006177, + "learning_rate": 4.005293062126764e-06, + "loss": 1.0137, + "step": 2869 + }, + { + "epoch": 0.301839168101805, + "grad_norm": 2.6523322513613996, + "learning_rate": 4.004626451919268e-06, + "loss": 1.022, + "step": 2870 + }, + { + "epoch": 0.3019443385436523, + "grad_norm": 2.363970677643061, + "learning_rate": 4.003959673934235e-06, + "loss": 0.9987, + "step": 2871 + }, + { + "epoch": 0.30204950898549965, + "grad_norm": 2.6706791504734824, + "learning_rate": 4.003292728246015e-06, + "loss": 1.0179, + "step": 2872 + }, + { + "epoch": 0.3021546794273469, + "grad_norm": 3.0479510186456977, + "learning_rate": 4.002625614928978e-06, + "loss": 0.956, + "step": 2873 + }, + { + "epoch": 0.30225984986919424, + "grad_norm": 1.8651319495727035, + "learning_rate": 4.001958334057512e-06, + "loss": 1.0283, + "step": 2874 + }, + { + "epoch": 0.30236502031104157, + "grad_norm": 2.065753591700377, + "learning_rate": 4.001290885706023e-06, + "loss": 0.9982, + "step": 2875 + }, + { + "epoch": 0.3024701907528889, + "grad_norm": 3.420963266297305, + "learning_rate": 4.0006232699489385e-06, + "loss": 0.9814, + "step": 2876 + }, + { + "epoch": 0.3025753611947362, + "grad_norm": 2.598806589411273, + "learning_rate": 3.9999554868607036e-06, + "loss": 1.0091, + "step": 2877 + }, + { + "epoch": 0.30268053163658354, + "grad_norm": 2.691137805944449, + "learning_rate": 3.999287536515778e-06, + "loss": 1.0782, + "step": 2878 + }, + { + "epoch": 0.30278570207843086, + "grad_norm": 2.5676564139134976, + "learning_rate": 3.998619418988646e-06, + "loss": 1.0563, + "step": 2879 + }, + { + "epoch": 0.3028908725202782, + "grad_norm": 2.5833344941215235, + "learning_rate": 3.997951134353808e-06, + "loss": 1.0403, + "step": 2880 + }, + { + "epoch": 0.3029960429621255, + "grad_norm": 2.1282219410524914, + "learning_rate": 3.99728268268578e-06, + "loss": 0.9861, + "step": 2881 + }, + { + "epoch": 0.30310121340397284, + "grad_norm": 3.4275643393055333, + "learning_rate": 3.996614064059104e-06, + "loss": 1.0173, + "step": 2882 + }, + { + "epoch": 0.30320638384582016, + "grad_norm": 2.2265143102649647, + "learning_rate": 3.995945278548331e-06, + "loss": 1.0226, + "step": 2883 + }, + { + "epoch": 0.30331155428766743, + "grad_norm": 2.8938023915675264, + "learning_rate": 3.99527632622804e-06, + "loss": 1.0185, + "step": 2884 + }, + { + "epoch": 0.30341672472951475, + "grad_norm": 2.854330050298019, + "learning_rate": 3.994607207172823e-06, + "loss": 1.0003, + "step": 2885 + }, + { + "epoch": 0.3035218951713621, + "grad_norm": 2.878769782756261, + "learning_rate": 3.993937921457292e-06, + "loss": 0.9944, + "step": 2886 + }, + { + "epoch": 0.3036270656132094, + "grad_norm": 2.8721086441370294, + "learning_rate": 3.993268469156077e-06, + "loss": 0.9825, + "step": 2887 + }, + { + "epoch": 0.3037322360550567, + "grad_norm": 2.2081747573437074, + "learning_rate": 3.992598850343827e-06, + "loss": 0.9873, + "step": 2888 + }, + { + "epoch": 0.30383740649690405, + "grad_norm": 2.9007140299444822, + "learning_rate": 3.991929065095211e-06, + "loss": 1.0232, + "step": 2889 + }, + { + 
"epoch": 0.3039425769387514, + "grad_norm": 1.3415032258707018, + "learning_rate": 3.991259113484916e-06, + "loss": 0.9871, + "step": 2890 + }, + { + "epoch": 0.3040477473805987, + "grad_norm": 2.7105364628823456, + "learning_rate": 3.990588995587643e-06, + "loss": 1.0259, + "step": 2891 + }, + { + "epoch": 0.304152917822446, + "grad_norm": 2.31954600362432, + "learning_rate": 3.989918711478118e-06, + "loss": 1.042, + "step": 2892 + }, + { + "epoch": 0.30425808826429335, + "grad_norm": 2.7954636771354346, + "learning_rate": 3.989248261231084e-06, + "loss": 1.0302, + "step": 2893 + }, + { + "epoch": 0.3043632587061406, + "grad_norm": 2.7537708698276813, + "learning_rate": 3.988577644921299e-06, + "loss": 1.0587, + "step": 2894 + }, + { + "epoch": 0.30446842914798794, + "grad_norm": 3.0962306715267935, + "learning_rate": 3.9879068626235425e-06, + "loss": 0.9759, + "step": 2895 + }, + { + "epoch": 0.30457359958983526, + "grad_norm": 2.3957225896167746, + "learning_rate": 3.987235914412614e-06, + "loss": 1.0023, + "step": 2896 + }, + { + "epoch": 0.3046787700316826, + "grad_norm": 2.771932833415502, + "learning_rate": 3.986564800363326e-06, + "loss": 1.0512, + "step": 2897 + }, + { + "epoch": 0.3047839404735299, + "grad_norm": 2.3606896051469537, + "learning_rate": 3.985893520550516e-06, + "loss": 1.0107, + "step": 2898 + }, + { + "epoch": 0.30488911091537724, + "grad_norm": 2.4174906103885943, + "learning_rate": 3.985222075049035e-06, + "loss": 0.9982, + "step": 2899 + }, + { + "epoch": 0.30499428135722456, + "grad_norm": 1.6550177073693109, + "learning_rate": 3.984550463933754e-06, + "loss": 0.9569, + "step": 2900 + }, + { + "epoch": 0.3050994517990719, + "grad_norm": 3.4294334384914213, + "learning_rate": 3.983878687279565e-06, + "loss": 1.0138, + "step": 2901 + }, + { + "epoch": 0.3052046222409192, + "grad_norm": 2.56368532622028, + "learning_rate": 3.9832067451613755e-06, + "loss": 1.0517, + "step": 2902 + }, + { + "epoch": 0.30530979268276653, + "grad_norm": 2.8083843993980953, + "learning_rate": 3.982534637654112e-06, + "loss": 0.9687, + "step": 2903 + }, + { + "epoch": 0.3054149631246138, + "grad_norm": 2.7297876966125156, + "learning_rate": 3.981862364832718e-06, + "loss": 1.014, + "step": 2904 + }, + { + "epoch": 0.3055201335664611, + "grad_norm": 2.103857181645903, + "learning_rate": 3.981189926772161e-06, + "loss": 0.9635, + "step": 2905 + }, + { + "epoch": 0.30562530400830845, + "grad_norm": 2.5474502602534175, + "learning_rate": 3.980517323547419e-06, + "loss": 1.0005, + "step": 2906 + }, + { + "epoch": 0.3057304744501558, + "grad_norm": 2.4631842991491637, + "learning_rate": 3.979844555233496e-06, + "loss": 1.0259, + "step": 2907 + }, + { + "epoch": 0.3058356448920031, + "grad_norm": 3.829976049615588, + "learning_rate": 3.979171621905409e-06, + "loss": 1.0053, + "step": 2908 + }, + { + "epoch": 0.3059408153338504, + "grad_norm": 3.0337382004641404, + "learning_rate": 3.978498523638194e-06, + "loss": 1.0447, + "step": 2909 + }, + { + "epoch": 0.30604598577569775, + "grad_norm": 2.4115719177112105, + "learning_rate": 3.97782526050691e-06, + "loss": 0.9999, + "step": 2910 + }, + { + "epoch": 0.3061511562175451, + "grad_norm": 2.5527608779229696, + "learning_rate": 3.977151832586628e-06, + "loss": 1.0281, + "step": 2911 + }, + { + "epoch": 0.3062563266593924, + "grad_norm": 2.0029391608583262, + "learning_rate": 3.976478239952444e-06, + "loss": 1.0366, + "step": 2912 + }, + { + "epoch": 0.3063614971012397, + "grad_norm": 2.447326314753849, + "learning_rate": 3.975804482679464e-06, 
+ "loss": 1.0104, + "step": 2913 + }, + { + "epoch": 0.306466667543087, + "grad_norm": 2.2765266252277203, + "learning_rate": 3.975130560842821e-06, + "loss": 0.979, + "step": 2914 + }, + { + "epoch": 0.3065718379849343, + "grad_norm": 2.615082713312339, + "learning_rate": 3.974456474517661e-06, + "loss": 1.0355, + "step": 2915 + }, + { + "epoch": 0.30667700842678164, + "grad_norm": 2.380276345939876, + "learning_rate": 3.97378222377915e-06, + "loss": 1.0034, + "step": 2916 + }, + { + "epoch": 0.30678217886862896, + "grad_norm": 2.3073048459869168, + "learning_rate": 3.973107808702472e-06, + "loss": 1.0268, + "step": 2917 + }, + { + "epoch": 0.3068873493104763, + "grad_norm": 1.8941823202478179, + "learning_rate": 3.9724332293628295e-06, + "loss": 1.0306, + "step": 2918 + }, + { + "epoch": 0.3069925197523236, + "grad_norm": 2.896280116503032, + "learning_rate": 3.9717584858354454e-06, + "loss": 1.008, + "step": 2919 + }, + { + "epoch": 0.30709769019417094, + "grad_norm": 2.7180851301598103, + "learning_rate": 3.971083578195556e-06, + "loss": 1.0215, + "step": 2920 + }, + { + "epoch": 0.30720286063601826, + "grad_norm": 2.0401389712432487, + "learning_rate": 3.970408506518419e-06, + "loss": 0.9842, + "step": 2921 + }, + { + "epoch": 0.3073080310778656, + "grad_norm": 2.3667558391152426, + "learning_rate": 3.969733270879313e-06, + "loss": 0.9797, + "step": 2922 + }, + { + "epoch": 0.3074132015197129, + "grad_norm": 3.307599231062686, + "learning_rate": 3.969057871353529e-06, + "loss": 0.9895, + "step": 2923 + }, + { + "epoch": 0.3075183719615602, + "grad_norm": 2.8646693785344555, + "learning_rate": 3.96838230801638e-06, + "loss": 0.9618, + "step": 2924 + }, + { + "epoch": 0.3076235424034075, + "grad_norm": 2.2316449558106974, + "learning_rate": 3.967706580943197e-06, + "loss": 1.0306, + "step": 2925 + }, + { + "epoch": 0.3077287128452548, + "grad_norm": 2.771790047908696, + "learning_rate": 3.9670306902093286e-06, + "loss": 1.0021, + "step": 2926 + }, + { + "epoch": 0.30783388328710215, + "grad_norm": 2.7193389805440207, + "learning_rate": 3.9663546358901415e-06, + "loss": 0.9975, + "step": 2927 + }, + { + "epoch": 0.3079390537289495, + "grad_norm": 2.174531390393959, + "learning_rate": 3.965678418061023e-06, + "loss": 1.0174, + "step": 2928 + }, + { + "epoch": 0.3080442241707968, + "grad_norm": 2.473148623762536, + "learning_rate": 3.965002036797374e-06, + "loss": 0.9751, + "step": 2929 + }, + { + "epoch": 0.3081493946126441, + "grad_norm": 2.523481125448685, + "learning_rate": 3.9643254921746176e-06, + "loss": 1.0486, + "step": 2930 + }, + { + "epoch": 0.30825456505449145, + "grad_norm": 2.4231464073038103, + "learning_rate": 3.963648784268193e-06, + "loss": 1.0308, + "step": 2931 + }, + { + "epoch": 0.30835973549633877, + "grad_norm": 2.908693220754445, + "learning_rate": 3.9629719131535595e-06, + "loss": 1.019, + "step": 2932 + }, + { + "epoch": 0.3084649059381861, + "grad_norm": 2.831569138155086, + "learning_rate": 3.9622948789061935e-06, + "loss": 1.0282, + "step": 2933 + }, + { + "epoch": 0.30857007638003336, + "grad_norm": 2.1822044513817533, + "learning_rate": 3.961617681601588e-06, + "loss": 1.0471, + "step": 2934 + }, + { + "epoch": 0.3086752468218807, + "grad_norm": 1.968984775871048, + "learning_rate": 3.960940321315257e-06, + "loss": 0.9938, + "step": 2935 + }, + { + "epoch": 0.308780417263728, + "grad_norm": 3.0742676631782015, + "learning_rate": 3.960262798122731e-06, + "loss": 1.0384, + "step": 2936 + }, + { + "epoch": 0.30888558770557534, + "grad_norm": 
1.7715236826173646, + "learning_rate": 3.95958511209956e-06, + "loss": 1.017, + "step": 2937 + }, + { + "epoch": 0.30899075814742266, + "grad_norm": 2.6875698356975297, + "learning_rate": 3.95890726332131e-06, + "loss": 1.0554, + "step": 2938 + }, + { + "epoch": 0.30909592858927, + "grad_norm": 1.9619216486997688, + "learning_rate": 3.958229251863567e-06, + "loss": 0.9878, + "step": 2939 + }, + { + "epoch": 0.3092010990311173, + "grad_norm": 2.7541620784435406, + "learning_rate": 3.957551077801935e-06, + "loss": 0.9557, + "step": 2940 + }, + { + "epoch": 0.30930626947296463, + "grad_norm": 1.8909424032199753, + "learning_rate": 3.956872741212035e-06, + "loss": 1.0064, + "step": 2941 + }, + { + "epoch": 0.30941143991481196, + "grad_norm": 2.623868633279091, + "learning_rate": 3.956194242169506e-06, + "loss": 1.0086, + "step": 2942 + }, + { + "epoch": 0.3095166103566593, + "grad_norm": 3.8473610355629195, + "learning_rate": 3.955515580750008e-06, + "loss": 1.0438, + "step": 2943 + }, + { + "epoch": 0.3096217807985066, + "grad_norm": 3.550700597509594, + "learning_rate": 3.954836757029214e-06, + "loss": 1.0497, + "step": 2944 + }, + { + "epoch": 0.3097269512403539, + "grad_norm": 2.9530235516637013, + "learning_rate": 3.9541577710828225e-06, + "loss": 1.0073, + "step": 2945 + }, + { + "epoch": 0.3098321216822012, + "grad_norm": 2.5132191578181384, + "learning_rate": 3.953478622986542e-06, + "loss": 1.0157, + "step": 2946 + }, + { + "epoch": 0.3099372921240485, + "grad_norm": 2.815147625525899, + "learning_rate": 3.952799312816105e-06, + "loss": 1.0341, + "step": 2947 + }, + { + "epoch": 0.31004246256589585, + "grad_norm": 2.3437305046069077, + "learning_rate": 3.9521198406472575e-06, + "loss": 1.0106, + "step": 2948 + }, + { + "epoch": 0.3101476330077432, + "grad_norm": 2.7478630820842724, + "learning_rate": 3.9514402065557675e-06, + "loss": 0.9561, + "step": 2949 + }, + { + "epoch": 0.3102528034495905, + "grad_norm": 3.1039712218296445, + "learning_rate": 3.950760410617421e-06, + "loss": 1.0222, + "step": 2950 + }, + { + "epoch": 0.3103579738914378, + "grad_norm": 2.758211476967507, + "learning_rate": 3.950080452908016e-06, + "loss": 0.9854, + "step": 2951 + }, + { + "epoch": 0.31046314433328515, + "grad_norm": 3.118482777087128, + "learning_rate": 3.949400333503378e-06, + "loss": 1.0438, + "step": 2952 + }, + { + "epoch": 0.31056831477513247, + "grad_norm": 2.3959007914603077, + "learning_rate": 3.948720052479343e-06, + "loss": 1.002, + "step": 2953 + }, + { + "epoch": 0.3106734852169798, + "grad_norm": 1.9575742067090993, + "learning_rate": 3.948039609911768e-06, + "loss": 0.9846, + "step": 2954 + }, + { + "epoch": 0.31077865565882706, + "grad_norm": 2.218966006005466, + "learning_rate": 3.947359005876527e-06, + "loss": 1.0369, + "step": 2955 + }, + { + "epoch": 0.3108838261006744, + "grad_norm": 2.957402283333697, + "learning_rate": 3.946678240449515e-06, + "loss": 0.9891, + "step": 2956 + }, + { + "epoch": 0.3109889965425217, + "grad_norm": 2.6314748348639805, + "learning_rate": 3.94599731370664e-06, + "loss": 1.0428, + "step": 2957 + }, + { + "epoch": 0.31109416698436904, + "grad_norm": 2.7377714026833866, + "learning_rate": 3.945316225723832e-06, + "loss": 1.035, + "step": 2958 + }, + { + "epoch": 0.31119933742621636, + "grad_norm": 2.053832855562837, + "learning_rate": 3.944634976577036e-06, + "loss": 0.9603, + "step": 2959 + }, + { + "epoch": 0.3113045078680637, + "grad_norm": 3.1158025154221125, + "learning_rate": 3.943953566342219e-06, + "loss": 0.9998, + "step": 2960 + }, + { + 
"epoch": 0.311409678309911, + "grad_norm": 2.396662606226732, + "learning_rate": 3.9432719950953625e-06, + "loss": 1.003, + "step": 2961 + }, + { + "epoch": 0.31151484875175833, + "grad_norm": 2.477520494380087, + "learning_rate": 3.942590262912466e-06, + "loss": 1.0024, + "step": 2962 + }, + { + "epoch": 0.31162001919360566, + "grad_norm": 2.966774835153069, + "learning_rate": 3.941908369869549e-06, + "loss": 1.024, + "step": 2963 + }, + { + "epoch": 0.311725189635453, + "grad_norm": 2.3612255349598286, + "learning_rate": 3.941226316042648e-06, + "loss": 0.9814, + "step": 2964 + }, + { + "epoch": 0.31183036007730025, + "grad_norm": 3.0645177496274334, + "learning_rate": 3.940544101507817e-06, + "loss": 0.9985, + "step": 2965 + }, + { + "epoch": 0.3119355305191476, + "grad_norm": 2.695073034424229, + "learning_rate": 3.939861726341128e-06, + "loss": 0.9781, + "step": 2966 + }, + { + "epoch": 0.3120407009609949, + "grad_norm": 2.731332674190935, + "learning_rate": 3.939179190618671e-06, + "loss": 0.9987, + "step": 2967 + }, + { + "epoch": 0.3121458714028422, + "grad_norm": 1.620215381519614, + "learning_rate": 3.938496494416554e-06, + "loss": 0.9743, + "step": 2968 + }, + { + "epoch": 0.31225104184468955, + "grad_norm": 2.3891831524051295, + "learning_rate": 3.937813637810904e-06, + "loss": 1.0118, + "step": 2969 + }, + { + "epoch": 0.31235621228653687, + "grad_norm": 2.2376707736140067, + "learning_rate": 3.937130620877863e-06, + "loss": 0.9824, + "step": 2970 + }, + { + "epoch": 0.3124613827283842, + "grad_norm": 2.4199521650953866, + "learning_rate": 3.936447443693595e-06, + "loss": 0.9938, + "step": 2971 + }, + { + "epoch": 0.3125665531702315, + "grad_norm": 2.3031295788347226, + "learning_rate": 3.935764106334278e-06, + "loss": 1.0102, + "step": 2972 + }, + { + "epoch": 0.31267172361207884, + "grad_norm": 3.5924980543328506, + "learning_rate": 3.935080608876109e-06, + "loss": 0.9912, + "step": 2973 + }, + { + "epoch": 0.31277689405392617, + "grad_norm": 2.8358224243128154, + "learning_rate": 3.934396951395305e-06, + "loss": 1.004, + "step": 2974 + }, + { + "epoch": 0.31288206449577344, + "grad_norm": 2.094776426203808, + "learning_rate": 3.933713133968097e-06, + "loss": 0.96, + "step": 2975 + }, + { + "epoch": 0.31298723493762076, + "grad_norm": 2.8795819218722722, + "learning_rate": 3.933029156670738e-06, + "loss": 0.99, + "step": 2976 + }, + { + "epoch": 0.3130924053794681, + "grad_norm": 3.1871471127409756, + "learning_rate": 3.9323450195794954e-06, + "loss": 0.9891, + "step": 2977 + }, + { + "epoch": 0.3131975758213154, + "grad_norm": 2.573983278943729, + "learning_rate": 3.9316607227706564e-06, + "loss": 0.995, + "step": 2978 + }, + { + "epoch": 0.31330274626316273, + "grad_norm": 2.4060298355716445, + "learning_rate": 3.930976266320525e-06, + "loss": 1.0275, + "step": 2979 + }, + { + "epoch": 0.31340791670501006, + "grad_norm": 2.4230905246024435, + "learning_rate": 3.930291650305424e-06, + "loss": 0.997, + "step": 2980 + }, + { + "epoch": 0.3135130871468574, + "grad_norm": 1.8561383044323279, + "learning_rate": 3.929606874801694e-06, + "loss": 0.9847, + "step": 2981 + }, + { + "epoch": 0.3136182575887047, + "grad_norm": 2.549475061635563, + "learning_rate": 3.9289219398856905e-06, + "loss": 0.9506, + "step": 2982 + }, + { + "epoch": 0.31372342803055203, + "grad_norm": 2.220462170609673, + "learning_rate": 3.928236845633791e-06, + "loss": 1.0019, + "step": 2983 + }, + { + "epoch": 0.31382859847239936, + "grad_norm": 2.856787147753265, + "learning_rate": 3.927551592122389e-06, + 
"loss": 1.0285, + "step": 2984 + }, + { + "epoch": 0.3139337689142466, + "grad_norm": 3.083922842525906, + "learning_rate": 3.926866179427894e-06, + "loss": 1.0602, + "step": 2985 + }, + { + "epoch": 0.31403893935609395, + "grad_norm": 2.597249942858582, + "learning_rate": 3.926180607626735e-06, + "loss": 1.0163, + "step": 2986 + }, + { + "epoch": 0.3141441097979413, + "grad_norm": 2.3456341225046278, + "learning_rate": 3.92549487679536e-06, + "loss": 1.0348, + "step": 2987 + }, + { + "epoch": 0.3142492802397886, + "grad_norm": 3.092563769825593, + "learning_rate": 3.924808987010234e-06, + "loss": 1.0002, + "step": 2988 + }, + { + "epoch": 0.3143544506816359, + "grad_norm": 2.1068768689816983, + "learning_rate": 3.924122938347835e-06, + "loss": 0.994, + "step": 2989 + }, + { + "epoch": 0.31445962112348325, + "grad_norm": 2.5596294401722846, + "learning_rate": 3.923436730884668e-06, + "loss": 1.0405, + "step": 2990 + }, + { + "epoch": 0.31456479156533057, + "grad_norm": 2.5979688517209687, + "learning_rate": 3.922750364697246e-06, + "loss": 1.0104, + "step": 2991 + }, + { + "epoch": 0.3146699620071779, + "grad_norm": 3.0395592306666175, + "learning_rate": 3.922063839862107e-06, + "loss": 1.0278, + "step": 2992 + }, + { + "epoch": 0.3147751324490252, + "grad_norm": 3.095491872011433, + "learning_rate": 3.921377156455802e-06, + "loss": 1.0113, + "step": 2993 + }, + { + "epoch": 0.31488030289087254, + "grad_norm": 3.2475174637972857, + "learning_rate": 3.920690314554903e-06, + "loss": 1.0456, + "step": 2994 + }, + { + "epoch": 0.3149854733327198, + "grad_norm": 2.4088622405130926, + "learning_rate": 3.9200033142359975e-06, + "loss": 1.0592, + "step": 2995 + }, + { + "epoch": 0.31509064377456714, + "grad_norm": 2.71257741566834, + "learning_rate": 3.919316155575692e-06, + "loss": 1.0116, + "step": 2996 + }, + { + "epoch": 0.31519581421641446, + "grad_norm": 2.450376163606724, + "learning_rate": 3.918628838650609e-06, + "loss": 1.0195, + "step": 2997 + }, + { + "epoch": 0.3153009846582618, + "grad_norm": 3.3118566816217427, + "learning_rate": 3.9179413635373895e-06, + "loss": 0.968, + "step": 2998 + }, + { + "epoch": 0.3154061551001091, + "grad_norm": 2.242881590172938, + "learning_rate": 3.917253730312694e-06, + "loss": 0.9813, + "step": 2999 + }, + { + "epoch": 0.31551132554195643, + "grad_norm": 2.427670249904579, + "learning_rate": 3.916565939053198e-06, + "loss": 1.018, + "step": 3000 + }, + { + "epoch": 0.31561649598380376, + "grad_norm": 3.0529942785276436, + "learning_rate": 3.915877989835595e-06, + "loss": 1.03, + "step": 3001 + }, + { + "epoch": 0.3157216664256511, + "grad_norm": 2.5578272107417317, + "learning_rate": 3.915189882736597e-06, + "loss": 1.0433, + "step": 3002 + }, + { + "epoch": 0.3158268368674984, + "grad_norm": 2.585770865143952, + "learning_rate": 3.914501617832935e-06, + "loss": 1.0069, + "step": 3003 + }, + { + "epoch": 0.31593200730934573, + "grad_norm": 3.2812648373996893, + "learning_rate": 3.9138131952013535e-06, + "loss": 1.0027, + "step": 3004 + }, + { + "epoch": 0.31603717775119305, + "grad_norm": 3.3571648099609654, + "learning_rate": 3.913124614918617e-06, + "loss": 1.0312, + "step": 3005 + }, + { + "epoch": 0.3161423481930403, + "grad_norm": 1.7227150762858836, + "learning_rate": 3.9124358770615094e-06, + "loss": 0.9955, + "step": 3006 + }, + { + "epoch": 0.31624751863488765, + "grad_norm": 3.5117489508527022, + "learning_rate": 3.911746981706829e-06, + "loss": 1.0127, + "step": 3007 + }, + { + "epoch": 0.31635268907673497, + "grad_norm": 2.651233249155203, 
+ "learning_rate": 3.911057928931394e-06, + "loss": 0.9847, + "step": 3008 + }, + { + "epoch": 0.3164578595185823, + "grad_norm": 3.406708098715362, + "learning_rate": 3.910368718812037e-06, + "loss": 1.0204, + "step": 3009 + }, + { + "epoch": 0.3165630299604296, + "grad_norm": 2.6490697167241666, + "learning_rate": 3.909679351425612e-06, + "loss": 1.0232, + "step": 3010 + }, + { + "epoch": 0.31666820040227694, + "grad_norm": 3.0367731314433857, + "learning_rate": 3.90898982684899e-06, + "loss": 1.0102, + "step": 3011 + }, + { + "epoch": 0.31677337084412427, + "grad_norm": 1.99254564411588, + "learning_rate": 3.908300145159055e-06, + "loss": 0.9834, + "step": 3012 + }, + { + "epoch": 0.3168785412859716, + "grad_norm": 3.2342199206410482, + "learning_rate": 3.907610306432716e-06, + "loss": 0.9951, + "step": 3013 + }, + { + "epoch": 0.3169837117278189, + "grad_norm": 2.322687279586599, + "learning_rate": 3.906920310746891e-06, + "loss": 1.0255, + "step": 3014 + }, + { + "epoch": 0.31708888216966624, + "grad_norm": 2.2910797838024104, + "learning_rate": 3.906230158178523e-06, + "loss": 1.0091, + "step": 3015 + }, + { + "epoch": 0.3171940526115135, + "grad_norm": 2.5171087691012386, + "learning_rate": 3.9055398488045685e-06, + "loss": 0.935, + "step": 3016 + }, + { + "epoch": 0.31729922305336083, + "grad_norm": 1.9921759243129207, + "learning_rate": 3.9048493827020015e-06, + "loss": 1.0218, + "step": 3017 + }, + { + "epoch": 0.31740439349520816, + "grad_norm": 2.1147110832212928, + "learning_rate": 3.904158759947816e-06, + "loss": 1.0058, + "step": 3018 + }, + { + "epoch": 0.3175095639370555, + "grad_norm": 3.0475021542334555, + "learning_rate": 3.9034679806190204e-06, + "loss": 1.0187, + "step": 3019 + }, + { + "epoch": 0.3176147343789028, + "grad_norm": 2.745517766757522, + "learning_rate": 3.902777044792642e-06, + "loss": 1.0115, + "step": 3020 + }, + { + "epoch": 0.31771990482075013, + "grad_norm": 1.5206699032468878, + "learning_rate": 3.902085952545726e-06, + "loss": 0.9619, + "step": 3021 + }, + { + "epoch": 0.31782507526259746, + "grad_norm": 2.4452139648992395, + "learning_rate": 3.901394703955335e-06, + "loss": 0.9903, + "step": 3022 + }, + { + "epoch": 0.3179302457044448, + "grad_norm": 2.5184421966703594, + "learning_rate": 3.900703299098548e-06, + "loss": 0.9858, + "step": 3023 + }, + { + "epoch": 0.3180354161462921, + "grad_norm": 1.9652874285496023, + "learning_rate": 3.900011738052463e-06, + "loss": 1.0252, + "step": 3024 + }, + { + "epoch": 0.31814058658813943, + "grad_norm": 2.392255529966947, + "learning_rate": 3.899320020894192e-06, + "loss": 1.0227, + "step": 3025 + }, + { + "epoch": 0.3182457570299867, + "grad_norm": 2.1611953399013153, + "learning_rate": 3.898628147700869e-06, + "loss": 0.9738, + "step": 3026 + }, + { + "epoch": 0.318350927471834, + "grad_norm": 2.2430395870179063, + "learning_rate": 3.8979361185496426e-06, + "loss": 0.9749, + "step": 3027 + }, + { + "epoch": 0.31845609791368135, + "grad_norm": 3.541913920661668, + "learning_rate": 3.897243933517679e-06, + "loss": 1.0152, + "step": 3028 + }, + { + "epoch": 0.31856126835552867, + "grad_norm": 1.7600013581050258, + "learning_rate": 3.896551592682164e-06, + "loss": 0.9925, + "step": 3029 + }, + { + "epoch": 0.318666438797376, + "grad_norm": 3.0382688857854387, + "learning_rate": 3.895859096120296e-06, + "loss": 1.005, + "step": 3030 + }, + { + "epoch": 0.3187716092392233, + "grad_norm": 1.7559359887388504, + "learning_rate": 3.8951664439092966e-06, + "loss": 1.0027, + "step": 3031 + }, + { + "epoch": 
0.31887677968107064, + "grad_norm": 2.41513756248943, + "learning_rate": 3.8944736361263996e-06, + "loss": 1.0058, + "step": 3032 + }, + { + "epoch": 0.31898195012291797, + "grad_norm": 3.824958571683499, + "learning_rate": 3.89378067284886e-06, + "loss": 0.9754, + "step": 3033 + }, + { + "epoch": 0.3190871205647653, + "grad_norm": 2.8869062987377174, + "learning_rate": 3.893087554153948e-06, + "loss": 0.974, + "step": 3034 + }, + { + "epoch": 0.3191922910066126, + "grad_norm": 3.3376996294364782, + "learning_rate": 3.892394280118952e-06, + "loss": 1.0322, + "step": 3035 + }, + { + "epoch": 0.3192974614484599, + "grad_norm": 2.1493242739855316, + "learning_rate": 3.891700850821178e-06, + "loss": 0.9793, + "step": 3036 + }, + { + "epoch": 0.3194026318903072, + "grad_norm": 2.0600928702389694, + "learning_rate": 3.891007266337947e-06, + "loss": 0.9667, + "step": 3037 + }, + { + "epoch": 0.31950780233215453, + "grad_norm": 2.422518969459341, + "learning_rate": 3.8903135267466e-06, + "loss": 1.0283, + "step": 3038 + }, + { + "epoch": 0.31961297277400186, + "grad_norm": 2.7095503103612764, + "learning_rate": 3.889619632124495e-06, + "loss": 1.0291, + "step": 3039 + }, + { + "epoch": 0.3197181432158492, + "grad_norm": 2.6136848210226833, + "learning_rate": 3.888925582549006e-06, + "loss": 1.0488, + "step": 3040 + }, + { + "epoch": 0.3198233136576965, + "grad_norm": 3.3309108315349962, + "learning_rate": 3.888231378097525e-06, + "loss": 1.0006, + "step": 3041 + }, + { + "epoch": 0.31992848409954383, + "grad_norm": 2.9848323149363716, + "learning_rate": 3.8875370188474606e-06, + "loss": 1.0346, + "step": 3042 + }, + { + "epoch": 0.32003365454139115, + "grad_norm": 1.8815097278144668, + "learning_rate": 3.88684250487624e-06, + "loss": 0.9791, + "step": 3043 + }, + { + "epoch": 0.3201388249832385, + "grad_norm": 2.975369543509321, + "learning_rate": 3.886147836261307e-06, + "loss": 1.0097, + "step": 3044 + }, + { + "epoch": 0.3202439954250858, + "grad_norm": 1.9729548982996215, + "learning_rate": 3.8854530130801226e-06, + "loss": 1.0049, + "step": 3045 + }, + { + "epoch": 0.32034916586693307, + "grad_norm": 2.964599401521195, + "learning_rate": 3.884758035410165e-06, + "loss": 0.9898, + "step": 3046 + }, + { + "epoch": 0.3204543363087804, + "grad_norm": 2.240600934404886, + "learning_rate": 3.884062903328929e-06, + "loss": 0.9923, + "step": 3047 + }, + { + "epoch": 0.3205595067506277, + "grad_norm": 2.598966467579801, + "learning_rate": 3.883367616913927e-06, + "loss": 1.0003, + "step": 3048 + }, + { + "epoch": 0.32066467719247504, + "grad_norm": 2.2641338646409883, + "learning_rate": 3.8826721762426885e-06, + "loss": 1.002, + "step": 3049 + }, + { + "epoch": 0.32076984763432237, + "grad_norm": 2.2997383920918897, + "learning_rate": 3.881976581392763e-06, + "loss": 1.0243, + "step": 3050 + }, + { + "epoch": 0.3208750180761697, + "grad_norm": 2.1550910823149136, + "learning_rate": 3.881280832441712e-06, + "loss": 0.991, + "step": 3051 + }, + { + "epoch": 0.320980188518017, + "grad_norm": 2.855557332734093, + "learning_rate": 3.880584929467119e-06, + "loss": 0.9982, + "step": 3052 + }, + { + "epoch": 0.32108535895986434, + "grad_norm": 3.5691659687435897, + "learning_rate": 3.879888872546581e-06, + "loss": 1.0354, + "step": 3053 + }, + { + "epoch": 0.32119052940171167, + "grad_norm": 2.2916019493138817, + "learning_rate": 3.879192661757715e-06, + "loss": 0.9934, + "step": 3054 + }, + { + "epoch": 0.321295699843559, + "grad_norm": 2.473847967975587, + "learning_rate": 3.878496297178151e-06, + "loss": 
1.0425, + "step": 3055 + }, + { + "epoch": 0.32140087028540626, + "grad_norm": 3.677982616082223, + "learning_rate": 3.8777997788855435e-06, + "loss": 0.9895, + "step": 3056 + }, + { + "epoch": 0.3215060407272536, + "grad_norm": 2.7264358103840376, + "learning_rate": 3.877103106957559e-06, + "loss": 1.0241, + "step": 3057 + }, + { + "epoch": 0.3216112111691009, + "grad_norm": 2.2211588348254185, + "learning_rate": 3.876406281471877e-06, + "loss": 1.0182, + "step": 3058 + }, + { + "epoch": 0.32171638161094823, + "grad_norm": 3.091295524354873, + "learning_rate": 3.875709302506204e-06, + "loss": 1.0349, + "step": 3059 + }, + { + "epoch": 0.32182155205279556, + "grad_norm": 2.236128132871001, + "learning_rate": 3.875012170138256e-06, + "loss": 0.9954, + "step": 3060 + }, + { + "epoch": 0.3219267224946429, + "grad_norm": 2.66801073802577, + "learning_rate": 3.874314884445771e-06, + "loss": 0.9817, + "step": 3061 + }, + { + "epoch": 0.3220318929364902, + "grad_norm": 2.6169139547610816, + "learning_rate": 3.873617445506499e-06, + "loss": 1.0343, + "step": 3062 + }, + { + "epoch": 0.32213706337833753, + "grad_norm": 2.709426123429328, + "learning_rate": 3.872919853398212e-06, + "loss": 1.0071, + "step": 3063 + }, + { + "epoch": 0.32224223382018485, + "grad_norm": 2.299843384495024, + "learning_rate": 3.872222108198696e-06, + "loss": 1.0012, + "step": 3064 + }, + { + "epoch": 0.3223474042620322, + "grad_norm": 2.636252219736122, + "learning_rate": 3.871524209985755e-06, + "loss": 1.0427, + "step": 3065 + }, + { + "epoch": 0.3224525747038795, + "grad_norm": 3.117900115113946, + "learning_rate": 3.87082615883721e-06, + "loss": 1.0041, + "step": 3066 + }, + { + "epoch": 0.32255774514572677, + "grad_norm": 2.2357890840611465, + "learning_rate": 3.8701279548309e-06, + "loss": 0.9666, + "step": 3067 + }, + { + "epoch": 0.3226629155875741, + "grad_norm": 2.566115912683812, + "learning_rate": 3.869429598044679e-06, + "loss": 1.0166, + "step": 3068 + }, + { + "epoch": 0.3227680860294214, + "grad_norm": 2.7172969124996165, + "learning_rate": 3.868731088556419e-06, + "loss": 1.0099, + "step": 3069 + }, + { + "epoch": 0.32287325647126874, + "grad_norm": 1.5483818724179716, + "learning_rate": 3.868032426444012e-06, + "loss": 0.9894, + "step": 3070 + }, + { + "epoch": 0.32297842691311607, + "grad_norm": 2.318650017542439, + "learning_rate": 3.867333611785361e-06, + "loss": 1.05, + "step": 3071 + }, + { + "epoch": 0.3230835973549634, + "grad_norm": 1.917805124510381, + "learning_rate": 3.86663464465839e-06, + "loss": 0.9977, + "step": 3072 + }, + { + "epoch": 0.3231887677968107, + "grad_norm": 2.3161890326941275, + "learning_rate": 3.86593552514104e-06, + "loss": 1.003, + "step": 3073 + }, + { + "epoch": 0.32329393823865804, + "grad_norm": 2.1693043396128093, + "learning_rate": 3.865236253311268e-06, + "loss": 0.9695, + "step": 3074 + }, + { + "epoch": 0.32339910868050536, + "grad_norm": 2.8668301075021914, + "learning_rate": 3.864536829247049e-06, + "loss": 0.9888, + "step": 3075 + }, + { + "epoch": 0.3235042791223527, + "grad_norm": 2.4444547149619296, + "learning_rate": 3.863837253026372e-06, + "loss": 1.0067, + "step": 3076 + }, + { + "epoch": 0.32360944956419996, + "grad_norm": 3.3217478770609925, + "learning_rate": 3.863137524727248e-06, + "loss": 1.018, + "step": 3077 + }, + { + "epoch": 0.3237146200060473, + "grad_norm": 3.375894933254088, + "learning_rate": 3.862437644427699e-06, + "loss": 1.0314, + "step": 3078 + }, + { + "epoch": 0.3238197904478946, + "grad_norm": 3.1024834482299135, + 
"learning_rate": 3.86173761220577e-06, + "loss": 0.9857, + "step": 3079 + }, + { + "epoch": 0.32392496088974193, + "grad_norm": 1.8859603281941375, + "learning_rate": 3.8610374281395205e-06, + "loss": 1.0021, + "step": 3080 + }, + { + "epoch": 0.32403013133158926, + "grad_norm": 2.181930740279361, + "learning_rate": 3.860337092307023e-06, + "loss": 1.0024, + "step": 3081 + }, + { + "epoch": 0.3241353017734366, + "grad_norm": 2.2394462965094806, + "learning_rate": 3.859636604786372e-06, + "loss": 1.0499, + "step": 3082 + }, + { + "epoch": 0.3242404722152839, + "grad_norm": 2.429116428790114, + "learning_rate": 3.8589359656556775e-06, + "loss": 1.0016, + "step": 3083 + }, + { + "epoch": 0.32434564265713123, + "grad_norm": 3.2486086179267697, + "learning_rate": 3.858235174993067e-06, + "loss": 0.9778, + "step": 3084 + }, + { + "epoch": 0.32445081309897855, + "grad_norm": 2.168312151384131, + "learning_rate": 3.857534232876684e-06, + "loss": 0.9809, + "step": 3085 + }, + { + "epoch": 0.3245559835408259, + "grad_norm": 2.894850817884378, + "learning_rate": 3.856833139384687e-06, + "loss": 1.0092, + "step": 3086 + }, + { + "epoch": 0.32466115398267315, + "grad_norm": 3.1973922575609697, + "learning_rate": 3.856131894595255e-06, + "loss": 1.0425, + "step": 3087 + }, + { + "epoch": 0.32476632442452047, + "grad_norm": 2.6786962321285883, + "learning_rate": 3.855430498586582e-06, + "loss": 0.9848, + "step": 3088 + }, + { + "epoch": 0.3248714948663678, + "grad_norm": 1.7068699911559382, + "learning_rate": 3.8547289514368795e-06, + "loss": 0.9864, + "step": 3089 + }, + { + "epoch": 0.3249766653082151, + "grad_norm": 1.762520608645227, + "learning_rate": 3.854027253224375e-06, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 0.32508183575006244, + "grad_norm": 1.756341086493816, + "learning_rate": 3.853325404027313e-06, + "loss": 0.9738, + "step": 3091 + }, + { + "epoch": 0.32518700619190977, + "grad_norm": 1.9319056465036724, + "learning_rate": 3.852623403923955e-06, + "loss": 1.0034, + "step": 3092 + }, + { + "epoch": 0.3252921766337571, + "grad_norm": 2.627736625312551, + "learning_rate": 3.851921252992581e-06, + "loss": 1.0046, + "step": 3093 + }, + { + "epoch": 0.3253973470756044, + "grad_norm": 2.387276425999192, + "learning_rate": 3.851218951311484e-06, + "loss": 1.0061, + "step": 3094 + }, + { + "epoch": 0.32550251751745174, + "grad_norm": 2.6657811160762868, + "learning_rate": 3.850516498958979e-06, + "loss": 0.989, + "step": 3095 + }, + { + "epoch": 0.32560768795929906, + "grad_norm": 3.7563246886039785, + "learning_rate": 3.849813896013392e-06, + "loss": 0.9911, + "step": 3096 + }, + { + "epoch": 0.32571285840114633, + "grad_norm": 3.186401005858273, + "learning_rate": 3.849111142553069e-06, + "loss": 0.9781, + "step": 3097 + }, + { + "epoch": 0.32581802884299366, + "grad_norm": 2.8120268250532128, + "learning_rate": 3.848408238656375e-06, + "loss": 1.0248, + "step": 3098 + }, + { + "epoch": 0.325923199284841, + "grad_norm": 2.5367126653996985, + "learning_rate": 3.8477051844016865e-06, + "loss": 1.0108, + "step": 3099 + }, + { + "epoch": 0.3260283697266883, + "grad_norm": 2.4405235194407804, + "learning_rate": 3.847001979867401e-06, + "loss": 0.9774, + "step": 3100 + }, + { + "epoch": 0.32613354016853563, + "grad_norm": 1.8950206598525756, + "learning_rate": 3.846298625131931e-06, + "loss": 1.0017, + "step": 3101 + }, + { + "epoch": 0.32623871061038295, + "grad_norm": 2.7265934900210964, + "learning_rate": 3.845595120273706e-06, + "loss": 1.0165, + "step": 3102 + }, + { + "epoch": 
0.3263438810522303, + "grad_norm": 2.205211886185536, + "learning_rate": 3.8448914653711725e-06, + "loss": 1.037, + "step": 3103 + }, + { + "epoch": 0.3264490514940776, + "grad_norm": 1.9366743565239208, + "learning_rate": 3.844187660502792e-06, + "loss": 0.9738, + "step": 3104 + }, + { + "epoch": 0.3265542219359249, + "grad_norm": 2.593512907813824, + "learning_rate": 3.843483705747045e-06, + "loss": 1.0458, + "step": 3105 + }, + { + "epoch": 0.32665939237777225, + "grad_norm": 2.0167179831155115, + "learning_rate": 3.84277960118243e-06, + "loss": 1.0399, + "step": 3106 + }, + { + "epoch": 0.3267645628196195, + "grad_norm": 2.334539077805553, + "learning_rate": 3.842075346887457e-06, + "loss": 1.0077, + "step": 3107 + }, + { + "epoch": 0.32686973326146684, + "grad_norm": 2.604441795888962, + "learning_rate": 3.841370942940657e-06, + "loss": 1.0063, + "step": 3108 + }, + { + "epoch": 0.32697490370331417, + "grad_norm": 1.8763989090691047, + "learning_rate": 3.840666389420578e-06, + "loss": 1.0246, + "step": 3109 + }, + { + "epoch": 0.3270800741451615, + "grad_norm": 2.1829721118576337, + "learning_rate": 3.839961686405782e-06, + "loss": 1.0295, + "step": 3110 + }, + { + "epoch": 0.3271852445870088, + "grad_norm": 2.235743716349407, + "learning_rate": 3.839256833974848e-06, + "loss": 0.9813, + "step": 3111 + }, + { + "epoch": 0.32729041502885614, + "grad_norm": 2.682923955392616, + "learning_rate": 3.838551832206373e-06, + "loss": 0.9701, + "step": 3112 + }, + { + "epoch": 0.32739558547070347, + "grad_norm": 2.564345157324214, + "learning_rate": 3.8378466811789714e-06, + "loss": 1.0052, + "step": 3113 + }, + { + "epoch": 0.3275007559125508, + "grad_norm": 2.745614634832843, + "learning_rate": 3.837141380971271e-06, + "loss": 0.9663, + "step": 3114 + }, + { + "epoch": 0.3276059263543981, + "grad_norm": 4.102665613851121, + "learning_rate": 3.83643593166192e-06, + "loss": 1.0145, + "step": 3115 + }, + { + "epoch": 0.32771109679624544, + "grad_norm": 1.9187924233062632, + "learning_rate": 3.835730333329581e-06, + "loss": 1.0537, + "step": 3116 + }, + { + "epoch": 0.3278162672380927, + "grad_norm": 2.0366789642980856, + "learning_rate": 3.835024586052933e-06, + "loss": 1.0184, + "step": 3117 + }, + { + "epoch": 0.32792143767994003, + "grad_norm": 2.584247749331405, + "learning_rate": 3.834318689910673e-06, + "loss": 1.0199, + "step": 3118 + }, + { + "epoch": 0.32802660812178736, + "grad_norm": 2.3869440994924385, + "learning_rate": 3.833612644981514e-06, + "loss": 1.0261, + "step": 3119 + }, + { + "epoch": 0.3281317785636347, + "grad_norm": 1.744981904406401, + "learning_rate": 3.832906451344185e-06, + "loss": 0.9887, + "step": 3120 + }, + { + "epoch": 0.328236949005482, + "grad_norm": 2.205884774883593, + "learning_rate": 3.832200109077433e-06, + "loss": 1.0138, + "step": 3121 + }, + { + "epoch": 0.32834211944732933, + "grad_norm": 2.8960434627989566, + "learning_rate": 3.831493618260019e-06, + "loss": 1.0183, + "step": 3122 + }, + { + "epoch": 0.32844728988917665, + "grad_norm": 2.6153093943252803, + "learning_rate": 3.830786978970723e-06, + "loss": 1.0074, + "step": 3123 + }, + { + "epoch": 0.328552460331024, + "grad_norm": 3.2605234910491787, + "learning_rate": 3.830080191288342e-06, + "loss": 0.9695, + "step": 3124 + }, + { + "epoch": 0.3286576307728713, + "grad_norm": 2.3212838057371696, + "learning_rate": 3.829373255291686e-06, + "loss": 1.0006, + "step": 3125 + }, + { + "epoch": 0.3287628012147186, + "grad_norm": 3.1048512567149156, + "learning_rate": 3.828666171059586e-06, + "loss": 
1.0273, + "step": 3126 + }, + { + "epoch": 0.32886797165656595, + "grad_norm": 3.432285163654643, + "learning_rate": 3.827958938670886e-06, + "loss": 1.0458, + "step": 3127 + }, + { + "epoch": 0.3289731420984132, + "grad_norm": 2.819849325554979, + "learning_rate": 3.827251558204449e-06, + "loss": 0.9533, + "step": 3128 + }, + { + "epoch": 0.32907831254026054, + "grad_norm": 1.6507578523510593, + "learning_rate": 3.826544029739152e-06, + "loss": 0.9536, + "step": 3129 + }, + { + "epoch": 0.32918348298210787, + "grad_norm": 2.4830132435715537, + "learning_rate": 3.825836353353891e-06, + "loss": 1.0009, + "step": 3130 + }, + { + "epoch": 0.3292886534239552, + "grad_norm": 2.4797089042570533, + "learning_rate": 3.825128529127577e-06, + "loss": 0.9801, + "step": 3131 + }, + { + "epoch": 0.3293938238658025, + "grad_norm": 2.4239798237039594, + "learning_rate": 3.824420557139139e-06, + "loss": 1.0182, + "step": 3132 + }, + { + "epoch": 0.32949899430764984, + "grad_norm": 1.945405865543695, + "learning_rate": 3.82371243746752e-06, + "loss": 1.0321, + "step": 3133 + }, + { + "epoch": 0.32960416474949716, + "grad_norm": 2.084748306095817, + "learning_rate": 3.823004170191681e-06, + "loss": 1.0155, + "step": 3134 + }, + { + "epoch": 0.3297093351913445, + "grad_norm": 2.251126194835179, + "learning_rate": 3.8222957553906e-06, + "loss": 1.0392, + "step": 3135 + }, + { + "epoch": 0.3298145056331918, + "grad_norm": 2.2034664670180626, + "learning_rate": 3.821587193143269e-06, + "loss": 0.9897, + "step": 3136 + }, + { + "epoch": 0.32991967607503914, + "grad_norm": 2.9409945673975626, + "learning_rate": 3.8208784835287e-06, + "loss": 0.9982, + "step": 3137 + }, + { + "epoch": 0.3300248465168864, + "grad_norm": 3.058005825612, + "learning_rate": 3.82016962662592e-06, + "loss": 1.0317, + "step": 3138 + }, + { + "epoch": 0.33013001695873373, + "grad_norm": 3.490847243165153, + "learning_rate": 3.819460622513971e-06, + "loss": 1.0036, + "step": 3139 + }, + { + "epoch": 0.33023518740058105, + "grad_norm": 2.071993271550719, + "learning_rate": 3.818751471271912e-06, + "loss": 0.9886, + "step": 3140 + }, + { + "epoch": 0.3303403578424284, + "grad_norm": 2.799536363219469, + "learning_rate": 3.818042172978821e-06, + "loss": 0.9794, + "step": 3141 + }, + { + "epoch": 0.3304455282842757, + "grad_norm": 3.262735659956398, + "learning_rate": 3.8173327277137876e-06, + "loss": 1.0189, + "step": 3142 + }, + { + "epoch": 0.330550698726123, + "grad_norm": 2.1874156683363664, + "learning_rate": 3.816623135555921e-06, + "loss": 0.963, + "step": 3143 + }, + { + "epoch": 0.33065586916797035, + "grad_norm": 2.487129588775537, + "learning_rate": 3.815913396584348e-06, + "loss": 1.0097, + "step": 3144 + }, + { + "epoch": 0.3307610396098177, + "grad_norm": 2.8581708249255096, + "learning_rate": 3.815203510878209e-06, + "loss": 1.0382, + "step": 3145 + }, + { + "epoch": 0.330866210051665, + "grad_norm": 2.353074717228548, + "learning_rate": 3.814493478516661e-06, + "loss": 1.0092, + "step": 3146 + }, + { + "epoch": 0.3309713804935123, + "grad_norm": 2.7540007639438375, + "learning_rate": 3.813783299578879e-06, + "loss": 1.0189, + "step": 3147 + }, + { + "epoch": 0.3310765509353596, + "grad_norm": 2.4522274567147657, + "learning_rate": 3.8130729741440536e-06, + "loss": 1.0141, + "step": 3148 + }, + { + "epoch": 0.3311817213772069, + "grad_norm": 2.849670426659251, + "learning_rate": 3.8123625022913915e-06, + "loss": 1.0118, + "step": 3149 + }, + { + "epoch": 0.33128689181905424, + "grad_norm": 2.3301835319177373, + "learning_rate": 
3.811651884100115e-06, + "loss": 0.9961, + "step": 3150 + }, + { + "epoch": 0.33139206226090157, + "grad_norm": 2.3601708357586006, + "learning_rate": 3.8109411196494638e-06, + "loss": 0.9735, + "step": 3151 + }, + { + "epoch": 0.3314972327027489, + "grad_norm": 2.0573991989526474, + "learning_rate": 3.810230209018694e-06, + "loss": 1.021, + "step": 3152 + }, + { + "epoch": 0.3316024031445962, + "grad_norm": 2.9354196677378037, + "learning_rate": 3.809519152287079e-06, + "loss": 0.992, + "step": 3153 + }, + { + "epoch": 0.33170757358644354, + "grad_norm": 2.544344310226173, + "learning_rate": 3.8088079495339046e-06, + "loss": 0.957, + "step": 3154 + }, + { + "epoch": 0.33181274402829086, + "grad_norm": 2.2971377313074393, + "learning_rate": 3.8080966008384775e-06, + "loss": 1.0143, + "step": 3155 + }, + { + "epoch": 0.3319179144701382, + "grad_norm": 2.9198098245236026, + "learning_rate": 3.807385106280117e-06, + "loss": 0.966, + "step": 3156 + }, + { + "epoch": 0.3320230849119855, + "grad_norm": 2.787741411848439, + "learning_rate": 3.806673465938161e-06, + "loss": 1.0119, + "step": 3157 + }, + { + "epoch": 0.3321282553538328, + "grad_norm": 2.139566525593868, + "learning_rate": 3.805961679891964e-06, + "loss": 1.0094, + "step": 3158 + }, + { + "epoch": 0.3322334257956801, + "grad_norm": 2.55889761213254, + "learning_rate": 3.805249748220893e-06, + "loss": 1.0046, + "step": 3159 + }, + { + "epoch": 0.33233859623752743, + "grad_norm": 2.811994900697266, + "learning_rate": 3.804537671004337e-06, + "loss": 1.0026, + "step": 3160 + }, + { + "epoch": 0.33244376667937475, + "grad_norm": 2.6857582832549745, + "learning_rate": 3.8038254483216962e-06, + "loss": 0.9833, + "step": 3161 + }, + { + "epoch": 0.3325489371212221, + "grad_norm": 2.6228532392360866, + "learning_rate": 3.8031130802523896e-06, + "loss": 1.0335, + "step": 3162 + }, + { + "epoch": 0.3326541075630694, + "grad_norm": 2.6508818240324836, + "learning_rate": 3.802400566875851e-06, + "loss": 1.0416, + "step": 3163 + }, + { + "epoch": 0.3327592780049167, + "grad_norm": 2.708891165068549, + "learning_rate": 3.8016879082715326e-06, + "loss": 1.0311, + "step": 3164 + }, + { + "epoch": 0.33286444844676405, + "grad_norm": 1.9791714208493958, + "learning_rate": 3.8009751045189004e-06, + "loss": 1.0022, + "step": 3165 + }, + { + "epoch": 0.3329696188886114, + "grad_norm": 3.4480804252433592, + "learning_rate": 3.8002621556974367e-06, + "loss": 1.0225, + "step": 3166 + }, + { + "epoch": 0.3330747893304587, + "grad_norm": 1.9335962357260972, + "learning_rate": 3.7995490618866425e-06, + "loss": 0.9969, + "step": 3167 + }, + { + "epoch": 0.33317995977230597, + "grad_norm": 2.388818127758868, + "learning_rate": 3.7988358231660333e-06, + "loss": 1.023, + "step": 3168 + }, + { + "epoch": 0.3332851302141533, + "grad_norm": 3.4063184395962445, + "learning_rate": 3.7981224396151393e-06, + "loss": 1.0006, + "step": 3169 + }, + { + "epoch": 0.3333903006560006, + "grad_norm": 3.1228965688498764, + "learning_rate": 3.7974089113135094e-06, + "loss": 1.0256, + "step": 3170 + }, + { + "epoch": 0.33349547109784794, + "grad_norm": 2.969409956688769, + "learning_rate": 3.7966952383407075e-06, + "loss": 0.9796, + "step": 3171 + }, + { + "epoch": 0.33360064153969526, + "grad_norm": 1.772482770164931, + "learning_rate": 3.7959814207763134e-06, + "loss": 0.9964, + "step": 3172 + }, + { + "epoch": 0.3337058119815426, + "grad_norm": 2.5201047196046034, + "learning_rate": 3.7952674586999226e-06, + "loss": 1.0227, + "step": 3173 + }, + { + "epoch": 0.3338109824233899, 
+ "grad_norm": 2.848018681472792, + "learning_rate": 3.794553352191149e-06, + "loss": 1.0123, + "step": 3174 + }, + { + "epoch": 0.33391615286523724, + "grad_norm": 3.1213302281106547, + "learning_rate": 3.7938391013296193e-06, + "loss": 1.0099, + "step": 3175 + }, + { + "epoch": 0.33402132330708456, + "grad_norm": 2.132897712038548, + "learning_rate": 3.79312470619498e-06, + "loss": 1.0306, + "step": 3176 + }, + { + "epoch": 0.3341264937489319, + "grad_norm": 1.6980061002378293, + "learning_rate": 3.79241016686689e-06, + "loss": 0.9981, + "step": 3177 + }, + { + "epoch": 0.33423166419077915, + "grad_norm": 2.133136371300042, + "learning_rate": 3.791695483425026e-06, + "loss": 0.9986, + "step": 3178 + }, + { + "epoch": 0.3343368346326265, + "grad_norm": 2.1135831702313888, + "learning_rate": 3.7909806559490827e-06, + "loss": 0.9966, + "step": 3179 + }, + { + "epoch": 0.3344420050744738, + "grad_norm": 2.517571111822009, + "learning_rate": 3.790265684518767e-06, + "loss": 1.0228, + "step": 3180 + }, + { + "epoch": 0.3345471755163211, + "grad_norm": 2.98686980884583, + "learning_rate": 3.7895505692138045e-06, + "loss": 1.0223, + "step": 3181 + }, + { + "epoch": 0.33465234595816845, + "grad_norm": 2.261356671644225, + "learning_rate": 3.7888353101139353e-06, + "loss": 1.0245, + "step": 3182 + }, + { + "epoch": 0.3347575164000158, + "grad_norm": 2.409763840683323, + "learning_rate": 3.7881199072989176e-06, + "loss": 0.9886, + "step": 3183 + }, + { + "epoch": 0.3348626868418631, + "grad_norm": 1.9087618932889743, + "learning_rate": 3.7874043608485234e-06, + "loss": 1.0211, + "step": 3184 + }, + { + "epoch": 0.3349678572837104, + "grad_norm": 1.998174510767556, + "learning_rate": 3.7866886708425427e-06, + "loss": 1.0038, + "step": 3185 + }, + { + "epoch": 0.33507302772555775, + "grad_norm": 1.9103622576526025, + "learning_rate": 3.785972837360779e-06, + "loss": 1.0043, + "step": 3186 + }, + { + "epoch": 0.3351781981674051, + "grad_norm": 1.9379810577362075, + "learning_rate": 3.785256860483054e-06, + "loss": 1.0059, + "step": 3187 + }, + { + "epoch": 0.3352833686092524, + "grad_norm": 2.9610384556218, + "learning_rate": 3.7845407402892066e-06, + "loss": 0.9998, + "step": 3188 + }, + { + "epoch": 0.33538853905109967, + "grad_norm": 2.2025225245305076, + "learning_rate": 3.7838244768590866e-06, + "loss": 1.0089, + "step": 3189 + }, + { + "epoch": 0.335493709492947, + "grad_norm": 2.8328379269098076, + "learning_rate": 3.7831080702725643e-06, + "loss": 1.0317, + "step": 3190 + }, + { + "epoch": 0.3355988799347943, + "grad_norm": 1.9624846245151617, + "learning_rate": 3.7823915206095246e-06, + "loss": 0.9584, + "step": 3191 + }, + { + "epoch": 0.33570405037664164, + "grad_norm": 2.3934098571822546, + "learning_rate": 3.7816748279498687e-06, + "loss": 1.0102, + "step": 3192 + }, + { + "epoch": 0.33580922081848896, + "grad_norm": 2.1927978381480053, + "learning_rate": 3.780957992373513e-06, + "loss": 1.0098, + "step": 3193 + }, + { + "epoch": 0.3359143912603363, + "grad_norm": 2.7690159806961896, + "learning_rate": 3.7802410139603908e-06, + "loss": 0.9457, + "step": 3194 + }, + { + "epoch": 0.3360195617021836, + "grad_norm": 2.5415137912355137, + "learning_rate": 3.7795238927904497e-06, + "loss": 0.9811, + "step": 3195 + }, + { + "epoch": 0.33612473214403094, + "grad_norm": 2.895165075185927, + "learning_rate": 3.778806628943655e-06, + "loss": 1.0291, + "step": 3196 + }, + { + "epoch": 0.33622990258587826, + "grad_norm": 3.0795576587129676, + "learning_rate": 3.778089222499987e-06, + "loss": 1.0233, + 
"step": 3197 + }, + { + "epoch": 0.3363350730277256, + "grad_norm": 2.4751867790109943, + "learning_rate": 3.7773716735394415e-06, + "loss": 1.0516, + "step": 3198 + }, + { + "epoch": 0.33644024346957285, + "grad_norm": 2.4430066060566134, + "learning_rate": 3.776653982142033e-06, + "loss": 0.9791, + "step": 3199 + }, + { + "epoch": 0.3365454139114202, + "grad_norm": 2.5324655109273606, + "learning_rate": 3.7759361483877865e-06, + "loss": 0.9815, + "step": 3200 + }, + { + "epoch": 0.3366505843532675, + "grad_norm": 2.7523454023910365, + "learning_rate": 3.7752181723567484e-06, + "loss": 1.0079, + "step": 3201 + }, + { + "epoch": 0.3367557547951148, + "grad_norm": 2.3630412261657607, + "learning_rate": 3.7745000541289777e-06, + "loss": 1.0065, + "step": 3202 + }, + { + "epoch": 0.33686092523696215, + "grad_norm": 2.775762382934883, + "learning_rate": 3.7737817937845504e-06, + "loss": 0.9977, + "step": 3203 + }, + { + "epoch": 0.3369660956788095, + "grad_norm": 2.4895504075619606, + "learning_rate": 3.7730633914035585e-06, + "loss": 1.0495, + "step": 3204 + }, + { + "epoch": 0.3370712661206568, + "grad_norm": 2.721766046225593, + "learning_rate": 3.7723448470661084e-06, + "loss": 0.949, + "step": 3205 + }, + { + "epoch": 0.3371764365625041, + "grad_norm": 2.9668981245557053, + "learning_rate": 3.771626160852324e-06, + "loss": 0.9979, + "step": 3206 + }, + { + "epoch": 0.33728160700435145, + "grad_norm": 3.793048897027738, + "learning_rate": 3.770907332842344e-06, + "loss": 1.0005, + "step": 3207 + }, + { + "epoch": 0.33738677744619877, + "grad_norm": 2.2379198509213, + "learning_rate": 3.770188363116324e-06, + "loss": 0.9751, + "step": 3208 + }, + { + "epoch": 0.33749194788804604, + "grad_norm": 3.13559481810738, + "learning_rate": 3.769469251754434e-06, + "loss": 0.9865, + "step": 3209 + }, + { + "epoch": 0.33759711832989336, + "grad_norm": 2.717848142691034, + "learning_rate": 3.7687499988368613e-06, + "loss": 1.0444, + "step": 3210 + }, + { + "epoch": 0.3377022887717407, + "grad_norm": 3.9961109065173073, + "learning_rate": 3.7680306044438074e-06, + "loss": 1.0281, + "step": 3211 + }, + { + "epoch": 0.337807459213588, + "grad_norm": 2.3111397830579867, + "learning_rate": 3.7673110686554915e-06, + "loss": 0.9546, + "step": 3212 + }, + { + "epoch": 0.33791262965543534, + "grad_norm": 2.556022750068411, + "learning_rate": 3.766591391552146e-06, + "loss": 1.0199, + "step": 3213 + }, + { + "epoch": 0.33801780009728266, + "grad_norm": 2.8546999903249555, + "learning_rate": 3.7658715732140206e-06, + "loss": 0.985, + "step": 3214 + }, + { + "epoch": 0.33812297053913, + "grad_norm": 3.3523241411809104, + "learning_rate": 3.7651516137213816e-06, + "loss": 1.0066, + "step": 3215 + }, + { + "epoch": 0.3382281409809773, + "grad_norm": 1.9532386436425269, + "learning_rate": 3.7644315131545096e-06, + "loss": 0.972, + "step": 3216 + }, + { + "epoch": 0.33833331142282463, + "grad_norm": 2.721629663326664, + "learning_rate": 3.763711271593702e-06, + "loss": 0.9979, + "step": 3217 + }, + { + "epoch": 0.33843848186467196, + "grad_norm": 3.4541692663759904, + "learning_rate": 3.76299088911927e-06, + "loss": 1.0571, + "step": 3218 + }, + { + "epoch": 0.3385436523065192, + "grad_norm": 2.976934045713143, + "learning_rate": 3.7622703658115435e-06, + "loss": 1.0059, + "step": 3219 + }, + { + "epoch": 0.33864882274836655, + "grad_norm": 3.0140365934725843, + "learning_rate": 3.761549701750865e-06, + "loss": 1.0244, + "step": 3220 + }, + { + "epoch": 0.3387539931902139, + "grad_norm": 1.9982975993871346, + 
"learning_rate": 3.7608288970175943e-06, + "loss": 1.0215, + "step": 3221 + }, + { + "epoch": 0.3388591636320612, + "grad_norm": 2.8744873417988788, + "learning_rate": 3.7601079516921076e-06, + "loss": 1.0015, + "step": 3222 + }, + { + "epoch": 0.3389643340739085, + "grad_norm": 2.150725684186714, + "learning_rate": 3.759386865854795e-06, + "loss": 1.0129, + "step": 3223 + }, + { + "epoch": 0.33906950451575585, + "grad_norm": 2.6319253916407, + "learning_rate": 3.758665639586064e-06, + "loss": 0.9982, + "step": 3224 + }, + { + "epoch": 0.3391746749576032, + "grad_norm": 2.1192499513393357, + "learning_rate": 3.7579442729663364e-06, + "loss": 0.9852, + "step": 3225 + }, + { + "epoch": 0.3392798453994505, + "grad_norm": 1.842379992675114, + "learning_rate": 3.75722276607605e-06, + "loss": 0.9908, + "step": 3226 + }, + { + "epoch": 0.3393850158412978, + "grad_norm": 2.055975701068765, + "learning_rate": 3.7565011189956597e-06, + "loss": 0.9898, + "step": 3227 + }, + { + "epoch": 0.33949018628314515, + "grad_norm": 2.4934374887026474, + "learning_rate": 3.7557793318056323e-06, + "loss": 0.9996, + "step": 3228 + }, + { + "epoch": 0.3395953567249924, + "grad_norm": 2.845049089637477, + "learning_rate": 3.755057404586455e-06, + "loss": 1.0016, + "step": 3229 + }, + { + "epoch": 0.33970052716683974, + "grad_norm": 3.0414746615738957, + "learning_rate": 3.7543353374186274e-06, + "loss": 1.0384, + "step": 3230 + }, + { + "epoch": 0.33980569760868706, + "grad_norm": 2.3969907781571793, + "learning_rate": 3.753613130382666e-06, + "loss": 1.0169, + "step": 3231 + }, + { + "epoch": 0.3399108680505344, + "grad_norm": 3.100902023873532, + "learning_rate": 3.752890783559102e-06, + "loss": 0.9903, + "step": 3232 + }, + { + "epoch": 0.3400160384923817, + "grad_norm": 2.515669648178997, + "learning_rate": 3.7521682970284827e-06, + "loss": 1.014, + "step": 3233 + }, + { + "epoch": 0.34012120893422904, + "grad_norm": 2.0868339076531215, + "learning_rate": 3.7514456708713717e-06, + "loss": 0.9637, + "step": 3234 + }, + { + "epoch": 0.34022637937607636, + "grad_norm": 2.837174645842647, + "learning_rate": 3.750722905168346e-06, + "loss": 0.9769, + "step": 3235 + }, + { + "epoch": 0.3403315498179237, + "grad_norm": 2.3872202244360547, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.003, + "step": 3236 + }, + { + "epoch": 0.340436720259771, + "grad_norm": 2.512997542175295, + "learning_rate": 3.7492769554469443e-06, + "loss": 0.9801, + "step": 3237 + }, + { + "epoch": 0.34054189070161833, + "grad_norm": 2.2402243033296507, + "learning_rate": 3.7485537715898036e-06, + "loss": 1.0344, + "step": 3238 + }, + { + "epoch": 0.3406470611434656, + "grad_norm": 1.788467788663251, + "learning_rate": 3.7478304485092174e-06, + "loss": 0.9945, + "step": 3239 + }, + { + "epoch": 0.3407522315853129, + "grad_norm": 2.438950326014985, + "learning_rate": 3.7471069862858446e-06, + "loss": 1.0212, + "step": 3240 + }, + { + "epoch": 0.34085740202716025, + "grad_norm": 2.625325390678325, + "learning_rate": 3.7463833850003532e-06, + "loss": 1.0471, + "step": 3241 + }, + { + "epoch": 0.3409625724690076, + "grad_norm": 2.298721915181398, + "learning_rate": 3.7456596447334324e-06, + "loss": 1.0091, + "step": 3242 + }, + { + "epoch": 0.3410677429108549, + "grad_norm": 2.393036922682323, + "learning_rate": 3.7449357655657854e-06, + "loss": 1.0367, + "step": 3243 + }, + { + "epoch": 0.3411729133527022, + "grad_norm": 2.439476214622686, + "learning_rate": 3.744211747578129e-06, + "loss": 1.0245, + "step": 3244 + }, + { + "epoch": 
0.34127808379454955, + "grad_norm": 2.3404240819465536, + "learning_rate": 3.743487590851197e-06, + "loss": 0.9768, + "step": 3245 + }, + { + "epoch": 0.34138325423639687, + "grad_norm": 2.4310810439248467, + "learning_rate": 3.742763295465739e-06, + "loss": 1.0063, + "step": 3246 + }, + { + "epoch": 0.3414884246782442, + "grad_norm": 2.7737923645649105, + "learning_rate": 3.7420388615025194e-06, + "loss": 1.004, + "step": 3247 + }, + { + "epoch": 0.3415935951200915, + "grad_norm": 2.525162719248003, + "learning_rate": 3.7413142890423187e-06, + "loss": 1.0317, + "step": 3248 + }, + { + "epoch": 0.34169876556193884, + "grad_norm": 2.453175198423566, + "learning_rate": 3.740589578165932e-06, + "loss": 1.0053, + "step": 3249 + }, + { + "epoch": 0.3418039360037861, + "grad_norm": 2.412643165410365, + "learning_rate": 3.7398647289541703e-06, + "loss": 1.0517, + "step": 3250 + }, + { + "epoch": 0.34190910644563344, + "grad_norm": 1.8217035736085005, + "learning_rate": 3.7391397414878583e-06, + "loss": 0.9601, + "step": 3251 + }, + { + "epoch": 0.34201427688748076, + "grad_norm": 2.542322284034248, + "learning_rate": 3.73841461584784e-06, + "loss": 0.9978, + "step": 3252 + }, + { + "epoch": 0.3421194473293281, + "grad_norm": 1.9724168119288237, + "learning_rate": 3.7376893521149713e-06, + "loss": 0.9739, + "step": 3253 + }, + { + "epoch": 0.3422246177711754, + "grad_norm": 2.3405127012102325, + "learning_rate": 3.736963950370126e-06, + "loss": 1.0283, + "step": 3254 + }, + { + "epoch": 0.34232978821302273, + "grad_norm": 2.373486349876508, + "learning_rate": 3.7362384106941906e-06, + "loss": 0.9996, + "step": 3255 + }, + { + "epoch": 0.34243495865487006, + "grad_norm": 2.312609174057517, + "learning_rate": 3.7355127331680695e-06, + "loss": 1.0166, + "step": 3256 + }, + { + "epoch": 0.3425401290967174, + "grad_norm": 1.9211058932583842, + "learning_rate": 3.7347869178726804e-06, + "loss": 0.9955, + "step": 3257 + }, + { + "epoch": 0.3426452995385647, + "grad_norm": 6.881539412158539, + "learning_rate": 3.7340609648889575e-06, + "loss": 1.0193, + "step": 3258 + }, + { + "epoch": 0.34275046998041203, + "grad_norm": 2.0321978311810445, + "learning_rate": 3.7333348742978515e-06, + "loss": 0.9756, + "step": 3259 + }, + { + "epoch": 0.3428556404222593, + "grad_norm": 2.6613297784968886, + "learning_rate": 3.7326086461803247e-06, + "loss": 0.9983, + "step": 3260 + }, + { + "epoch": 0.3429608108641066, + "grad_norm": 2.0304428187193784, + "learning_rate": 3.7318822806173594e-06, + "loss": 1.0042, + "step": 3261 + }, + { + "epoch": 0.34306598130595395, + "grad_norm": 2.5112230342073842, + "learning_rate": 3.73115577768995e-06, + "loss": 0.9995, + "step": 3262 + }, + { + "epoch": 0.3431711517478013, + "grad_norm": 2.0313985757227706, + "learning_rate": 3.7304291374791085e-06, + "loss": 0.9986, + "step": 3263 + }, + { + "epoch": 0.3432763221896486, + "grad_norm": 2.5690827796961737, + "learning_rate": 3.7297023600658586e-06, + "loss": 0.9691, + "step": 3264 + }, + { + "epoch": 0.3433814926314959, + "grad_norm": 2.193452123288259, + "learning_rate": 3.7289754455312434e-06, + "loss": 0.9885, + "step": 3265 + }, + { + "epoch": 0.34348666307334325, + "grad_norm": 2.6601267632676575, + "learning_rate": 3.728248393956319e-06, + "loss": 1.0152, + "step": 3266 + }, + { + "epoch": 0.34359183351519057, + "grad_norm": 2.42078911309029, + "learning_rate": 3.7275212054221566e-06, + "loss": 1.0108, + "step": 3267 + }, + { + "epoch": 0.3436970039570379, + "grad_norm": 2.269243127837986, + "learning_rate": 
3.7267938800098454e-06, + "loss": 0.9881, + "step": 3268 + }, + { + "epoch": 0.3438021743988852, + "grad_norm": 2.694411424356494, + "learning_rate": 3.7260664178004858e-06, + "loss": 1.0177, + "step": 3269 + }, + { + "epoch": 0.3439073448407325, + "grad_norm": 3.1436582660691896, + "learning_rate": 3.725338818875196e-06, + "loss": 1.0144, + "step": 3270 + }, + { + "epoch": 0.3440125152825798, + "grad_norm": 3.099436287839511, + "learning_rate": 3.724611083315109e-06, + "loss": 1.0101, + "step": 3271 + }, + { + "epoch": 0.34411768572442714, + "grad_norm": 2.7296113243028284, + "learning_rate": 3.7238832112013734e-06, + "loss": 1.023, + "step": 3272 + }, + { + "epoch": 0.34422285616627446, + "grad_norm": 2.2215641054092155, + "learning_rate": 3.723155202615153e-06, + "loss": 1.0077, + "step": 3273 + }, + { + "epoch": 0.3443280266081218, + "grad_norm": 2.627762150099357, + "learning_rate": 3.7224270576376243e-06, + "loss": 1.0031, + "step": 3274 + }, + { + "epoch": 0.3444331970499691, + "grad_norm": 2.368570833568216, + "learning_rate": 3.7216987763499835e-06, + "loss": 1.0204, + "step": 3275 + }, + { + "epoch": 0.34453836749181643, + "grad_norm": 2.1971410365900406, + "learning_rate": 3.7209703588334384e-06, + "loss": 1.0001, + "step": 3276 + }, + { + "epoch": 0.34464353793366376, + "grad_norm": 3.176673998132595, + "learning_rate": 3.7202418051692134e-06, + "loss": 1.0113, + "step": 3277 + }, + { + "epoch": 0.3447487083755111, + "grad_norm": 2.304560441480544, + "learning_rate": 3.719513115438548e-06, + "loss": 1.0286, + "step": 3278 + }, + { + "epoch": 0.3448538788173584, + "grad_norm": 2.3874420483075878, + "learning_rate": 3.7187842897226965e-06, + "loss": 1.0086, + "step": 3279 + }, + { + "epoch": 0.3449590492592057, + "grad_norm": 2.6394476203678696, + "learning_rate": 3.7180553281029295e-06, + "loss": 0.9717, + "step": 3280 + }, + { + "epoch": 0.345064219701053, + "grad_norm": 2.4270461247707034, + "learning_rate": 3.7173262306605307e-06, + "loss": 1.0398, + "step": 3281 + }, + { + "epoch": 0.3451693901429003, + "grad_norm": 3.626716130228977, + "learning_rate": 3.7165969974768005e-06, + "loss": 1.0027, + "step": 3282 + }, + { + "epoch": 0.34527456058474765, + "grad_norm": 3.345320093587124, + "learning_rate": 3.715867628633055e-06, + "loss": 0.9806, + "step": 3283 + }, + { + "epoch": 0.34537973102659497, + "grad_norm": 2.4384035873967025, + "learning_rate": 3.7151381242106232e-06, + "loss": 0.9817, + "step": 3284 + }, + { + "epoch": 0.3454849014684423, + "grad_norm": 2.6159320854024397, + "learning_rate": 3.7144084842908506e-06, + "loss": 0.9761, + "step": 3285 + }, + { + "epoch": 0.3455900719102896, + "grad_norm": 2.8847114804108696, + "learning_rate": 3.7136787089550986e-06, + "loss": 0.9864, + "step": 3286 + }, + { + "epoch": 0.34569524235213694, + "grad_norm": 3.029486595806022, + "learning_rate": 3.7129487982847422e-06, + "loss": 1.0003, + "step": 3287 + }, + { + "epoch": 0.34580041279398427, + "grad_norm": 3.109537564490848, + "learning_rate": 3.7122187523611724e-06, + "loss": 1.009, + "step": 3288 + }, + { + "epoch": 0.3459055832358316, + "grad_norm": 2.9156800483209326, + "learning_rate": 3.711488571265795e-06, + "loss": 0.9898, + "step": 3289 + }, + { + "epoch": 0.34601075367767886, + "grad_norm": 2.711869560964191, + "learning_rate": 3.71075825508003e-06, + "loss": 0.9939, + "step": 3290 + }, + { + "epoch": 0.3461159241195262, + "grad_norm": 1.4908550438638197, + "learning_rate": 3.7100278038853157e-06, + "loss": 1.0175, + "step": 3291 + }, + { + "epoch": 
0.3462210945613735, + "grad_norm": 2.7237754282190534, + "learning_rate": 3.7092972177630998e-06, + "loss": 0.999, + "step": 3292 + }, + { + "epoch": 0.34632626500322083, + "grad_norm": 2.1709243952269577, + "learning_rate": 3.708566496794851e-06, + "loss": 0.9716, + "step": 3293 + }, + { + "epoch": 0.34643143544506816, + "grad_norm": 2.316753104081801, + "learning_rate": 3.7078356410620484e-06, + "loss": 1.0317, + "step": 3294 + }, + { + "epoch": 0.3465366058869155, + "grad_norm": 2.3542621040463567, + "learning_rate": 3.7071046506461893e-06, + "loss": 1.0029, + "step": 3295 + }, + { + "epoch": 0.3466417763287628, + "grad_norm": 1.9708389723051476, + "learning_rate": 3.7063735256287854e-06, + "loss": 0.9978, + "step": 3296 + }, + { + "epoch": 0.34674694677061013, + "grad_norm": 2.8033237545972414, + "learning_rate": 3.705642266091361e-06, + "loss": 0.9875, + "step": 3297 + }, + { + "epoch": 0.34685211721245746, + "grad_norm": 2.6769139841372196, + "learning_rate": 3.7049108721154586e-06, + "loss": 1.0103, + "step": 3298 + }, + { + "epoch": 0.3469572876543048, + "grad_norm": 2.2403444086333053, + "learning_rate": 3.7041793437826336e-06, + "loss": 1.0481, + "step": 3299 + }, + { + "epoch": 0.34706245809615205, + "grad_norm": 2.260213376943109, + "learning_rate": 3.703447681174458e-06, + "loss": 1.0359, + "step": 3300 + }, + { + "epoch": 0.3471676285379994, + "grad_norm": 2.367948574095755, + "learning_rate": 3.7027158843725164e-06, + "loss": 1.0121, + "step": 3301 + }, + { + "epoch": 0.3472727989798467, + "grad_norm": 2.250568883640813, + "learning_rate": 3.7019839534584113e-06, + "loss": 0.9927, + "step": 3302 + }, + { + "epoch": 0.347377969421694, + "grad_norm": 3.1053336133122396, + "learning_rate": 3.7012518885137572e-06, + "loss": 1.0122, + "step": 3303 + }, + { + "epoch": 0.34748313986354135, + "grad_norm": 2.1013260166293732, + "learning_rate": 3.7005196896201867e-06, + "loss": 1.0169, + "step": 3304 + }, + { + "epoch": 0.34758831030538867, + "grad_norm": 1.959095033611609, + "learning_rate": 3.6997873568593446e-06, + "loss": 1.0015, + "step": 3305 + }, + { + "epoch": 0.347693480747236, + "grad_norm": 2.511723319973884, + "learning_rate": 3.699054890312892e-06, + "loss": 0.9941, + "step": 3306 + }, + { + "epoch": 0.3477986511890833, + "grad_norm": 2.5395710231112902, + "learning_rate": 3.6983222900625047e-06, + "loss": 0.9469, + "step": 3307 + }, + { + "epoch": 0.34790382163093064, + "grad_norm": 2.7171955825527783, + "learning_rate": 3.697589556189873e-06, + "loss": 1.0311, + "step": 3308 + }, + { + "epoch": 0.34800899207277797, + "grad_norm": 2.5690913311688997, + "learning_rate": 3.6968566887767027e-06, + "loss": 1.0443, + "step": 3309 + }, + { + "epoch": 0.3481141625146253, + "grad_norm": 3.0893677787025147, + "learning_rate": 3.6961236879047135e-06, + "loss": 1.0376, + "step": 3310 + }, + { + "epoch": 0.34821933295647256, + "grad_norm": 2.3846973615171243, + "learning_rate": 3.695390553655642e-06, + "loss": 1.0051, + "step": 3311 + }, + { + "epoch": 0.3483245033983199, + "grad_norm": 2.6749794579639006, + "learning_rate": 3.6946572861112373e-06, + "loss": 0.9813, + "step": 3312 + }, + { + "epoch": 0.3484296738401672, + "grad_norm": 2.07377234986436, + "learning_rate": 3.6939238853532657e-06, + "loss": 0.9667, + "step": 3313 + }, + { + "epoch": 0.34853484428201453, + "grad_norm": 3.060207599086186, + "learning_rate": 3.693190351463505e-06, + "loss": 1.0142, + "step": 3314 + }, + { + "epoch": 0.34864001472386186, + "grad_norm": 2.936425505722273, + "learning_rate": 
3.6924566845237508e-06, + "loss": 0.9837, + "step": 3315 + }, + { + "epoch": 0.3487451851657092, + "grad_norm": 3.587810211037129, + "learning_rate": 3.691722884615814e-06, + "loss": 1.0268, + "step": 3316 + }, + { + "epoch": 0.3488503556075565, + "grad_norm": 2.1046353489321077, + "learning_rate": 3.690988951821517e-06, + "loss": 1.0083, + "step": 3317 + }, + { + "epoch": 0.34895552604940383, + "grad_norm": 2.5495061254512805, + "learning_rate": 3.6902548862227007e-06, + "loss": 1.0373, + "step": 3318 + }, + { + "epoch": 0.34906069649125115, + "grad_norm": 3.116414932564101, + "learning_rate": 3.689520687901218e-06, + "loss": 1.039, + "step": 3319 + }, + { + "epoch": 0.3491658669330985, + "grad_norm": 2.0740724861509197, + "learning_rate": 3.6887863569389388e-06, + "loss": 1.0328, + "step": 3320 + }, + { + "epoch": 0.34927103737494575, + "grad_norm": 3.0783194643210865, + "learning_rate": 3.688051893417745e-06, + "loss": 1.0286, + "step": 3321 + }, + { + "epoch": 0.34937620781679307, + "grad_norm": 2.66641944778877, + "learning_rate": 3.687317297419536e-06, + "loss": 1.037, + "step": 3322 + }, + { + "epoch": 0.3494813782586404, + "grad_norm": 2.7985552282879804, + "learning_rate": 3.6865825690262256e-06, + "loss": 1.0158, + "step": 3323 + }, + { + "epoch": 0.3495865487004877, + "grad_norm": 2.3794909564668623, + "learning_rate": 3.6858477083197403e-06, + "loss": 1.0267, + "step": 3324 + }, + { + "epoch": 0.34969171914233504, + "grad_norm": 2.6094670978251115, + "learning_rate": 3.6851127153820243e-06, + "loss": 1.0293, + "step": 3325 + }, + { + "epoch": 0.34979688958418237, + "grad_norm": 3.154815924297272, + "learning_rate": 3.684377590295034e-06, + "loss": 0.9958, + "step": 3326 + }, + { + "epoch": 0.3499020600260297, + "grad_norm": 2.383540391351354, + "learning_rate": 3.6836423331407416e-06, + "loss": 1.0109, + "step": 3327 + }, + { + "epoch": 0.350007230467877, + "grad_norm": 1.9040090397223854, + "learning_rate": 3.6829069440011343e-06, + "loss": 0.9934, + "step": 3328 + }, + { + "epoch": 0.35011240090972434, + "grad_norm": 2.779759189993991, + "learning_rate": 3.682171422958214e-06, + "loss": 1.007, + "step": 3329 + }, + { + "epoch": 0.35021757135157167, + "grad_norm": 3.2401364529589824, + "learning_rate": 3.681435770093996e-06, + "loss": 1.0305, + "step": 3330 + }, + { + "epoch": 0.35032274179341893, + "grad_norm": 3.727910090979757, + "learning_rate": 3.6806999854905117e-06, + "loss": 1.0082, + "step": 3331 + }, + { + "epoch": 0.35042791223526626, + "grad_norm": 2.5069421827992153, + "learning_rate": 3.6799640692298076e-06, + "loss": 0.9588, + "step": 3332 + }, + { + "epoch": 0.3505330826771136, + "grad_norm": 3.2086550817958877, + "learning_rate": 3.6792280213939433e-06, + "loss": 1.0142, + "step": 3333 + }, + { + "epoch": 0.3506382531189609, + "grad_norm": 1.736538961113674, + "learning_rate": 3.6784918420649952e-06, + "loss": 0.9666, + "step": 3334 + }, + { + "epoch": 0.35074342356080823, + "grad_norm": 1.8790958449036552, + "learning_rate": 3.6777555313250505e-06, + "loss": 1.0285, + "step": 3335 + }, + { + "epoch": 0.35084859400265556, + "grad_norm": 2.3400912078637677, + "learning_rate": 3.6770190892562154e-06, + "loss": 1.0094, + "step": 3336 + }, + { + "epoch": 0.3509537644445029, + "grad_norm": 1.9690870053528573, + "learning_rate": 3.676282515940608e-06, + "loss": 0.9816, + "step": 3337 + }, + { + "epoch": 0.3510589348863502, + "grad_norm": 2.517883459106263, + "learning_rate": 3.675545811460362e-06, + "loss": 1.0008, + "step": 3338 + }, + { + "epoch": 
0.35116410532819753, + "grad_norm": 4.054324173962527, + "learning_rate": 3.6748089758976267e-06, + "loss": 0.9958, + "step": 3339 + }, + { + "epoch": 0.35126927577004485, + "grad_norm": 3.008203369375549, + "learning_rate": 3.6740720093345646e-06, + "loss": 0.9959, + "step": 3340 + }, + { + "epoch": 0.3513744462118921, + "grad_norm": 2.2054094909667543, + "learning_rate": 3.6733349118533524e-06, + "loss": 0.9362, + "step": 3341 + }, + { + "epoch": 0.35147961665373945, + "grad_norm": 2.147723777459706, + "learning_rate": 3.6725976835361832e-06, + "loss": 1.0227, + "step": 3342 + }, + { + "epoch": 0.35158478709558677, + "grad_norm": 3.4189072148500466, + "learning_rate": 3.6718603244652634e-06, + "loss": 1.0129, + "step": 3343 + }, + { + "epoch": 0.3516899575374341, + "grad_norm": 2.39516829951863, + "learning_rate": 3.671122834722814e-06, + "loss": 1.0158, + "step": 3344 + }, + { + "epoch": 0.3517951279792814, + "grad_norm": 2.9471635187278786, + "learning_rate": 3.67038521439107e-06, + "loss": 0.9988, + "step": 3345 + }, + { + "epoch": 0.35190029842112874, + "grad_norm": 2.7956147749271136, + "learning_rate": 3.669647463552284e-06, + "loss": 1.0219, + "step": 3346 + }, + { + "epoch": 0.35200546886297607, + "grad_norm": 2.549496541093747, + "learning_rate": 3.6689095822887188e-06, + "loss": 1.0342, + "step": 3347 + }, + { + "epoch": 0.3521106393048234, + "grad_norm": 3.3421799590829755, + "learning_rate": 3.6681715706826555e-06, + "loss": 0.9693, + "step": 3348 + }, + { + "epoch": 0.3522158097466707, + "grad_norm": 2.7877765286056575, + "learning_rate": 3.6674334288163872e-06, + "loss": 0.9698, + "step": 3349 + }, + { + "epoch": 0.35232098018851804, + "grad_norm": 2.850956387184573, + "learning_rate": 3.666695156772222e-06, + "loss": 1.0254, + "step": 3350 + }, + { + "epoch": 0.3524261506303653, + "grad_norm": 2.5846160246925156, + "learning_rate": 3.6659567546324843e-06, + "loss": 0.9714, + "step": 3351 + }, + { + "epoch": 0.35253132107221263, + "grad_norm": 3.0980328608661667, + "learning_rate": 3.6652182224795108e-06, + "loss": 1.0372, + "step": 3352 + }, + { + "epoch": 0.35263649151405996, + "grad_norm": 2.2612646017522002, + "learning_rate": 3.6644795603956535e-06, + "loss": 0.978, + "step": 3353 + }, + { + "epoch": 0.3527416619559073, + "grad_norm": 2.549186265914262, + "learning_rate": 3.663740768463279e-06, + "loss": 1.0212, + "step": 3354 + }, + { + "epoch": 0.3528468323977546, + "grad_norm": 2.747876314122542, + "learning_rate": 3.663001846764769e-06, + "loss": 1.0214, + "step": 3355 + }, + { + "epoch": 0.35295200283960193, + "grad_norm": 2.235824760868568, + "learning_rate": 3.6622627953825187e-06, + "loss": 0.9917, + "step": 3356 + }, + { + "epoch": 0.35305717328144925, + "grad_norm": 2.4010747810798603, + "learning_rate": 3.661523614398938e-06, + "loss": 0.9649, + "step": 3357 + }, + { + "epoch": 0.3531623437232966, + "grad_norm": 2.828908496537335, + "learning_rate": 3.6607843038964515e-06, + "loss": 0.9981, + "step": 3358 + }, + { + "epoch": 0.3532675141651439, + "grad_norm": 2.555400473058865, + "learning_rate": 3.660044863957497e-06, + "loss": 1.0143, + "step": 3359 + }, + { + "epoch": 0.3533726846069912, + "grad_norm": 2.617313449912603, + "learning_rate": 3.6593052946645293e-06, + "loss": 1.0176, + "step": 3360 + }, + { + "epoch": 0.3534778550488385, + "grad_norm": 1.6229208590309265, + "learning_rate": 3.6585655961000144e-06, + "loss": 0.9889, + "step": 3361 + }, + { + "epoch": 0.3535830254906858, + "grad_norm": 2.6242669270631156, + "learning_rate": 
3.6578257683464363e-06, + "loss": 0.9977, + "step": 3362 + }, + { + "epoch": 0.35368819593253314, + "grad_norm": 2.730302244599849, + "learning_rate": 3.6570858114862905e-06, + "loss": 0.9535, + "step": 3363 + }, + { + "epoch": 0.35379336637438047, + "grad_norm": 2.5049239813451427, + "learning_rate": 3.656345725602089e-06, + "loss": 1.0363, + "step": 3364 + }, + { + "epoch": 0.3538985368162278, + "grad_norm": 2.3239114473104454, + "learning_rate": 3.6556055107763553e-06, + "loss": 1.003, + "step": 3365 + }, + { + "epoch": 0.3540037072580751, + "grad_norm": 3.122736859914737, + "learning_rate": 3.6548651670916302e-06, + "loss": 1.013, + "step": 3366 + }, + { + "epoch": 0.35410887769992244, + "grad_norm": 3.046846196486878, + "learning_rate": 3.654124694630468e-06, + "loss": 1.0142, + "step": 3367 + }, + { + "epoch": 0.35421404814176977, + "grad_norm": 2.636989697235664, + "learning_rate": 3.653384093475436e-06, + "loss": 1.0422, + "step": 3368 + }, + { + "epoch": 0.3543192185836171, + "grad_norm": 2.61843008906023, + "learning_rate": 3.6526433637091193e-06, + "loss": 1.037, + "step": 3369 + }, + { + "epoch": 0.3544243890254644, + "grad_norm": 1.915867272619924, + "learning_rate": 3.651902505414112e-06, + "loss": 0.9925, + "step": 3370 + }, + { + "epoch": 0.35452955946731174, + "grad_norm": 2.9271823410792797, + "learning_rate": 3.651161518673028e-06, + "loss": 1.0368, + "step": 3371 + }, + { + "epoch": 0.354634729909159, + "grad_norm": 2.409589901528896, + "learning_rate": 3.6504204035684915e-06, + "loss": 1.0037, + "step": 3372 + }, + { + "epoch": 0.35473990035100633, + "grad_norm": 2.1387568533332395, + "learning_rate": 3.649679160183144e-06, + "loss": 1.0186, + "step": 3373 + }, + { + "epoch": 0.35484507079285366, + "grad_norm": 2.4112770744675314, + "learning_rate": 3.6489377885996385e-06, + "loss": 1.0667, + "step": 3374 + }, + { + "epoch": 0.354950241234701, + "grad_norm": 2.4610952866669096, + "learning_rate": 3.648196288900644e-06, + "loss": 1.0212, + "step": 3375 + }, + { + "epoch": 0.3550554116765483, + "grad_norm": 3.23300202577758, + "learning_rate": 3.6474546611688446e-06, + "loss": 1.0027, + "step": 3376 + }, + { + "epoch": 0.35516058211839563, + "grad_norm": 2.121250895820615, + "learning_rate": 3.646712905486936e-06, + "loss": 1.0263, + "step": 3377 + }, + { + "epoch": 0.35526575256024295, + "grad_norm": 3.2579471158022195, + "learning_rate": 3.6459710219376317e-06, + "loss": 1.019, + "step": 3378 + }, + { + "epoch": 0.3553709230020903, + "grad_norm": 2.283154927593308, + "learning_rate": 3.645229010603655e-06, + "loss": 1.0052, + "step": 3379 + }, + { + "epoch": 0.3554760934439376, + "grad_norm": 2.844999630621272, + "learning_rate": 3.6444868715677475e-06, + "loss": 0.9872, + "step": 3380 + }, + { + "epoch": 0.3555812638857849, + "grad_norm": 2.110778090946952, + "learning_rate": 3.6437446049126636e-06, + "loss": 0.9785, + "step": 3381 + }, + { + "epoch": 0.3556864343276322, + "grad_norm": 3.104483713885235, + "learning_rate": 3.6430022107211705e-06, + "loss": 1.058, + "step": 3382 + }, + { + "epoch": 0.3557916047694795, + "grad_norm": 3.698736872503185, + "learning_rate": 3.642259689076052e-06, + "loss": 0.9923, + "step": 3383 + }, + { + "epoch": 0.35589677521132684, + "grad_norm": 2.6468434275616657, + "learning_rate": 3.641517040060105e-06, + "loss": 1.0115, + "step": 3384 + }, + { + "epoch": 0.35600194565317417, + "grad_norm": 3.429273915761346, + "learning_rate": 3.6407742637561407e-06, + "loss": 0.9963, + "step": 3385 + }, + { + "epoch": 0.3561071160950215, + 
"grad_norm": 2.2796812206143393, + "learning_rate": 3.6400313602469835e-06, + "loss": 0.9957, + "step": 3386 + }, + { + "epoch": 0.3562122865368688, + "grad_norm": 2.2684651957658435, + "learning_rate": 3.639288329615474e-06, + "loss": 0.9733, + "step": 3387 + }, + { + "epoch": 0.35631745697871614, + "grad_norm": 2.7538582807610092, + "learning_rate": 3.6385451719444655e-06, + "loss": 0.9873, + "step": 3388 + }, + { + "epoch": 0.35642262742056346, + "grad_norm": 2.1667916415130244, + "learning_rate": 3.6378018873168254e-06, + "loss": 0.9706, + "step": 3389 + }, + { + "epoch": 0.3565277978624108, + "grad_norm": 2.6101805735697092, + "learning_rate": 3.6370584758154366e-06, + "loss": 1.0247, + "step": 3390 + }, + { + "epoch": 0.3566329683042581, + "grad_norm": 2.6339328347622066, + "learning_rate": 3.6363149375231937e-06, + "loss": 1.0401, + "step": 3391 + }, + { + "epoch": 0.3567381387461054, + "grad_norm": 2.3753106980243843, + "learning_rate": 3.6355712725230093e-06, + "loss": 0.9831, + "step": 3392 + }, + { + "epoch": 0.3568433091879527, + "grad_norm": 4.0920704573771305, + "learning_rate": 3.6348274808978063e-06, + "loss": 1.0566, + "step": 3393 + }, + { + "epoch": 0.35694847962980003, + "grad_norm": 2.1861120277976256, + "learning_rate": 3.634083562730523e-06, + "loss": 1.0138, + "step": 3394 + }, + { + "epoch": 0.35705365007164735, + "grad_norm": 2.4463244703652367, + "learning_rate": 3.633339518104113e-06, + "loss": 0.9825, + "step": 3395 + }, + { + "epoch": 0.3571588205134947, + "grad_norm": 2.2656990738852016, + "learning_rate": 3.632595347101543e-06, + "loss": 1.0078, + "step": 3396 + }, + { + "epoch": 0.357263990955342, + "grad_norm": 2.8104237008735944, + "learning_rate": 3.631851049805793e-06, + "loss": 1.0039, + "step": 3397 + }, + { + "epoch": 0.3573691613971893, + "grad_norm": 1.6073481837060561, + "learning_rate": 3.6311066262998585e-06, + "loss": 0.9729, + "step": 3398 + }, + { + "epoch": 0.35747433183903665, + "grad_norm": 2.1246171429605836, + "learning_rate": 3.630362076666748e-06, + "loss": 0.9902, + "step": 3399 + }, + { + "epoch": 0.357579502280884, + "grad_norm": 2.0248681289481048, + "learning_rate": 3.6296174009894856e-06, + "loss": 1.0255, + "step": 3400 + }, + { + "epoch": 0.3576846727227313, + "grad_norm": 2.938773915640707, + "learning_rate": 3.628872599351108e-06, + "loss": 1.0085, + "step": 3401 + }, + { + "epoch": 0.35778984316457857, + "grad_norm": 2.8664540166582944, + "learning_rate": 3.628127671834665e-06, + "loss": 1.0408, + "step": 3402 + }, + { + "epoch": 0.3578950136064259, + "grad_norm": 1.626968288052293, + "learning_rate": 3.627382618523224e-06, + "loss": 0.9455, + "step": 3403 + }, + { + "epoch": 0.3580001840482732, + "grad_norm": 2.748931967001064, + "learning_rate": 3.626637439499864e-06, + "loss": 1.0007, + "step": 3404 + }, + { + "epoch": 0.35810535449012054, + "grad_norm": 2.474980683402302, + "learning_rate": 3.6258921348476754e-06, + "loss": 0.9864, + "step": 3405 + }, + { + "epoch": 0.35821052493196787, + "grad_norm": 2.7019560736927595, + "learning_rate": 3.625146704649769e-06, + "loss": 0.9493, + "step": 3406 + }, + { + "epoch": 0.3583156953738152, + "grad_norm": 2.901227686551035, + "learning_rate": 3.6244011489892645e-06, + "loss": 1.0309, + "step": 3407 + }, + { + "epoch": 0.3584208658156625, + "grad_norm": 2.4436082587640886, + "learning_rate": 3.623655467949297e-06, + "loss": 1.0221, + "step": 3408 + }, + { + "epoch": 0.35852603625750984, + "grad_norm": 3.134715394017839, + "learning_rate": 3.6229096616130154e-06, + "loss": 
0.9888, + "step": 3409 + }, + { + "epoch": 0.35863120669935716, + "grad_norm": 2.396056272037577, + "learning_rate": 3.6221637300635844e-06, + "loss": 1.056, + "step": 3410 + }, + { + "epoch": 0.3587363771412045, + "grad_norm": 2.0197687516513647, + "learning_rate": 3.6214176733841792e-06, + "loss": 0.9827, + "step": 3411 + }, + { + "epoch": 0.35884154758305176, + "grad_norm": 2.934753315186612, + "learning_rate": 3.6206714916579925e-06, + "loss": 0.9909, + "step": 3412 + }, + { + "epoch": 0.3589467180248991, + "grad_norm": 2.136669283224677, + "learning_rate": 3.619925184968229e-06, + "loss": 1.0157, + "step": 3413 + }, + { + "epoch": 0.3590518884667464, + "grad_norm": 2.5314749644598344, + "learning_rate": 3.6191787533981075e-06, + "loss": 1.0389, + "step": 3414 + }, + { + "epoch": 0.35915705890859373, + "grad_norm": 2.4095375951453004, + "learning_rate": 3.618432197030861e-06, + "loss": 1.0118, + "step": 3415 + }, + { + "epoch": 0.35926222935044105, + "grad_norm": 2.325532442349783, + "learning_rate": 3.617685515949736e-06, + "loss": 1.0057, + "step": 3416 + }, + { + "epoch": 0.3593673997922884, + "grad_norm": 2.4488118992083354, + "learning_rate": 3.6169387102379935e-06, + "loss": 1.0158, + "step": 3417 + }, + { + "epoch": 0.3594725702341357, + "grad_norm": 2.624600631200174, + "learning_rate": 3.6161917799789076e-06, + "loss": 1.0063, + "step": 3418 + }, + { + "epoch": 0.359577740675983, + "grad_norm": 2.643328190762798, + "learning_rate": 3.615444725255768e-06, + "loss": 0.9978, + "step": 3419 + }, + { + "epoch": 0.35968291111783035, + "grad_norm": 2.407743326176636, + "learning_rate": 3.6146975461518765e-06, + "loss": 0.981, + "step": 3420 + }, + { + "epoch": 0.3597880815596777, + "grad_norm": 2.311733214584104, + "learning_rate": 3.613950242750549e-06, + "loss": 0.9686, + "step": 3421 + }, + { + "epoch": 0.35989325200152494, + "grad_norm": 2.1921054999722123, + "learning_rate": 3.6132028151351163e-06, + "loss": 0.9877, + "step": 3422 + }, + { + "epoch": 0.35999842244337227, + "grad_norm": 1.9973124149153578, + "learning_rate": 3.6124552633889217e-06, + "loss": 0.9562, + "step": 3423 + }, + { + "epoch": 0.3601035928852196, + "grad_norm": 2.2987560515883567, + "learning_rate": 3.6117075875953233e-06, + "loss": 1.0459, + "step": 3424 + }, + { + "epoch": 0.3602087633270669, + "grad_norm": 3.20659343785868, + "learning_rate": 3.610959787837693e-06, + "loss": 1.0328, + "step": 3425 + }, + { + "epoch": 0.36031393376891424, + "grad_norm": 2.7744645057080355, + "learning_rate": 3.6102118641994166e-06, + "loss": 0.9981, + "step": 3426 + }, + { + "epoch": 0.36041910421076157, + "grad_norm": 1.4642559942958495, + "learning_rate": 3.6094638167638924e-06, + "loss": 0.9947, + "step": 3427 + }, + { + "epoch": 0.3605242746526089, + "grad_norm": 2.386402726019697, + "learning_rate": 3.608715645614534e-06, + "loss": 1.0195, + "step": 3428 + }, + { + "epoch": 0.3606294450944562, + "grad_norm": 2.6634628273386465, + "learning_rate": 3.607967350834769e-06, + "loss": 1.0473, + "step": 3429 + }, + { + "epoch": 0.36073461553630354, + "grad_norm": 3.495355937233298, + "learning_rate": 3.6072189325080364e-06, + "loss": 1.0102, + "step": 3430 + }, + { + "epoch": 0.36083978597815086, + "grad_norm": 1.9319582065248462, + "learning_rate": 3.6064703907177923e-06, + "loss": 0.9659, + "step": 3431 + }, + { + "epoch": 0.3609449564199982, + "grad_norm": 2.3818832688536973, + "learning_rate": 3.6057217255475034e-06, + "loss": 0.9997, + "step": 3432 + }, + { + "epoch": 0.36105012686184546, + "grad_norm": 
1.7493814880171161, + "learning_rate": 3.6049729370806534e-06, + "loss": 1.0365, + "step": 3433 + }, + { + "epoch": 0.3611552973036928, + "grad_norm": 2.8422167028984644, + "learning_rate": 3.6042240254007367e-06, + "loss": 0.9969, + "step": 3434 + }, + { + "epoch": 0.3612604677455401, + "grad_norm": 2.27792880127726, + "learning_rate": 3.6034749905912637e-06, + "loss": 1.0197, + "step": 3435 + }, + { + "epoch": 0.36136563818738743, + "grad_norm": 3.38034146624978, + "learning_rate": 3.6027258327357573e-06, + "loss": 1.0358, + "step": 3436 + }, + { + "epoch": 0.36147080862923475, + "grad_norm": 1.915172769306197, + "learning_rate": 3.6019765519177536e-06, + "loss": 1.0184, + "step": 3437 + }, + { + "epoch": 0.3615759790710821, + "grad_norm": 3.7012827312157732, + "learning_rate": 3.6012271482208043e-06, + "loss": 1.0315, + "step": 3438 + }, + { + "epoch": 0.3616811495129294, + "grad_norm": 2.8348305338817665, + "learning_rate": 3.600477621728473e-06, + "loss": 1.0226, + "step": 3439 + }, + { + "epoch": 0.3617863199547767, + "grad_norm": 2.226936028972617, + "learning_rate": 3.5997279725243382e-06, + "loss": 0.9864, + "step": 3440 + }, + { + "epoch": 0.36189149039662405, + "grad_norm": 2.8560998684935988, + "learning_rate": 3.5989782006919915e-06, + "loss": 1.0336, + "step": 3441 + }, + { + "epoch": 0.3619966608384714, + "grad_norm": 2.229618061905727, + "learning_rate": 3.5982283063150388e-06, + "loss": 0.9575, + "step": 3442 + }, + { + "epoch": 0.36210183128031864, + "grad_norm": 1.5501631952100456, + "learning_rate": 3.5974782894770983e-06, + "loss": 1.0172, + "step": 3443 + }, + { + "epoch": 0.36220700172216597, + "grad_norm": 2.803536569294912, + "learning_rate": 3.5967281502618035e-06, + "loss": 1.0584, + "step": 3444 + }, + { + "epoch": 0.3623121721640133, + "grad_norm": 2.1481647342373957, + "learning_rate": 3.5959778887527995e-06, + "loss": 1.0145, + "step": 3445 + }, + { + "epoch": 0.3624173426058606, + "grad_norm": 3.0517742861625896, + "learning_rate": 3.595227505033747e-06, + "loss": 1.0183, + "step": 3446 + }, + { + "epoch": 0.36252251304770794, + "grad_norm": 2.6479450779631217, + "learning_rate": 3.5944769991883197e-06, + "loss": 0.9737, + "step": 3447 + }, + { + "epoch": 0.36262768348955526, + "grad_norm": 2.5993294962929605, + "learning_rate": 3.5937263713002046e-06, + "loss": 1.0112, + "step": 3448 + }, + { + "epoch": 0.3627328539314026, + "grad_norm": 2.2975185450286353, + "learning_rate": 3.5929756214531035e-06, + "loss": 1.0724, + "step": 3449 + }, + { + "epoch": 0.3628380243732499, + "grad_norm": 3.128724604849356, + "learning_rate": 3.59222474973073e-06, + "loss": 1.0058, + "step": 3450 + }, + { + "epoch": 0.36294319481509724, + "grad_norm": 2.551432018630201, + "learning_rate": 3.5914737562168116e-06, + "loss": 1.0239, + "step": 3451 + }, + { + "epoch": 0.36304836525694456, + "grad_norm": 2.505173837642558, + "learning_rate": 3.590722640995091e-06, + "loss": 0.9775, + "step": 3452 + }, + { + "epoch": 0.36315353569879183, + "grad_norm": 2.5187594676149434, + "learning_rate": 3.589971404149323e-06, + "loss": 1.0362, + "step": 3453 + }, + { + "epoch": 0.36325870614063915, + "grad_norm": 2.239747984102128, + "learning_rate": 3.589220045763276e-06, + "loss": 1.0336, + "step": 3454 + }, + { + "epoch": 0.3633638765824865, + "grad_norm": 2.9744557369992184, + "learning_rate": 3.588468565920732e-06, + "loss": 0.9874, + "step": 3455 + }, + { + "epoch": 0.3634690470243338, + "grad_norm": 2.0034051449703902, + "learning_rate": 3.5877169647054875e-06, + "loss": 0.9759, + "step": 
3456 + }, + { + "epoch": 0.3635742174661811, + "grad_norm": 2.4849493412176065, + "learning_rate": 3.586965242201353e-06, + "loss": 1.0216, + "step": 3457 + }, + { + "epoch": 0.36367938790802845, + "grad_norm": 2.4957735320651775, + "learning_rate": 3.586213398492149e-06, + "loss": 1.0352, + "step": 3458 + }, + { + "epoch": 0.3637845583498758, + "grad_norm": 2.966574153623298, + "learning_rate": 3.585461433661714e-06, + "loss": 1.0324, + "step": 3459 + }, + { + "epoch": 0.3638897287917231, + "grad_norm": 2.0761534835735658, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.9981, + "step": 3460 + }, + { + "epoch": 0.3639948992335704, + "grad_norm": 2.400801379499366, + "learning_rate": 3.5839571409725593e-06, + "loss": 1.0048, + "step": 3461 + }, + { + "epoch": 0.36410006967541775, + "grad_norm": 2.871894855158759, + "learning_rate": 3.583204813281581e-06, + "loss": 1.0233, + "step": 3462 + }, + { + "epoch": 0.364205240117265, + "grad_norm": 2.7896234095062895, + "learning_rate": 3.582452364804852e-06, + "loss": 1.0171, + "step": 3463 + }, + { + "epoch": 0.36431041055911234, + "grad_norm": 2.1993954405877942, + "learning_rate": 3.581699795626275e-06, + "loss": 0.9991, + "step": 3464 + }, + { + "epoch": 0.36441558100095967, + "grad_norm": 2.054066351626218, + "learning_rate": 3.580947105829769e-06, + "loss": 1.0175, + "step": 3465 + }, + { + "epoch": 0.364520751442807, + "grad_norm": 3.0539518525223213, + "learning_rate": 3.580194295499263e-06, + "loss": 1.0232, + "step": 3466 + }, + { + "epoch": 0.3646259218846543, + "grad_norm": 3.168788256078409, + "learning_rate": 3.5794413647187008e-06, + "loss": 0.9885, + "step": 3467 + }, + { + "epoch": 0.36473109232650164, + "grad_norm": 1.775472584199268, + "learning_rate": 3.578688313572042e-06, + "loss": 0.9917, + "step": 3468 + }, + { + "epoch": 0.36483626276834896, + "grad_norm": 2.6600939684935625, + "learning_rate": 3.577935142143256e-06, + "loss": 0.9925, + "step": 3469 + }, + { + "epoch": 0.3649414332101963, + "grad_norm": 3.0526490958871295, + "learning_rate": 3.577181850516329e-06, + "loss": 1.029, + "step": 3470 + }, + { + "epoch": 0.3650466036520436, + "grad_norm": 2.5112352942954383, + "learning_rate": 3.576428438775257e-06, + "loss": 1.0038, + "step": 3471 + }, + { + "epoch": 0.36515177409389094, + "grad_norm": 2.4210733566582645, + "learning_rate": 3.575674907004052e-06, + "loss": 1.0058, + "step": 3472 + }, + { + "epoch": 0.3652569445357382, + "grad_norm": 2.513245854367, + "learning_rate": 3.5749212552867397e-06, + "loss": 1.0608, + "step": 3473 + }, + { + "epoch": 0.36536211497758553, + "grad_norm": 2.7039176655465473, + "learning_rate": 3.5741674837073563e-06, + "loss": 0.9913, + "step": 3474 + }, + { + "epoch": 0.36546728541943285, + "grad_norm": 3.0641705314770693, + "learning_rate": 3.5734135923499548e-06, + "loss": 1.0289, + "step": 3475 + }, + { + "epoch": 0.3655724558612802, + "grad_norm": 2.7553733431065064, + "learning_rate": 3.572659581298598e-06, + "loss": 0.9869, + "step": 3476 + }, + { + "epoch": 0.3656776263031275, + "grad_norm": 2.876816353464465, + "learning_rate": 3.571905450637366e-06, + "loss": 1.0163, + "step": 3477 + }, + { + "epoch": 0.3657827967449748, + "grad_norm": 2.7493296725215712, + "learning_rate": 3.571151200450349e-06, + "loss": 1.0349, + "step": 3478 + }, + { + "epoch": 0.36588796718682215, + "grad_norm": 2.7429718758626693, + "learning_rate": 3.5703968308216523e-06, + "loss": 0.9933, + "step": 3479 + }, + { + "epoch": 0.3659931376286695, + "grad_norm": 2.7233182700143024, + "learning_rate": 
3.569642341835394e-06, + "loss": 0.9801, + "step": 3480 + }, + { + "epoch": 0.3660983080705168, + "grad_norm": 2.898665070653661, + "learning_rate": 3.5688877335757055e-06, + "loss": 1.0327, + "step": 3481 + }, + { + "epoch": 0.3662034785123641, + "grad_norm": 3.3379320870265663, + "learning_rate": 3.5681330061267317e-06, + "loss": 1.0193, + "step": 3482 + }, + { + "epoch": 0.36630864895421145, + "grad_norm": 2.6749109290262223, + "learning_rate": 3.5673781595726286e-06, + "loss": 0.9996, + "step": 3483 + }, + { + "epoch": 0.3664138193960587, + "grad_norm": 2.316180353529294, + "learning_rate": 3.5666231939975702e-06, + "loss": 1.044, + "step": 3484 + }, + { + "epoch": 0.36651898983790604, + "grad_norm": 2.343425285023339, + "learning_rate": 3.5658681094857394e-06, + "loss": 0.9644, + "step": 3485 + }, + { + "epoch": 0.36662416027975336, + "grad_norm": 2.8549983640033703, + "learning_rate": 3.5651129061213345e-06, + "loss": 1.0054, + "step": 3486 + }, + { + "epoch": 0.3667293307216007, + "grad_norm": 2.8622892093311907, + "learning_rate": 3.5643575839885667e-06, + "loss": 1.0235, + "step": 3487 + }, + { + "epoch": 0.366834501163448, + "grad_norm": 2.9948754783989253, + "learning_rate": 3.5636021431716604e-06, + "loss": 1.0314, + "step": 3488 + }, + { + "epoch": 0.36693967160529534, + "grad_norm": 2.1124215741074, + "learning_rate": 3.5628465837548526e-06, + "loss": 1.0036, + "step": 3489 + }, + { + "epoch": 0.36704484204714266, + "grad_norm": 2.985214300844437, + "learning_rate": 3.5620909058223933e-06, + "loss": 1.0203, + "step": 3490 + }, + { + "epoch": 0.36715001248899, + "grad_norm": 3.111175604817724, + "learning_rate": 3.5613351094585484e-06, + "loss": 1.0222, + "step": 3491 + }, + { + "epoch": 0.3672551829308373, + "grad_norm": 2.3456559320452195, + "learning_rate": 3.5605791947475934e-06, + "loss": 0.9925, + "step": 3492 + }, + { + "epoch": 0.36736035337268463, + "grad_norm": 2.1181973524224675, + "learning_rate": 3.55982316177382e-06, + "loss": 0.992, + "step": 3493 + }, + { + "epoch": 0.3674655238145319, + "grad_norm": 3.0413234842482844, + "learning_rate": 3.5590670106215307e-06, + "loss": 1.0221, + "step": 3494 + }, + { + "epoch": 0.3675706942563792, + "grad_norm": 2.564889170052048, + "learning_rate": 3.5583107413750427e-06, + "loss": 1.0259, + "step": 3495 + }, + { + "epoch": 0.36767586469822655, + "grad_norm": 2.626728709218182, + "learning_rate": 3.5575543541186853e-06, + "loss": 1.0114, + "step": 3496 + }, + { + "epoch": 0.3677810351400739, + "grad_norm": 2.1294990688555417, + "learning_rate": 3.5567978489368026e-06, + "loss": 1.01, + "step": 3497 + }, + { + "epoch": 0.3678862055819212, + "grad_norm": 2.502867155421716, + "learning_rate": 3.55604122591375e-06, + "loss": 1.0494, + "step": 3498 + }, + { + "epoch": 0.3679913760237685, + "grad_norm": 3.537928483150429, + "learning_rate": 3.5552844851338973e-06, + "loss": 1.0098, + "step": 3499 + }, + { + "epoch": 0.36809654646561585, + "grad_norm": 2.1233082823635665, + "learning_rate": 3.5545276266816265e-06, + "loss": 1.0037, + "step": 3500 + }, + { + "epoch": 0.3682017169074632, + "grad_norm": 2.065621137457995, + "learning_rate": 3.5537706506413338e-06, + "loss": 1.0426, + "step": 3501 + }, + { + "epoch": 0.3683068873493105, + "grad_norm": 2.382029858431779, + "learning_rate": 3.553013557097428e-06, + "loss": 1.0496, + "step": 3502 + }, + { + "epoch": 0.3684120577911578, + "grad_norm": 1.6297673370173709, + "learning_rate": 3.552256346134329e-06, + "loss": 1.012, + "step": 3503 + }, + { + "epoch": 0.3685172282330051, + 
"grad_norm": 2.575845849912579, + "learning_rate": 3.5514990178364746e-06, + "loss": 0.9622, + "step": 3504 + }, + { + "epoch": 0.3686223986748524, + "grad_norm": 1.8413351349315177, + "learning_rate": 3.550741572288311e-06, + "loss": 1.0271, + "step": 3505 + }, + { + "epoch": 0.36872756911669974, + "grad_norm": 2.2981599308221745, + "learning_rate": 3.5499840095742987e-06, + "loss": 1.0653, + "step": 3506 + }, + { + "epoch": 0.36883273955854706, + "grad_norm": 2.770027195890931, + "learning_rate": 3.549226329778914e-06, + "loss": 1.0416, + "step": 3507 + }, + { + "epoch": 0.3689379100003944, + "grad_norm": 2.172783977718339, + "learning_rate": 3.5484685329866424e-06, + "loss": 1.0382, + "step": 3508 + }, + { + "epoch": 0.3690430804422417, + "grad_norm": 2.7336060159802953, + "learning_rate": 3.547710619281985e-06, + "loss": 1.0135, + "step": 3509 + }, + { + "epoch": 0.36914825088408904, + "grad_norm": 3.5054838300829463, + "learning_rate": 3.546952588749454e-06, + "loss": 0.9847, + "step": 3510 + }, + { + "epoch": 0.36925342132593636, + "grad_norm": 3.127625095566459, + "learning_rate": 3.5461944414735766e-06, + "loss": 0.9888, + "step": 3511 + }, + { + "epoch": 0.3693585917677837, + "grad_norm": 2.4488246566949448, + "learning_rate": 3.545436177538892e-06, + "loss": 0.9815, + "step": 3512 + }, + { + "epoch": 0.369463762209631, + "grad_norm": 3.6498915209813, + "learning_rate": 3.544677797029952e-06, + "loss": 0.998, + "step": 3513 + }, + { + "epoch": 0.3695689326514783, + "grad_norm": 2.8541317738893306, + "learning_rate": 3.5439193000313226e-06, + "loss": 1.0012, + "step": 3514 + }, + { + "epoch": 0.3696741030933256, + "grad_norm": 2.912304761031327, + "learning_rate": 3.5431606866275812e-06, + "loss": 0.984, + "step": 3515 + }, + { + "epoch": 0.3697792735351729, + "grad_norm": 2.2203764999621467, + "learning_rate": 3.542401956903321e-06, + "loss": 1.0176, + "step": 3516 + }, + { + "epoch": 0.36988444397702025, + "grad_norm": 2.439171891410255, + "learning_rate": 3.5416431109431437e-06, + "loss": 1.0556, + "step": 3517 + }, + { + "epoch": 0.3699896144188676, + "grad_norm": 2.797287931857089, + "learning_rate": 3.540884148831668e-06, + "loss": 1.0811, + "step": 3518 + }, + { + "epoch": 0.3700947848607149, + "grad_norm": 2.6586762875266614, + "learning_rate": 3.540125070653524e-06, + "loss": 1.0006, + "step": 3519 + }, + { + "epoch": 0.3701999553025622, + "grad_norm": 2.9224263695935817, + "learning_rate": 3.5393658764933546e-06, + "loss": 1.0327, + "step": 3520 + }, + { + "epoch": 0.37030512574440955, + "grad_norm": 2.7522305202993405, + "learning_rate": 3.5386065664358164e-06, + "loss": 1.0175, + "step": 3521 + }, + { + "epoch": 0.37041029618625687, + "grad_norm": 2.195682252666581, + "learning_rate": 3.5378471405655768e-06, + "loss": 0.958, + "step": 3522 + }, + { + "epoch": 0.3705154666281042, + "grad_norm": 2.4183425267496985, + "learning_rate": 3.5370875989673198e-06, + "loss": 1.0164, + "step": 3523 + }, + { + "epoch": 0.37062063706995146, + "grad_norm": 2.315338386367098, + "learning_rate": 3.536327941725739e-06, + "loss": 1.0022, + "step": 3524 + }, + { + "epoch": 0.3707258075117988, + "grad_norm": 2.3589678245581815, + "learning_rate": 3.5355681689255417e-06, + "loss": 0.9943, + "step": 3525 + }, + { + "epoch": 0.3708309779536461, + "grad_norm": 2.575779344588469, + "learning_rate": 3.5348082806514496e-06, + "loss": 1.0326, + "step": 3526 + }, + { + "epoch": 0.37093614839549344, + "grad_norm": 2.172676292007147, + "learning_rate": 3.534048276988195e-06, + "loss": 0.9872, + 
"step": 3527 + }, + { + "epoch": 0.37104131883734076, + "grad_norm": 2.676488772279958, + "learning_rate": 3.5332881580205246e-06, + "loss": 0.9993, + "step": 3528 + }, + { + "epoch": 0.3711464892791881, + "grad_norm": 2.137926314108314, + "learning_rate": 3.5325279238331977e-06, + "loss": 1.0049, + "step": 3529 + }, + { + "epoch": 0.3712516597210354, + "grad_norm": 2.7103603439452075, + "learning_rate": 3.531767574510987e-06, + "loss": 0.957, + "step": 3530 + }, + { + "epoch": 0.37135683016288273, + "grad_norm": 2.558627297098295, + "learning_rate": 3.5310071101386766e-06, + "loss": 1.0162, + "step": 3531 + }, + { + "epoch": 0.37146200060473006, + "grad_norm": 2.610302442944242, + "learning_rate": 3.530246530801064e-06, + "loss": 0.9467, + "step": 3532 + }, + { + "epoch": 0.3715671710465774, + "grad_norm": 2.5992181909547543, + "learning_rate": 3.52948583658296e-06, + "loss": 0.997, + "step": 3533 + }, + { + "epoch": 0.37167234148842465, + "grad_norm": 2.943011406158968, + "learning_rate": 3.528725027569188e-06, + "loss": 1.0061, + "step": 3534 + }, + { + "epoch": 0.371777511930272, + "grad_norm": 2.0335971995059103, + "learning_rate": 3.527964103844583e-06, + "loss": 1.0035, + "step": 3535 + }, + { + "epoch": 0.3718826823721193, + "grad_norm": 2.290860416398897, + "learning_rate": 3.5272030654939967e-06, + "loss": 1.0089, + "step": 3536 + }, + { + "epoch": 0.3719878528139666, + "grad_norm": 3.705983857104791, + "learning_rate": 3.526441912602288e-06, + "loss": 1.0453, + "step": 3537 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 2.910250579235628, + "learning_rate": 3.525680645254333e-06, + "loss": 1.0259, + "step": 3538 + }, + { + "epoch": 0.3721981936976613, + "grad_norm": 2.7958380455087366, + "learning_rate": 3.5249192635350178e-06, + "loss": 1.0077, + "step": 3539 + }, + { + "epoch": 0.3723033641395086, + "grad_norm": 2.415766053951578, + "learning_rate": 3.5241577675292426e-06, + "loss": 0.9892, + "step": 3540 + }, + { + "epoch": 0.3724085345813559, + "grad_norm": 3.1343561189937725, + "learning_rate": 3.5233961573219203e-06, + "loss": 1.0475, + "step": 3541 + }, + { + "epoch": 0.37251370502320325, + "grad_norm": 2.476762825422412, + "learning_rate": 3.5226344329979767e-06, + "loss": 1.01, + "step": 3542 + }, + { + "epoch": 0.37261887546505057, + "grad_norm": 2.4464917162110815, + "learning_rate": 3.52187259464235e-06, + "loss": 1.0814, + "step": 3543 + }, + { + "epoch": 0.3727240459068979, + "grad_norm": 2.54651260509731, + "learning_rate": 3.521110642339991e-06, + "loss": 1.0308, + "step": 3544 + }, + { + "epoch": 0.37282921634874516, + "grad_norm": 2.757074308896569, + "learning_rate": 3.5203485761758627e-06, + "loss": 0.9986, + "step": 3545 + }, + { + "epoch": 0.3729343867905925, + "grad_norm": 2.2476503257651643, + "learning_rate": 3.519586396234942e-06, + "loss": 1.045, + "step": 3546 + }, + { + "epoch": 0.3730395572324398, + "grad_norm": 3.4160260399141134, + "learning_rate": 3.518824102602217e-06, + "loss": 0.9736, + "step": 3547 + }, + { + "epoch": 0.37314472767428714, + "grad_norm": 2.3235809456017242, + "learning_rate": 3.5180616953626905e-06, + "loss": 1.026, + "step": 3548 + }, + { + "epoch": 0.37324989811613446, + "grad_norm": 3.1866772084593977, + "learning_rate": 3.5172991746013764e-06, + "loss": 1.0147, + "step": 3549 + }, + { + "epoch": 0.3733550685579818, + "grad_norm": 1.7405784529022383, + "learning_rate": 3.516536540403302e-06, + "loss": 0.9932, + "step": 3550 + }, + { + "epoch": 0.3734602389998291, + "grad_norm": 2.4307045279664665, + 
"learning_rate": 3.5157737928535065e-06, + "loss": 1.0486, + "step": 3551 + }, + { + "epoch": 0.37356540944167643, + "grad_norm": 2.4827334628389925, + "learning_rate": 3.5150109320370425e-06, + "loss": 1.0143, + "step": 3552 + }, + { + "epoch": 0.37367057988352376, + "grad_norm": 2.17954336422211, + "learning_rate": 3.5142479580389744e-06, + "loss": 0.9822, + "step": 3553 + }, + { + "epoch": 0.3737757503253711, + "grad_norm": 2.009056281408665, + "learning_rate": 3.5134848709443793e-06, + "loss": 1.0011, + "step": 3554 + }, + { + "epoch": 0.37388092076721835, + "grad_norm": 2.70716892667249, + "learning_rate": 3.5127216708383495e-06, + "loss": 1.0292, + "step": 3555 + }, + { + "epoch": 0.3739860912090657, + "grad_norm": 1.9705536861580673, + "learning_rate": 3.5119583578059845e-06, + "loss": 1.005, + "step": 3556 + }, + { + "epoch": 0.374091261650913, + "grad_norm": 2.6162266436594415, + "learning_rate": 3.5111949319324027e-06, + "loss": 1.0156, + "step": 3557 + }, + { + "epoch": 0.3741964320927603, + "grad_norm": 2.360521949456891, + "learning_rate": 3.51043139330273e-06, + "loss": 0.9847, + "step": 3558 + }, + { + "epoch": 0.37430160253460765, + "grad_norm": 3.006787570352432, + "learning_rate": 3.5096677420021092e-06, + "loss": 1.0322, + "step": 3559 + }, + { + "epoch": 0.37440677297645497, + "grad_norm": 2.2412182060746964, + "learning_rate": 3.508903978115691e-06, + "loss": 0.986, + "step": 3560 + }, + { + "epoch": 0.3745119434183023, + "grad_norm": 2.3336931601024395, + "learning_rate": 3.508140101728641e-06, + "loss": 1.0246, + "step": 3561 + }, + { + "epoch": 0.3746171138601496, + "grad_norm": 2.562229225549696, + "learning_rate": 3.5073761129261393e-06, + "loss": 0.9975, + "step": 3562 + }, + { + "epoch": 0.37472228430199694, + "grad_norm": 3.1758702644776156, + "learning_rate": 3.5066120117933743e-06, + "loss": 1.02, + "step": 3563 + }, + { + "epoch": 0.37482745474384427, + "grad_norm": 2.228109854729782, + "learning_rate": 3.505847798415551e-06, + "loss": 1.0244, + "step": 3564 + }, + { + "epoch": 0.37493262518569154, + "grad_norm": 2.355915255145524, + "learning_rate": 3.505083472877884e-06, + "loss": 1.0427, + "step": 3565 + }, + { + "epoch": 0.37503779562753886, + "grad_norm": 2.71234930662769, + "learning_rate": 3.5043190352656026e-06, + "loss": 0.9927, + "step": 3566 + }, + { + "epoch": 0.3751429660693862, + "grad_norm": 3.125287938693249, + "learning_rate": 3.503554485663947e-06, + "loss": 0.9746, + "step": 3567 + }, + { + "epoch": 0.3752481365112335, + "grad_norm": 2.832305455594945, + "learning_rate": 3.50278982415817e-06, + "loss": 0.9868, + "step": 3568 + }, + { + "epoch": 0.37535330695308083, + "grad_norm": 2.16624778915707, + "learning_rate": 3.502025050833538e-06, + "loss": 0.9823, + "step": 3569 + }, + { + "epoch": 0.37545847739492816, + "grad_norm": 2.6723466436176517, + "learning_rate": 3.5012601657753283e-06, + "loss": 0.9912, + "step": 3570 + }, + { + "epoch": 0.3755636478367755, + "grad_norm": 2.894125932929896, + "learning_rate": 3.500495169068832e-06, + "loss": 0.9625, + "step": 3571 + }, + { + "epoch": 0.3756688182786228, + "grad_norm": 2.349145898282107, + "learning_rate": 3.499730060799352e-06, + "loss": 0.9749, + "step": 3572 + }, + { + "epoch": 0.37577398872047013, + "grad_norm": 2.2715747784546636, + "learning_rate": 3.4989648410522053e-06, + "loss": 1.0067, + "step": 3573 + }, + { + "epoch": 0.37587915916231746, + "grad_norm": 2.7664541979733435, + "learning_rate": 3.498199509912718e-06, + "loss": 1.0479, + "step": 3574 + }, + { + "epoch": 
0.3759843296041647, + "grad_norm": 2.120229567329133, + "learning_rate": 3.497434067466231e-06, + "loss": 1.0106, + "step": 3575 + }, + { + "epoch": 0.37608950004601205, + "grad_norm": 2.1692031969688608, + "learning_rate": 3.4966685137980972e-06, + "loss": 0.9755, + "step": 3576 + }, + { + "epoch": 0.3761946704878594, + "grad_norm": 2.304539050607103, + "learning_rate": 3.4959028489936815e-06, + "loss": 1.0251, + "step": 3577 + }, + { + "epoch": 0.3762998409297067, + "grad_norm": 3.3999675880797784, + "learning_rate": 3.4951370731383615e-06, + "loss": 0.9857, + "step": 3578 + }, + { + "epoch": 0.376405011371554, + "grad_norm": 2.254087700165053, + "learning_rate": 3.494371186317528e-06, + "loss": 0.9828, + "step": 3579 + }, + { + "epoch": 0.37651018181340135, + "grad_norm": 2.8458404135362048, + "learning_rate": 3.4936051886165825e-06, + "loss": 1.0431, + "step": 3580 + }, + { + "epoch": 0.37661535225524867, + "grad_norm": 2.4716575338002214, + "learning_rate": 3.4928390801209395e-06, + "loss": 0.9775, + "step": 3581 + }, + { + "epoch": 0.376720522697096, + "grad_norm": 2.6453228157698714, + "learning_rate": 3.492072860916027e-06, + "loss": 1.0048, + "step": 3582 + }, + { + "epoch": 0.3768256931389433, + "grad_norm": 2.7675242977400454, + "learning_rate": 3.4913065310872834e-06, + "loss": 1.0323, + "step": 3583 + }, + { + "epoch": 0.37693086358079064, + "grad_norm": 2.204262494467231, + "learning_rate": 3.4905400907201604e-06, + "loss": 1.0081, + "step": 3584 + }, + { + "epoch": 0.3770360340226379, + "grad_norm": 2.9513125690747883, + "learning_rate": 3.489773539900123e-06, + "loss": 0.9778, + "step": 3585 + }, + { + "epoch": 0.37714120446448524, + "grad_norm": 2.363516897078722, + "learning_rate": 3.4890068787126475e-06, + "loss": 1.0189, + "step": 3586 + }, + { + "epoch": 0.37724637490633256, + "grad_norm": 3.122013375469763, + "learning_rate": 3.4882401072432215e-06, + "loss": 1.0105, + "step": 3587 + }, + { + "epoch": 0.3773515453481799, + "grad_norm": 2.36989045728494, + "learning_rate": 3.487473225577347e-06, + "loss": 0.9898, + "step": 3588 + }, + { + "epoch": 0.3774567157900272, + "grad_norm": 2.9849863757199784, + "learning_rate": 3.486706233800537e-06, + "loss": 1.0172, + "step": 3589 + }, + { + "epoch": 0.37756188623187453, + "grad_norm": 2.6450922737910116, + "learning_rate": 3.4859391319983165e-06, + "loss": 1.0169, + "step": 3590 + }, + { + "epoch": 0.37766705667372186, + "grad_norm": 2.521241818346483, + "learning_rate": 3.485171920256224e-06, + "loss": 1.0099, + "step": 3591 + }, + { + "epoch": 0.3777722271155692, + "grad_norm": 3.926184947651807, + "learning_rate": 3.484404598659809e-06, + "loss": 1.026, + "step": 3592 + }, + { + "epoch": 0.3778773975574165, + "grad_norm": 2.5873477697553944, + "learning_rate": 3.483637167294634e-06, + "loss": 0.9857, + "step": 3593 + }, + { + "epoch": 0.37798256799926383, + "grad_norm": 2.890617758160231, + "learning_rate": 3.4828696262462743e-06, + "loss": 1.0289, + "step": 3594 + }, + { + "epoch": 0.3780877384411111, + "grad_norm": 2.7950755400661755, + "learning_rate": 3.482101975600316e-06, + "loss": 0.9954, + "step": 3595 + }, + { + "epoch": 0.3781929088829584, + "grad_norm": 1.9102337369098177, + "learning_rate": 3.481334215442358e-06, + "loss": 0.9304, + "step": 3596 + }, + { + "epoch": 0.37829807932480575, + "grad_norm": 3.195058034843074, + "learning_rate": 3.4805663458580113e-06, + "loss": 1.0091, + "step": 3597 + }, + { + "epoch": 0.37840324976665307, + "grad_norm": 2.1115956503169757, + "learning_rate": 
3.4797983669329004e-06, + "loss": 1.0348, + "step": 3598 + }, + { + "epoch": 0.3785084202085004, + "grad_norm": 2.3416744434406813, + "learning_rate": 3.47903027875266e-06, + "loss": 0.9704, + "step": 3599 + }, + { + "epoch": 0.3786135906503477, + "grad_norm": 2.363772195609984, + "learning_rate": 3.4782620814029376e-06, + "loss": 0.9948, + "step": 3600 + }, + { + "epoch": 0.37871876109219504, + "grad_norm": 1.9628721511439282, + "learning_rate": 3.4774937749693947e-06, + "loss": 1.0062, + "step": 3601 + }, + { + "epoch": 0.37882393153404237, + "grad_norm": 2.426600466574066, + "learning_rate": 3.4767253595377015e-06, + "loss": 0.9645, + "step": 3602 + }, + { + "epoch": 0.3789291019758897, + "grad_norm": 2.6977870839090645, + "learning_rate": 3.4759568351935446e-06, + "loss": 1.0291, + "step": 3603 + }, + { + "epoch": 0.379034272417737, + "grad_norm": 2.676605696492157, + "learning_rate": 3.4751882020226174e-06, + "loss": 1.0354, + "step": 3604 + }, + { + "epoch": 0.37913944285958434, + "grad_norm": 3.363753270722036, + "learning_rate": 3.4744194601106314e-06, + "loss": 0.9662, + "step": 3605 + }, + { + "epoch": 0.3792446133014316, + "grad_norm": 2.063579478740677, + "learning_rate": 3.4736506095433053e-06, + "loss": 1.0097, + "step": 3606 + }, + { + "epoch": 0.37934978374327893, + "grad_norm": 1.9293491925383035, + "learning_rate": 3.472881650406373e-06, + "loss": 0.9955, + "step": 3607 + }, + { + "epoch": 0.37945495418512626, + "grad_norm": 2.0019157816680124, + "learning_rate": 3.472112582785579e-06, + "loss": 0.958, + "step": 3608 + }, + { + "epoch": 0.3795601246269736, + "grad_norm": 1.9340739804799547, + "learning_rate": 3.4713434067666803e-06, + "loss": 0.9936, + "step": 3609 + }, + { + "epoch": 0.3796652950688209, + "grad_norm": 2.621910541531949, + "learning_rate": 3.4705741224354463e-06, + "loss": 1.0269, + "step": 3610 + }, + { + "epoch": 0.37977046551066823, + "grad_norm": 2.319507440485628, + "learning_rate": 3.4698047298776578e-06, + "loss": 0.9785, + "step": 3611 + }, + { + "epoch": 0.37987563595251556, + "grad_norm": 3.5152668337700885, + "learning_rate": 3.4690352291791084e-06, + "loss": 1.0375, + "step": 3612 + }, + { + "epoch": 0.3799808063943629, + "grad_norm": 2.56130563777068, + "learning_rate": 3.4682656204256033e-06, + "loss": 0.9976, + "step": 3613 + }, + { + "epoch": 0.3800859768362102, + "grad_norm": 3.0184814971313294, + "learning_rate": 3.4674959037029593e-06, + "loss": 1.028, + "step": 3614 + }, + { + "epoch": 0.38019114727805753, + "grad_norm": 2.846303670335661, + "learning_rate": 3.4667260790970065e-06, + "loss": 0.9842, + "step": 3615 + }, + { + "epoch": 0.3802963177199048, + "grad_norm": 2.4611491100975558, + "learning_rate": 3.465956146693586e-06, + "loss": 0.9622, + "step": 3616 + }, + { + "epoch": 0.3804014881617521, + "grad_norm": 2.615213958589618, + "learning_rate": 3.4651861065785515e-06, + "loss": 1.0227, + "step": 3617 + }, + { + "epoch": 0.38050665860359945, + "grad_norm": 2.541915515993435, + "learning_rate": 3.464415958837768e-06, + "loss": 1.0021, + "step": 3618 + }, + { + "epoch": 0.38061182904544677, + "grad_norm": 2.3551759643491184, + "learning_rate": 3.4636457035571135e-06, + "loss": 0.9849, + "step": 3619 + }, + { + "epoch": 0.3807169994872941, + "grad_norm": 3.3489736874070846, + "learning_rate": 3.462875340822477e-06, + "loss": 0.9757, + "step": 3620 + }, + { + "epoch": 0.3808221699291414, + "grad_norm": 3.2681188585536973, + "learning_rate": 3.4621048707197607e-06, + "loss": 1.0341, + "step": 3621 + }, + { + "epoch": 
0.38092734037098874, + "grad_norm": 3.4147560699832744, + "learning_rate": 3.461334293334877e-06, + "loss": 1.0442, + "step": 3622 + }, + { + "epoch": 0.38103251081283607, + "grad_norm": 2.2145079608120346, + "learning_rate": 3.4605636087537514e-06, + "loss": 0.998, + "step": 3623 + }, + { + "epoch": 0.3811376812546834, + "grad_norm": 1.8033358102527874, + "learning_rate": 3.4597928170623217e-06, + "loss": 1.0069, + "step": 3624 + }, + { + "epoch": 0.3812428516965307, + "grad_norm": 2.7695898589555465, + "learning_rate": 3.459021918346537e-06, + "loss": 1.018, + "step": 3625 + }, + { + "epoch": 0.381348022138378, + "grad_norm": 3.306734754623489, + "learning_rate": 3.458250912692359e-06, + "loss": 1.006, + "step": 3626 + }, + { + "epoch": 0.3814531925802253, + "grad_norm": 2.986143628061839, + "learning_rate": 3.4574798001857598e-06, + "loss": 1.0543, + "step": 3627 + }, + { + "epoch": 0.38155836302207263, + "grad_norm": 2.3585732168057922, + "learning_rate": 3.4567085809127247e-06, + "loss": 1.0696, + "step": 3628 + }, + { + "epoch": 0.38166353346391996, + "grad_norm": 2.876800068207621, + "learning_rate": 3.4559372549592513e-06, + "loss": 1.0084, + "step": 3629 + }, + { + "epoch": 0.3817687039057673, + "grad_norm": 2.7444522756355534, + "learning_rate": 3.455165822411347e-06, + "loss": 1.0238, + "step": 3630 + }, + { + "epoch": 0.3818738743476146, + "grad_norm": 2.73044610583874, + "learning_rate": 3.4543942833550347e-06, + "loss": 0.982, + "step": 3631 + }, + { + "epoch": 0.38197904478946193, + "grad_norm": 3.4323322120847712, + "learning_rate": 3.453622637876346e-06, + "loss": 1.0404, + "step": 3632 + }, + { + "epoch": 0.38208421523130925, + "grad_norm": 3.64343859536563, + "learning_rate": 3.4528508860613243e-06, + "loss": 1.0047, + "step": 3633 + }, + { + "epoch": 0.3821893856731566, + "grad_norm": 2.3123411149550885, + "learning_rate": 3.452079027996027e-06, + "loss": 0.9852, + "step": 3634 + }, + { + "epoch": 0.3822945561150039, + "grad_norm": 2.2206426400316, + "learning_rate": 3.451307063766522e-06, + "loss": 1.0439, + "step": 3635 + }, + { + "epoch": 0.38239972655685117, + "grad_norm": 2.0863520447051855, + "learning_rate": 3.45053499345889e-06, + "loss": 0.9948, + "step": 3636 + }, + { + "epoch": 0.3825048969986985, + "grad_norm": 1.7414750885086039, + "learning_rate": 3.449762817159221e-06, + "loss": 1.0387, + "step": 3637 + }, + { + "epoch": 0.3826100674405458, + "grad_norm": 2.8068429720399295, + "learning_rate": 3.4489905349536207e-06, + "loss": 1.0214, + "step": 3638 + }, + { + "epoch": 0.38271523788239314, + "grad_norm": 2.3872404074955216, + "learning_rate": 3.448218146928204e-06, + "loss": 1.0203, + "step": 3639 + }, + { + "epoch": 0.38282040832424047, + "grad_norm": 3.3031263883141846, + "learning_rate": 3.4474456531690976e-06, + "loss": 1.0072, + "step": 3640 + }, + { + "epoch": 0.3829255787660878, + "grad_norm": 2.6702786020242044, + "learning_rate": 3.4466730537624406e-06, + "loss": 1.0556, + "step": 3641 + }, + { + "epoch": 0.3830307492079351, + "grad_norm": 2.2459257036074343, + "learning_rate": 3.4459003487943842e-06, + "loss": 0.9824, + "step": 3642 + }, + { + "epoch": 0.38313591964978244, + "grad_norm": 2.809514281952401, + "learning_rate": 3.4451275383510905e-06, + "loss": 1.0374, + "step": 3643 + }, + { + "epoch": 0.38324109009162977, + "grad_norm": 2.6516762564393455, + "learning_rate": 3.444354622518735e-06, + "loss": 0.9732, + "step": 3644 + }, + { + "epoch": 0.3833462605334771, + "grad_norm": 2.7733996194687043, + "learning_rate": 3.443581601383503e-06, + 
"loss": 1.007, + "step": 3645 + }, + { + "epoch": 0.38345143097532436, + "grad_norm": 2.7799328910992718, + "learning_rate": 3.4428084750315925e-06, + "loss": 1.0295, + "step": 3646 + }, + { + "epoch": 0.3835566014171717, + "grad_norm": 2.195837771998775, + "learning_rate": 3.442035243549213e-06, + "loss": 1.0024, + "step": 3647 + }, + { + "epoch": 0.383661771859019, + "grad_norm": 1.9838499795852829, + "learning_rate": 3.441261907022585e-06, + "loss": 0.9941, + "step": 3648 + }, + { + "epoch": 0.38376694230086633, + "grad_norm": 2.8357895335106758, + "learning_rate": 3.4404884655379433e-06, + "loss": 1.0466, + "step": 3649 + }, + { + "epoch": 0.38387211274271366, + "grad_norm": 3.490737509449956, + "learning_rate": 3.4397149191815317e-06, + "loss": 0.9822, + "step": 3650 + }, + { + "epoch": 0.383977283184561, + "grad_norm": 3.0020658384708194, + "learning_rate": 3.438941268039606e-06, + "loss": 1.0228, + "step": 3651 + }, + { + "epoch": 0.3840824536264083, + "grad_norm": 2.580567442677589, + "learning_rate": 3.438167512198436e-06, + "loss": 0.9814, + "step": 3652 + }, + { + "epoch": 0.38418762406825563, + "grad_norm": 2.6240587564325732, + "learning_rate": 3.4373936517442996e-06, + "loss": 0.9582, + "step": 3653 + }, + { + "epoch": 0.38429279451010295, + "grad_norm": 2.80704824521018, + "learning_rate": 3.43661968676349e-06, + "loss": 1.0803, + "step": 3654 + }, + { + "epoch": 0.3843979649519503, + "grad_norm": 2.019265289979809, + "learning_rate": 3.4358456173423084e-06, + "loss": 0.9665, + "step": 3655 + }, + { + "epoch": 0.38450313539379755, + "grad_norm": 2.130869954578095, + "learning_rate": 3.4350714435670706e-06, + "loss": 0.9747, + "step": 3656 + }, + { + "epoch": 0.38460830583564487, + "grad_norm": 2.6188509617557267, + "learning_rate": 3.434297165524103e-06, + "loss": 0.9966, + "step": 3657 + }, + { + "epoch": 0.3847134762774922, + "grad_norm": 3.1899105918775748, + "learning_rate": 3.433522783299744e-06, + "loss": 1.0068, + "step": 3658 + }, + { + "epoch": 0.3848186467193395, + "grad_norm": 2.6244822778152224, + "learning_rate": 3.432748296980343e-06, + "loss": 0.998, + "step": 3659 + }, + { + "epoch": 0.38492381716118684, + "grad_norm": 2.1576133938956885, + "learning_rate": 3.4319737066522603e-06, + "loss": 1.0098, + "step": 3660 + }, + { + "epoch": 0.38502898760303417, + "grad_norm": 2.218275955657075, + "learning_rate": 3.43119901240187e-06, + "loss": 0.9752, + "step": 3661 + }, + { + "epoch": 0.3851341580448815, + "grad_norm": 3.06330157636448, + "learning_rate": 3.430424214315556e-06, + "loss": 0.9907, + "step": 3662 + }, + { + "epoch": 0.3852393284867288, + "grad_norm": 2.559143144501293, + "learning_rate": 3.429649312479714e-06, + "loss": 1.0291, + "step": 3663 + }, + { + "epoch": 0.38534449892857614, + "grad_norm": 2.7644506663996316, + "learning_rate": 3.4288743069807516e-06, + "loss": 0.9873, + "step": 3664 + }, + { + "epoch": 0.38544966937042346, + "grad_norm": 2.511298257225812, + "learning_rate": 3.4280991979050892e-06, + "loss": 1.0193, + "step": 3665 + }, + { + "epoch": 0.3855548398122708, + "grad_norm": 3.51758390169175, + "learning_rate": 3.427323985339156e-06, + "loss": 1.0199, + "step": 3666 + }, + { + "epoch": 0.38566001025411806, + "grad_norm": 2.8645279133183448, + "learning_rate": 3.4265486693693944e-06, + "loss": 0.9971, + "step": 3667 + }, + { + "epoch": 0.3857651806959654, + "grad_norm": 2.9018074235287155, + "learning_rate": 3.4257732500822592e-06, + "loss": 0.9952, + "step": 3668 + }, + { + "epoch": 0.3858703511378127, + "grad_norm": 
2.0588163183524357, + "learning_rate": 3.4249977275642147e-06, + "loss": 0.9869, + "step": 3669 + }, + { + "epoch": 0.38597552157966003, + "grad_norm": 2.686052380256393, + "learning_rate": 3.424222101901738e-06, + "loss": 1.0185, + "step": 3670 + }, + { + "epoch": 0.38608069202150735, + "grad_norm": 2.3591003789868448, + "learning_rate": 3.423446373181317e-06, + "loss": 1.018, + "step": 3671 + }, + { + "epoch": 0.3861858624633547, + "grad_norm": 4.273636944885451, + "learning_rate": 3.4226705414894517e-06, + "loss": 1.0388, + "step": 3672 + }, + { + "epoch": 0.386291032905202, + "grad_norm": 2.9970837835983386, + "learning_rate": 3.4218946069126534e-06, + "loss": 1.0069, + "step": 3673 + }, + { + "epoch": 0.3863962033470493, + "grad_norm": 1.7215807328411599, + "learning_rate": 3.4211185695374454e-06, + "loss": 0.9614, + "step": 3674 + }, + { + "epoch": 0.38650137378889665, + "grad_norm": 3.4161522944359737, + "learning_rate": 3.4203424294503617e-06, + "loss": 1.0311, + "step": 3675 + }, + { + "epoch": 0.386606544230744, + "grad_norm": 2.734529352859378, + "learning_rate": 3.419566186737947e-06, + "loss": 0.9555, + "step": 3676 + }, + { + "epoch": 0.38671171467259124, + "grad_norm": 2.091680566056881, + "learning_rate": 3.4187898414867594e-06, + "loss": 0.998, + "step": 3677 + }, + { + "epoch": 0.38681688511443857, + "grad_norm": 2.5504380693881066, + "learning_rate": 3.418013393783367e-06, + "loss": 1.0147, + "step": 3678 + }, + { + "epoch": 0.3869220555562859, + "grad_norm": 2.4427536161465766, + "learning_rate": 3.4172368437143495e-06, + "loss": 1.0142, + "step": 3679 + }, + { + "epoch": 0.3870272259981332, + "grad_norm": 1.9842607417290865, + "learning_rate": 3.4164601913662985e-06, + "loss": 1.0323, + "step": 3680 + }, + { + "epoch": 0.38713239643998054, + "grad_norm": 2.65586599370129, + "learning_rate": 3.4156834368258175e-06, + "loss": 1.0053, + "step": 3681 + }, + { + "epoch": 0.38723756688182787, + "grad_norm": 2.0732316500738306, + "learning_rate": 3.4149065801795196e-06, + "loss": 0.9952, + "step": 3682 + }, + { + "epoch": 0.3873427373236752, + "grad_norm": 2.7781109716188843, + "learning_rate": 3.4141296215140307e-06, + "loss": 1.0386, + "step": 3683 + }, + { + "epoch": 0.3874479077655225, + "grad_norm": 2.0689240430312554, + "learning_rate": 3.4133525609159883e-06, + "loss": 0.9956, + "step": 3684 + }, + { + "epoch": 0.38755307820736984, + "grad_norm": 1.9357441976440448, + "learning_rate": 3.4125753984720392e-06, + "loss": 1.0061, + "step": 3685 + }, + { + "epoch": 0.38765824864921716, + "grad_norm": 3.5270821137244472, + "learning_rate": 3.411798134268845e-06, + "loss": 0.9688, + "step": 3686 + }, + { + "epoch": 0.38776341909106443, + "grad_norm": 2.7246597051655637, + "learning_rate": 3.4110207683930754e-06, + "loss": 1.018, + "step": 3687 + }, + { + "epoch": 0.38786858953291176, + "grad_norm": 2.84204082065096, + "learning_rate": 3.410243300931413e-06, + "loss": 1.0036, + "step": 3688 + }, + { + "epoch": 0.3879737599747591, + "grad_norm": 2.8073515518648535, + "learning_rate": 3.409465731970551e-06, + "loss": 0.987, + "step": 3689 + }, + { + "epoch": 0.3880789304166064, + "grad_norm": 2.203385890249724, + "learning_rate": 3.408688061597196e-06, + "loss": 1.0379, + "step": 3690 + }, + { + "epoch": 0.38818410085845373, + "grad_norm": 1.9853510788665791, + "learning_rate": 3.4079102898980633e-06, + "loss": 1.008, + "step": 3691 + }, + { + "epoch": 0.38828927130030105, + "grad_norm": 2.0269843653629502, + "learning_rate": 3.407132416959879e-06, + "loss": 0.9997, + "step": 
3692 + }, + { + "epoch": 0.3883944417421484, + "grad_norm": 2.513094917455823, + "learning_rate": 3.406354442869384e-06, + "loss": 1.033, + "step": 3693 + }, + { + "epoch": 0.3884996121839957, + "grad_norm": 2.1492929808384917, + "learning_rate": 3.405576367713328e-06, + "loss": 0.9705, + "step": 3694 + }, + { + "epoch": 0.388604782625843, + "grad_norm": 2.4692888812089064, + "learning_rate": 3.404798191578472e-06, + "loss": 1.0106, + "step": 3695 + }, + { + "epoch": 0.38870995306769035, + "grad_norm": 3.2356374351548935, + "learning_rate": 3.4040199145515882e-06, + "loss": 0.9841, + "step": 3696 + }, + { + "epoch": 0.3888151235095376, + "grad_norm": 2.6289580492215268, + "learning_rate": 3.4032415367194628e-06, + "loss": 1.0185, + "step": 3697 + }, + { + "epoch": 0.38892029395138494, + "grad_norm": 2.66254807289422, + "learning_rate": 3.4024630581688895e-06, + "loss": 1.022, + "step": 3698 + }, + { + "epoch": 0.38902546439323227, + "grad_norm": 2.312866252842055, + "learning_rate": 3.4016844789866733e-06, + "loss": 1.0001, + "step": 3699 + }, + { + "epoch": 0.3891306348350796, + "grad_norm": 2.748756895399873, + "learning_rate": 3.400905799259634e-06, + "loss": 1.0213, + "step": 3700 + }, + { + "epoch": 0.3892358052769269, + "grad_norm": 2.422215122838384, + "learning_rate": 3.4001270190745988e-06, + "loss": 0.9726, + "step": 3701 + }, + { + "epoch": 0.38934097571877424, + "grad_norm": 2.3324336660284413, + "learning_rate": 3.3993481385184097e-06, + "loss": 1.0022, + "step": 3702 + }, + { + "epoch": 0.38944614616062156, + "grad_norm": 3.3738941893509407, + "learning_rate": 3.398569157677916e-06, + "loss": 1.0698, + "step": 3703 + }, + { + "epoch": 0.3895513166024689, + "grad_norm": 1.9996879578710953, + "learning_rate": 3.3977900766399807e-06, + "loss": 1.0164, + "step": 3704 + }, + { + "epoch": 0.3896564870443162, + "grad_norm": 3.7223647380800693, + "learning_rate": 3.3970108954914782e-06, + "loss": 1.0151, + "step": 3705 + }, + { + "epoch": 0.38976165748616354, + "grad_norm": 2.9087827912362934, + "learning_rate": 3.3962316143192923e-06, + "loss": 1.0174, + "step": 3706 + }, + { + "epoch": 0.3898668279280108, + "grad_norm": 2.222505824431483, + "learning_rate": 3.3954522332103197e-06, + "loss": 1.0393, + "step": 3707 + }, + { + "epoch": 0.38997199836985813, + "grad_norm": 2.110921600030713, + "learning_rate": 3.394672752251466e-06, + "loss": 0.9682, + "step": 3708 + }, + { + "epoch": 0.39007716881170545, + "grad_norm": 2.6718880567786507, + "learning_rate": 3.3938931715296515e-06, + "loss": 0.9888, + "step": 3709 + }, + { + "epoch": 0.3901823392535528, + "grad_norm": 3.1737106251642637, + "learning_rate": 3.393113491131803e-06, + "loss": 0.9869, + "step": 3710 + }, + { + "epoch": 0.3902875096954001, + "grad_norm": 3.316157096089004, + "learning_rate": 3.3923337111448635e-06, + "loss": 1.0071, + "step": 3711 + }, + { + "epoch": 0.3903926801372474, + "grad_norm": 2.393156968914557, + "learning_rate": 3.391553831655783e-06, + "loss": 1.0382, + "step": 3712 + }, + { + "epoch": 0.39049785057909475, + "grad_norm": 2.8352293224211986, + "learning_rate": 3.3907738527515243e-06, + "loss": 0.9929, + "step": 3713 + }, + { + "epoch": 0.3906030210209421, + "grad_norm": 2.950692846867964, + "learning_rate": 3.3899937745190616e-06, + "loss": 1.0297, + "step": 3714 + }, + { + "epoch": 0.3907081914627894, + "grad_norm": 1.9201546485115777, + "learning_rate": 3.3892135970453787e-06, + "loss": 0.9889, + "step": 3715 + }, + { + "epoch": 0.3908133619046367, + "grad_norm": 3.0185805923153746, + 
"learning_rate": 3.3884333204174722e-06, + "loss": 1.0277, + "step": 3716 + }, + { + "epoch": 0.390918532346484, + "grad_norm": 2.574076886352291, + "learning_rate": 3.387652944722349e-06, + "loss": 1.0126, + "step": 3717 + }, + { + "epoch": 0.3910237027883313, + "grad_norm": 2.9898536035964702, + "learning_rate": 3.3868724700470273e-06, + "loss": 1.0308, + "step": 3718 + }, + { + "epoch": 0.39112887323017864, + "grad_norm": 3.170586149546781, + "learning_rate": 3.3860918964785354e-06, + "loss": 1.012, + "step": 3719 + }, + { + "epoch": 0.39123404367202597, + "grad_norm": 2.162024803561226, + "learning_rate": 3.3853112241039136e-06, + "loss": 1.0032, + "step": 3720 + }, + { + "epoch": 0.3913392141138733, + "grad_norm": 2.800456326423131, + "learning_rate": 3.3845304530102137e-06, + "loss": 0.9699, + "step": 3721 + }, + { + "epoch": 0.3914443845557206, + "grad_norm": 2.230103160597217, + "learning_rate": 3.3837495832844958e-06, + "loss": 0.9907, + "step": 3722 + }, + { + "epoch": 0.39154955499756794, + "grad_norm": 1.9475989647841914, + "learning_rate": 3.382968615013835e-06, + "loss": 0.9762, + "step": 3723 + }, + { + "epoch": 0.39165472543941526, + "grad_norm": 1.690195126900537, + "learning_rate": 3.382187548285314e-06, + "loss": 0.9697, + "step": 3724 + }, + { + "epoch": 0.3917598958812626, + "grad_norm": 2.043759641357441, + "learning_rate": 3.3814063831860282e-06, + "loss": 0.9833, + "step": 3725 + }, + { + "epoch": 0.3918650663231099, + "grad_norm": 2.798655205453248, + "learning_rate": 3.3806251198030843e-06, + "loss": 0.9836, + "step": 3726 + }, + { + "epoch": 0.39197023676495724, + "grad_norm": 2.8030545959030553, + "learning_rate": 3.3798437582235984e-06, + "loss": 0.9983, + "step": 3727 + }, + { + "epoch": 0.3920754072068045, + "grad_norm": 1.88892717549588, + "learning_rate": 3.3790622985346982e-06, + "loss": 0.9806, + "step": 3728 + }, + { + "epoch": 0.39218057764865183, + "grad_norm": 2.9555789052928256, + "learning_rate": 3.378280740823523e-06, + "loss": 1.0005, + "step": 3729 + }, + { + "epoch": 0.39228574809049915, + "grad_norm": 2.999318912879433, + "learning_rate": 3.377499085177223e-06, + "loss": 1.037, + "step": 3730 + }, + { + "epoch": 0.3923909185323465, + "grad_norm": 3.187290454242505, + "learning_rate": 3.3767173316829577e-06, + "loss": 1.0223, + "step": 3731 + }, + { + "epoch": 0.3924960889741938, + "grad_norm": 2.739342019449803, + "learning_rate": 3.3759354804279e-06, + "loss": 1.0258, + "step": 3732 + }, + { + "epoch": 0.3926012594160411, + "grad_norm": 2.358011734083977, + "learning_rate": 3.375153531499231e-06, + "loss": 1.0051, + "step": 3733 + }, + { + "epoch": 0.39270642985788845, + "grad_norm": 2.4014349663864345, + "learning_rate": 3.3743714849841457e-06, + "loss": 1.0244, + "step": 3734 + }, + { + "epoch": 0.3928116002997358, + "grad_norm": 2.621545965673851, + "learning_rate": 3.373589340969847e-06, + "loss": 0.9887, + "step": 3735 + }, + { + "epoch": 0.3929167707415831, + "grad_norm": 1.4862634418648064, + "learning_rate": 3.37280709954355e-06, + "loss": 1.0241, + "step": 3736 + }, + { + "epoch": 0.3930219411834304, + "grad_norm": 2.3418718121471893, + "learning_rate": 3.3720247607924816e-06, + "loss": 1.0129, + "step": 3737 + }, + { + "epoch": 0.3931271116252777, + "grad_norm": 2.029673077254618, + "learning_rate": 3.3712423248038785e-06, + "loss": 1.0308, + "step": 3738 + }, + { + "epoch": 0.393232282067125, + "grad_norm": 2.15890181900642, + "learning_rate": 3.370459791664988e-06, + "loss": 0.9973, + "step": 3739 + }, + { + "epoch": 
0.39333745250897234, + "grad_norm": 2.790195503492097, + "learning_rate": 3.369677161463068e-06, + "loss": 1.0607, + "step": 3740 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 2.3702375741582253, + "learning_rate": 3.3688944342853904e-06, + "loss": 0.9909, + "step": 3741 + }, + { + "epoch": 0.393547793392667, + "grad_norm": 2.6321034738325975, + "learning_rate": 3.368111610219232e-06, + "loss": 0.9887, + "step": 3742 + }, + { + "epoch": 0.3936529638345143, + "grad_norm": 3.016851750333276, + "learning_rate": 3.367328689351886e-06, + "loss": 1.0107, + "step": 3743 + }, + { + "epoch": 0.39375813427636164, + "grad_norm": 2.2979782597568774, + "learning_rate": 3.3665456717706522e-06, + "loss": 1.0155, + "step": 3744 + }, + { + "epoch": 0.39386330471820896, + "grad_norm": 2.695466999857748, + "learning_rate": 3.3657625575628462e-06, + "loss": 1.0072, + "step": 3745 + }, + { + "epoch": 0.3939684751600563, + "grad_norm": 1.8645001661672012, + "learning_rate": 3.3649793468157887e-06, + "loss": 0.99, + "step": 3746 + }, + { + "epoch": 0.3940736456019036, + "grad_norm": 2.324980227855501, + "learning_rate": 3.364196039616815e-06, + "loss": 0.9843, + "step": 3747 + }, + { + "epoch": 0.3941788160437509, + "grad_norm": 3.100195028994209, + "learning_rate": 3.3634126360532694e-06, + "loss": 0.9901, + "step": 3748 + }, + { + "epoch": 0.3942839864855982, + "grad_norm": 2.3643331155710534, + "learning_rate": 3.3626291362125075e-06, + "loss": 1.0829, + "step": 3749 + }, + { + "epoch": 0.39438915692744553, + "grad_norm": 2.2726875865978444, + "learning_rate": 3.3618455401818963e-06, + "loss": 0.9908, + "step": 3750 + }, + { + "epoch": 0.39449432736929285, + "grad_norm": 2.259480175273652, + "learning_rate": 3.3610618480488115e-06, + "loss": 0.9894, + "step": 3751 + }, + { + "epoch": 0.3945994978111402, + "grad_norm": 2.6256746329463154, + "learning_rate": 3.3602780599006425e-06, + "loss": 1.0088, + "step": 3752 + }, + { + "epoch": 0.3947046682529875, + "grad_norm": 2.8544786484384534, + "learning_rate": 3.359494175824787e-06, + "loss": 1.0236, + "step": 3753 + }, + { + "epoch": 0.3948098386948348, + "grad_norm": 2.055256613594115, + "learning_rate": 3.358710195908653e-06, + "loss": 1.0129, + "step": 3754 + }, + { + "epoch": 0.39491500913668215, + "grad_norm": 2.6762903150895165, + "learning_rate": 3.3579261202396624e-06, + "loss": 1.0084, + "step": 3755 + }, + { + "epoch": 0.3950201795785295, + "grad_norm": 3.105067449951284, + "learning_rate": 3.357141948905244e-06, + "loss": 1.0396, + "step": 3756 + }, + { + "epoch": 0.3951253500203768, + "grad_norm": 2.7357680717640736, + "learning_rate": 3.356357681992841e-06, + "loss": 1.0104, + "step": 3757 + }, + { + "epoch": 0.39523052046222407, + "grad_norm": 2.5272957675398615, + "learning_rate": 3.3555733195899034e-06, + "loss": 0.9918, + "step": 3758 + }, + { + "epoch": 0.3953356909040714, + "grad_norm": 2.2839849999981823, + "learning_rate": 3.354788861783894e-06, + "loss": 0.9403, + "step": 3759 + }, + { + "epoch": 0.3954408613459187, + "grad_norm": 2.68537039553363, + "learning_rate": 3.3540043086622865e-06, + "loss": 1.0048, + "step": 3760 + }, + { + "epoch": 0.39554603178776604, + "grad_norm": 1.95762032240909, + "learning_rate": 3.3532196603125646e-06, + "loss": 1.0022, + "step": 3761 + }, + { + "epoch": 0.39565120222961336, + "grad_norm": 2.4783053417448278, + "learning_rate": 3.3524349168222224e-06, + "loss": 1.0038, + "step": 3762 + }, + { + "epoch": 0.3957563726714607, + "grad_norm": 2.2138346751080373, + "learning_rate": 
3.3516500782787653e-06, + "loss": 1.0094, + "step": 3763 + }, + { + "epoch": 0.395861543113308, + "grad_norm": 2.936093102972425, + "learning_rate": 3.3508651447697083e-06, + "loss": 1.0291, + "step": 3764 + }, + { + "epoch": 0.39596671355515534, + "grad_norm": 2.971080492228416, + "learning_rate": 3.350080116382578e-06, + "loss": 1.0148, + "step": 3765 + }, + { + "epoch": 0.39607188399700266, + "grad_norm": 2.4766119317435167, + "learning_rate": 3.3492949932049115e-06, + "loss": 0.9808, + "step": 3766 + }, + { + "epoch": 0.39617705443885, + "grad_norm": 2.0419716907450707, + "learning_rate": 3.3485097753242546e-06, + "loss": 0.9947, + "step": 3767 + }, + { + "epoch": 0.39628222488069725, + "grad_norm": 2.6590218339263787, + "learning_rate": 3.3477244628281667e-06, + "loss": 0.9918, + "step": 3768 + }, + { + "epoch": 0.3963873953225446, + "grad_norm": 2.711175294896815, + "learning_rate": 3.3469390558042163e-06, + "loss": 0.9712, + "step": 3769 + }, + { + "epoch": 0.3964925657643919, + "grad_norm": 3.5550676117965385, + "learning_rate": 3.346153554339982e-06, + "loss": 1.0413, + "step": 3770 + }, + { + "epoch": 0.3965977362062392, + "grad_norm": 2.9614092689255473, + "learning_rate": 3.3453679585230532e-06, + "loss": 1.0076, + "step": 3771 + }, + { + "epoch": 0.39670290664808655, + "grad_norm": 2.3025973292096507, + "learning_rate": 3.3445822684410295e-06, + "loss": 0.9737, + "step": 3772 + }, + { + "epoch": 0.3968080770899339, + "grad_norm": 2.2173191627458158, + "learning_rate": 3.3437964841815226e-06, + "loss": 0.9836, + "step": 3773 + }, + { + "epoch": 0.3969132475317812, + "grad_norm": 2.3309333133092243, + "learning_rate": 3.3430106058321517e-06, + "loss": 1.0039, + "step": 3774 + }, + { + "epoch": 0.3970184179736285, + "grad_norm": 1.9871063924389158, + "learning_rate": 3.3422246334805504e-06, + "loss": 0.9684, + "step": 3775 + }, + { + "epoch": 0.39712358841547585, + "grad_norm": 2.4572887046393204, + "learning_rate": 3.34143856721436e-06, + "loss": 0.9899, + "step": 3776 + }, + { + "epoch": 0.39722875885732317, + "grad_norm": 2.365568699436195, + "learning_rate": 3.3406524071212327e-06, + "loss": 0.9614, + "step": 3777 + }, + { + "epoch": 0.39733392929917044, + "grad_norm": 1.7818280646816138, + "learning_rate": 3.3398661532888314e-06, + "loss": 0.9849, + "step": 3778 + }, + { + "epoch": 0.39743909974101777, + "grad_norm": 1.9046459086271046, + "learning_rate": 3.339079805804829e-06, + "loss": 1.0143, + "step": 3779 + }, + { + "epoch": 0.3975442701828651, + "grad_norm": 2.3309655746278333, + "learning_rate": 3.3382933647569115e-06, + "loss": 0.9559, + "step": 3780 + }, + { + "epoch": 0.3976494406247124, + "grad_norm": 1.9564505802602077, + "learning_rate": 3.337506830232771e-06, + "loss": 1.0231, + "step": 3781 + }, + { + "epoch": 0.39775461106655974, + "grad_norm": 2.474905869601421, + "learning_rate": 3.3367202023201128e-06, + "loss": 1.0038, + "step": 3782 + }, + { + "epoch": 0.39785978150840706, + "grad_norm": 2.598347095424969, + "learning_rate": 3.3359334811066524e-06, + "loss": 1.0089, + "step": 3783 + }, + { + "epoch": 0.3979649519502544, + "grad_norm": 2.164211187048747, + "learning_rate": 3.3351466666801147e-06, + "loss": 1.0008, + "step": 3784 + }, + { + "epoch": 0.3980701223921017, + "grad_norm": 2.7808448041427845, + "learning_rate": 3.3343597591282366e-06, + "loss": 1.0307, + "step": 3785 + }, + { + "epoch": 0.39817529283394903, + "grad_norm": 3.473039231593379, + "learning_rate": 3.3335727585387636e-06, + "loss": 1.0376, + "step": 3786 + }, + { + "epoch": 
0.39828046327579636, + "grad_norm": 2.8034840356520485, + "learning_rate": 3.332785664999453e-06, + "loss": 1.0261, + "step": 3787 + }, + { + "epoch": 0.3983856337176437, + "grad_norm": 2.7072024059878474, + "learning_rate": 3.3319984785980703e-06, + "loss": 1.0098, + "step": 3788 + }, + { + "epoch": 0.39849080415949095, + "grad_norm": 2.4114719388071872, + "learning_rate": 3.331211199422395e-06, + "loss": 1.0064, + "step": 3789 + }, + { + "epoch": 0.3985959746013383, + "grad_norm": 1.7540733706136926, + "learning_rate": 3.330423827560213e-06, + "loss": 0.9798, + "step": 3790 + }, + { + "epoch": 0.3987011450431856, + "grad_norm": 2.370248082622637, + "learning_rate": 3.3296363630993245e-06, + "loss": 0.999, + "step": 3791 + }, + { + "epoch": 0.3988063154850329, + "grad_norm": 2.51665004694854, + "learning_rate": 3.3288488061275365e-06, + "loss": 0.9833, + "step": 3792 + }, + { + "epoch": 0.39891148592688025, + "grad_norm": 2.5790260976139705, + "learning_rate": 3.328061156732667e-06, + "loss": 1.0166, + "step": 3793 + }, + { + "epoch": 0.3990166563687276, + "grad_norm": 2.090788901444674, + "learning_rate": 3.3272734150025467e-06, + "loss": 0.9734, + "step": 3794 + }, + { + "epoch": 0.3991218268105749, + "grad_norm": 2.0061905903896005, + "learning_rate": 3.3264855810250137e-06, + "loss": 1.0343, + "step": 3795 + }, + { + "epoch": 0.3992269972524222, + "grad_norm": 2.829412278323214, + "learning_rate": 3.3256976548879183e-06, + "loss": 0.9787, + "step": 3796 + }, + { + "epoch": 0.39933216769426955, + "grad_norm": 2.2031781208991723, + "learning_rate": 3.3249096366791196e-06, + "loss": 0.984, + "step": 3797 + }, + { + "epoch": 0.39943733813611687, + "grad_norm": 2.6789806053409944, + "learning_rate": 3.324121526486489e-06, + "loss": 1.0294, + "step": 3798 + }, + { + "epoch": 0.39954250857796414, + "grad_norm": 2.2888330035394273, + "learning_rate": 3.323333324397906e-06, + "loss": 1.0565, + "step": 3799 + }, + { + "epoch": 0.39964767901981146, + "grad_norm": 2.247074905242807, + "learning_rate": 3.3225450305012614e-06, + "loss": 1.012, + "step": 3800 + }, + { + "epoch": 0.3997528494616588, + "grad_norm": 2.694527096396108, + "learning_rate": 3.321756644884456e-06, + "loss": 1.0569, + "step": 3801 + }, + { + "epoch": 0.3998580199035061, + "grad_norm": 1.7511580004306093, + "learning_rate": 3.3209681676354012e-06, + "loss": 1.018, + "step": 3802 + }, + { + "epoch": 0.39996319034535344, + "grad_norm": 2.8140466000278463, + "learning_rate": 3.3201795988420184e-06, + "loss": 1.0134, + "step": 3803 + }, + { + "epoch": 0.40006836078720076, + "grad_norm": 2.892670193426442, + "learning_rate": 3.3193909385922385e-06, + "loss": 0.9829, + "step": 3804 + }, + { + "epoch": 0.4001735312290481, + "grad_norm": 2.0932756519682356, + "learning_rate": 3.3186021869740036e-06, + "loss": 1.0221, + "step": 3805 + }, + { + "epoch": 0.4002787016708954, + "grad_norm": 2.690319929330406, + "learning_rate": 3.3178133440752664e-06, + "loss": 1.0595, + "step": 3806 + }, + { + "epoch": 0.40038387211274273, + "grad_norm": 3.1784389270535365, + "learning_rate": 3.317024409983988e-06, + "loss": 1.0084, + "step": 3807 + }, + { + "epoch": 0.40048904255459006, + "grad_norm": 1.8851550338498042, + "learning_rate": 3.3162353847881414e-06, + "loss": 0.9487, + "step": 3808 + }, + { + "epoch": 0.4005942129964373, + "grad_norm": 2.5639704456014574, + "learning_rate": 3.3154462685757077e-06, + "loss": 0.9514, + "step": 3809 + }, + { + "epoch": 0.40069938343828465, + "grad_norm": 2.3978339213837856, + "learning_rate": 
3.3146570614346814e-06, + "loss": 1.0111, + "step": 3810 + }, + { + "epoch": 0.400804553880132, + "grad_norm": 2.6853925911053285, + "learning_rate": 3.3138677634530637e-06, + "loss": 1.0216, + "step": 3811 + }, + { + "epoch": 0.4009097243219793, + "grad_norm": 2.138313165527399, + "learning_rate": 3.313078374718868e-06, + "loss": 0.9739, + "step": 3812 + }, + { + "epoch": 0.4010148947638266, + "grad_norm": 2.1570278204299753, + "learning_rate": 3.3122888953201176e-06, + "loss": 0.9708, + "step": 3813 + }, + { + "epoch": 0.40112006520567395, + "grad_norm": 2.128469182048962, + "learning_rate": 3.3114993253448457e-06, + "loss": 1.0079, + "step": 3814 + }, + { + "epoch": 0.40122523564752127, + "grad_norm": 3.0955012345320623, + "learning_rate": 3.3107096648810945e-06, + "loss": 1.0154, + "step": 3815 + }, + { + "epoch": 0.4013304060893686, + "grad_norm": 3.5600261230012413, + "learning_rate": 3.309919914016918e-06, + "loss": 1.0091, + "step": 3816 + }, + { + "epoch": 0.4014355765312159, + "grad_norm": 3.6965619154449274, + "learning_rate": 3.309130072840379e-06, + "loss": 0.9899, + "step": 3817 + }, + { + "epoch": 0.40154074697306325, + "grad_norm": 3.1497857281534882, + "learning_rate": 3.3083401414395516e-06, + "loss": 1.0138, + "step": 3818 + }, + { + "epoch": 0.4016459174149105, + "grad_norm": 2.752987084802028, + "learning_rate": 3.3075501199025194e-06, + "loss": 1.0456, + "step": 3819 + }, + { + "epoch": 0.40175108785675784, + "grad_norm": 2.3240624321572456, + "learning_rate": 3.3067600083173752e-06, + "loss": 0.963, + "step": 3820 + }, + { + "epoch": 0.40185625829860516, + "grad_norm": 3.583739472871665, + "learning_rate": 3.3059698067722236e-06, + "loss": 0.9924, + "step": 3821 + }, + { + "epoch": 0.4019614287404525, + "grad_norm": 2.1565141093715754, + "learning_rate": 3.305179515355177e-06, + "loss": 1.0152, + "step": 3822 + }, + { + "epoch": 0.4020665991822998, + "grad_norm": 2.697943981821852, + "learning_rate": 3.304389134154359e-06, + "loss": 1.0333, + "step": 3823 + }, + { + "epoch": 0.40217176962414714, + "grad_norm": 2.7143533256140215, + "learning_rate": 3.303598663257904e-06, + "loss": 0.9832, + "step": 3824 + }, + { + "epoch": 0.40227694006599446, + "grad_norm": 2.750860124589298, + "learning_rate": 3.3028081027539562e-06, + "loss": 1.0203, + "step": 3825 + }, + { + "epoch": 0.4023821105078418, + "grad_norm": 3.058694203710453, + "learning_rate": 3.302017452730668e-06, + "loss": 0.9606, + "step": 3826 + }, + { + "epoch": 0.4024872809496891, + "grad_norm": 2.6222751926965073, + "learning_rate": 3.301226713276203e-06, + "loss": 1.0546, + "step": 3827 + }, + { + "epoch": 0.40259245139153643, + "grad_norm": 3.5103197855371935, + "learning_rate": 3.300435884478737e-06, + "loss": 1.0299, + "step": 3828 + }, + { + "epoch": 0.4026976218333837, + "grad_norm": 2.8240018305917975, + "learning_rate": 3.2996449664264506e-06, + "loss": 1.0326, + "step": 3829 + }, + { + "epoch": 0.402802792275231, + "grad_norm": 2.5409628619006646, + "learning_rate": 3.298853959207539e-06, + "loss": 1.0357, + "step": 3830 + }, + { + "epoch": 0.40290796271707835, + "grad_norm": 2.6396149602766235, + "learning_rate": 3.298062862910205e-06, + "loss": 1.0238, + "step": 3831 + }, + { + "epoch": 0.4030131331589257, + "grad_norm": 2.2641809884701716, + "learning_rate": 3.297271677622662e-06, + "loss": 1.0312, + "step": 3832 + }, + { + "epoch": 0.403118303600773, + "grad_norm": 2.1404633046302535, + "learning_rate": 3.2964804034331344e-06, + "loss": 1.0002, + "step": 3833 + }, + { + "epoch": 
0.4032234740426203, + "grad_norm": 3.02584736223, + "learning_rate": 3.2956890404298537e-06, + "loss": 1.0608, + "step": 3834 + }, + { + "epoch": 0.40332864448446765, + "grad_norm": 1.916867136991986, + "learning_rate": 3.2948975887010643e-06, + "loss": 0.9875, + "step": 3835 + }, + { + "epoch": 0.40343381492631497, + "grad_norm": 3.057690418288368, + "learning_rate": 3.2941060483350183e-06, + "loss": 0.9777, + "step": 3836 + }, + { + "epoch": 0.4035389853681623, + "grad_norm": 2.0359787957493336, + "learning_rate": 3.29331441941998e-06, + "loss": 0.9759, + "step": 3837 + }, + { + "epoch": 0.4036441558100096, + "grad_norm": 2.565513639029297, + "learning_rate": 3.292522702044221e-06, + "loss": 1.0005, + "step": 3838 + }, + { + "epoch": 0.4037493262518569, + "grad_norm": 2.6810471066425743, + "learning_rate": 3.2917308962960232e-06, + "loss": 1.0066, + "step": 3839 + }, + { + "epoch": 0.4038544966937042, + "grad_norm": 3.16105856831009, + "learning_rate": 3.2909390022636813e-06, + "loss": 1.0143, + "step": 3840 + }, + { + "epoch": 0.40395966713555154, + "grad_norm": 2.9822084656388923, + "learning_rate": 3.2901470200354953e-06, + "loss": 0.998, + "step": 3841 + }, + { + "epoch": 0.40406483757739886, + "grad_norm": 2.19021483535333, + "learning_rate": 3.289354949699779e-06, + "loss": 0.9732, + "step": 3842 + }, + { + "epoch": 0.4041700080192462, + "grad_norm": 3.090611395109951, + "learning_rate": 3.288562791344854e-06, + "loss": 1.0176, + "step": 3843 + }, + { + "epoch": 0.4042751784610935, + "grad_norm": 2.468177042950385, + "learning_rate": 3.2877705450590525e-06, + "loss": 1.04, + "step": 3844 + }, + { + "epoch": 0.40438034890294083, + "grad_norm": 3.2507866817267086, + "learning_rate": 3.2869782109307147e-06, + "loss": 1.024, + "step": 3845 + }, + { + "epoch": 0.40448551934478816, + "grad_norm": 2.749363551983819, + "learning_rate": 3.2861857890481928e-06, + "loss": 1.0086, + "step": 3846 + }, + { + "epoch": 0.4045906897866355, + "grad_norm": 3.3223260711708282, + "learning_rate": 3.2853932794998487e-06, + "loss": 1.026, + "step": 3847 + }, + { + "epoch": 0.4046958602284828, + "grad_norm": 2.0959836207172917, + "learning_rate": 3.2846006823740527e-06, + "loss": 0.963, + "step": 3848 + }, + { + "epoch": 0.40480103067033013, + "grad_norm": 2.601661822818262, + "learning_rate": 3.283807997759186e-06, + "loss": 1.0286, + "step": 3849 + }, + { + "epoch": 0.4049062011121774, + "grad_norm": 2.223265802293675, + "learning_rate": 3.283015225743638e-06, + "loss": 0.9474, + "step": 3850 + }, + { + "epoch": 0.4050113715540247, + "grad_norm": 3.994077202727601, + "learning_rate": 3.2822223664158103e-06, + "loss": 0.9945, + "step": 3851 + }, + { + "epoch": 0.40511654199587205, + "grad_norm": 2.035477588921751, + "learning_rate": 3.281429419864112e-06, + "loss": 0.9734, + "step": 3852 + }, + { + "epoch": 0.4052217124377194, + "grad_norm": 3.15506782103542, + "learning_rate": 3.2806363861769626e-06, + "loss": 1.0234, + "step": 3853 + }, + { + "epoch": 0.4053268828795667, + "grad_norm": 3.0102187470478428, + "learning_rate": 3.2798432654427925e-06, + "loss": 0.9851, + "step": 3854 + }, + { + "epoch": 0.405432053321414, + "grad_norm": 1.8770708121120634, + "learning_rate": 3.2790500577500393e-06, + "loss": 0.9907, + "step": 3855 + }, + { + "epoch": 0.40553722376326135, + "grad_norm": 2.6017298144724914, + "learning_rate": 3.2782567631871536e-06, + "loss": 1.0101, + "step": 3856 + }, + { + "epoch": 0.40564239420510867, + "grad_norm": 2.3770058244751833, + "learning_rate": 3.2774633818425932e-06, + "loss": 
1.0182, + "step": 3857 + }, + { + "epoch": 0.405747564646956, + "grad_norm": 2.2714004871527003, + "learning_rate": 3.2766699138048265e-06, + "loss": 0.9867, + "step": 3858 + }, + { + "epoch": 0.4058527350888033, + "grad_norm": 2.0842568889627793, + "learning_rate": 3.27587635916233e-06, + "loss": 0.95, + "step": 3859 + }, + { + "epoch": 0.4059579055306506, + "grad_norm": 2.0840650081817884, + "learning_rate": 3.2750827180035927e-06, + "loss": 1.0121, + "step": 3860 + }, + { + "epoch": 0.4060630759724979, + "grad_norm": 1.9672860296600982, + "learning_rate": 3.2742889904171114e-06, + "loss": 1.0028, + "step": 3861 + }, + { + "epoch": 0.40616824641434524, + "grad_norm": 2.885167297028863, + "learning_rate": 3.2734951764913926e-06, + "loss": 0.9963, + "step": 3862 + }, + { + "epoch": 0.40627341685619256, + "grad_norm": 2.704006997564469, + "learning_rate": 3.2727012763149535e-06, + "loss": 1.0243, + "step": 3863 + }, + { + "epoch": 0.4063785872980399, + "grad_norm": 2.3809414848304398, + "learning_rate": 3.2719072899763186e-06, + "loss": 0.9912, + "step": 3864 + }, + { + "epoch": 0.4064837577398872, + "grad_norm": 2.6369184204411287, + "learning_rate": 3.271113217564025e-06, + "loss": 0.9984, + "step": 3865 + }, + { + "epoch": 0.40658892818173453, + "grad_norm": 2.8295484338886734, + "learning_rate": 3.2703190591666174e-06, + "loss": 1.0434, + "step": 3866 + }, + { + "epoch": 0.40669409862358186, + "grad_norm": 2.925767740990065, + "learning_rate": 3.2695248148726513e-06, + "loss": 1.032, + "step": 3867 + }, + { + "epoch": 0.4067992690654292, + "grad_norm": 2.6452717628361406, + "learning_rate": 3.268730484770689e-06, + "loss": 1.0057, + "step": 3868 + }, + { + "epoch": 0.4069044395072765, + "grad_norm": 2.0361868641093652, + "learning_rate": 3.2679360689493067e-06, + "loss": 0.9866, + "step": 3869 + }, + { + "epoch": 0.4070096099491238, + "grad_norm": 2.8896506691583648, + "learning_rate": 3.2671415674970874e-06, + "loss": 0.9859, + "step": 3870 + }, + { + "epoch": 0.4071147803909711, + "grad_norm": 2.3143761737562305, + "learning_rate": 3.266346980502624e-06, + "loss": 0.9912, + "step": 3871 + }, + { + "epoch": 0.4072199508328184, + "grad_norm": 2.821772166631868, + "learning_rate": 3.2655523080545188e-06, + "loss": 1.0074, + "step": 3872 + }, + { + "epoch": 0.40732512127466575, + "grad_norm": 2.539562382624043, + "learning_rate": 3.264757550241384e-06, + "loss": 1.0218, + "step": 3873 + }, + { + "epoch": 0.40743029171651307, + "grad_norm": 2.401844779407129, + "learning_rate": 3.263962707151842e-06, + "loss": 0.9949, + "step": 3874 + }, + { + "epoch": 0.4075354621583604, + "grad_norm": 1.6438140292650527, + "learning_rate": 3.263167778874523e-06, + "loss": 0.9944, + "step": 3875 + }, + { + "epoch": 0.4076406326002077, + "grad_norm": 2.1162759607114006, + "learning_rate": 3.2623727654980686e-06, + "loss": 1.0216, + "step": 3876 + }, + { + "epoch": 0.40774580304205504, + "grad_norm": 2.4748412890168354, + "learning_rate": 3.2615776671111284e-06, + "loss": 0.9946, + "step": 3877 + }, + { + "epoch": 0.40785097348390237, + "grad_norm": 2.4536684362236123, + "learning_rate": 3.2607824838023616e-06, + "loss": 1.0266, + "step": 3878 + }, + { + "epoch": 0.4079561439257497, + "grad_norm": 2.62038083169509, + "learning_rate": 3.259987215660439e-06, + "loss": 1.0633, + "step": 3879 + }, + { + "epoch": 0.40806131436759696, + "grad_norm": 3.030423450158621, + "learning_rate": 3.259191862774037e-06, + "loss": 1.0133, + "step": 3880 + }, + { + "epoch": 0.4081664848094443, + "grad_norm": 
2.629007143942859, + "learning_rate": 3.2583964252318457e-06, + "loss": 0.986, + "step": 3881 + }, + { + "epoch": 0.4082716552512916, + "grad_norm": 2.7588881309590634, + "learning_rate": 3.25760090312256e-06, + "loss": 0.9893, + "step": 3882 + }, + { + "epoch": 0.40837682569313893, + "grad_norm": 1.8943250545350006, + "learning_rate": 3.25680529653489e-06, + "loss": 1.0228, + "step": 3883 + }, + { + "epoch": 0.40848199613498626, + "grad_norm": 2.1333974399767763, + "learning_rate": 3.2560096055575495e-06, + "loss": 1.0113, + "step": 3884 + }, + { + "epoch": 0.4085871665768336, + "grad_norm": 2.963505372102945, + "learning_rate": 3.2552138302792652e-06, + "loss": 0.9818, + "step": 3885 + }, + { + "epoch": 0.4086923370186809, + "grad_norm": 1.9028453643067, + "learning_rate": 3.254417970788772e-06, + "loss": 1.0293, + "step": 3886 + }, + { + "epoch": 0.40879750746052823, + "grad_norm": 2.733815135749297, + "learning_rate": 3.2536220271748154e-06, + "loss": 1.0211, + "step": 3887 + }, + { + "epoch": 0.40890267790237556, + "grad_norm": 2.7610883994558195, + "learning_rate": 3.252825999526148e-06, + "loss": 1.0258, + "step": 3888 + }, + { + "epoch": 0.4090078483442229, + "grad_norm": 1.5656271357264868, + "learning_rate": 3.252029887931533e-06, + "loss": 0.9811, + "step": 3889 + }, + { + "epoch": 0.40911301878607015, + "grad_norm": 1.9443105818052695, + "learning_rate": 3.251233692479744e-06, + "loss": 0.9607, + "step": 3890 + }, + { + "epoch": 0.4092181892279175, + "grad_norm": 2.4504615742047497, + "learning_rate": 3.2504374132595617e-06, + "loss": 0.9724, + "step": 3891 + }, + { + "epoch": 0.4093233596697648, + "grad_norm": 2.209862220160882, + "learning_rate": 3.249641050359779e-06, + "loss": 1.0212, + "step": 3892 + }, + { + "epoch": 0.4094285301116121, + "grad_norm": 3.0205649466760898, + "learning_rate": 3.2488446038691962e-06, + "loss": 1.0367, + "step": 3893 + }, + { + "epoch": 0.40953370055345945, + "grad_norm": 2.6987146362653918, + "learning_rate": 3.2480480738766222e-06, + "loss": 1.0034, + "step": 3894 + }, + { + "epoch": 0.40963887099530677, + "grad_norm": 2.147797158916421, + "learning_rate": 3.247251460470877e-06, + "loss": 0.9863, + "step": 3895 + }, + { + "epoch": 0.4097440414371541, + "grad_norm": 2.7248827223178025, + "learning_rate": 3.246454763740789e-06, + "loss": 1.0234, + "step": 3896 + }, + { + "epoch": 0.4098492118790014, + "grad_norm": 2.221702170406418, + "learning_rate": 3.2456579837751964e-06, + "loss": 1.0635, + "step": 3897 + }, + { + "epoch": 0.40995438232084874, + "grad_norm": 2.3438438321838726, + "learning_rate": 3.2448611206629456e-06, + "loss": 0.9872, + "step": 3898 + }, + { + "epoch": 0.41005955276269607, + "grad_norm": 1.8808604121161838, + "learning_rate": 3.2440641744928942e-06, + "loss": 0.9978, + "step": 3899 + }, + { + "epoch": 0.41016472320454334, + "grad_norm": 2.5185303976659172, + "learning_rate": 3.2432671453539074e-06, + "loss": 1.0102, + "step": 3900 + }, + { + "epoch": 0.41026989364639066, + "grad_norm": 3.0639940895334576, + "learning_rate": 3.242470033334859e-06, + "loss": 1.0326, + "step": 3901 + }, + { + "epoch": 0.410375064088238, + "grad_norm": 2.5153081413305505, + "learning_rate": 3.2416728385246354e-06, + "loss": 0.9968, + "step": 3902 + }, + { + "epoch": 0.4104802345300853, + "grad_norm": 2.3345795279528536, + "learning_rate": 3.2408755610121277e-06, + "loss": 1.0283, + "step": 3903 + }, + { + "epoch": 0.41058540497193263, + "grad_norm": 2.797650533754245, + "learning_rate": 3.2400782008862402e-06, + "loss": 1.0079, + "step": 
3904 + }, + { + "epoch": 0.41069057541377996, + "grad_norm": 1.9715704104207485, + "learning_rate": 3.2392807582358838e-06, + "loss": 1.0157, + "step": 3905 + }, + { + "epoch": 0.4107957458556273, + "grad_norm": 2.3447329388547105, + "learning_rate": 3.2384832331499804e-06, + "loss": 1.039, + "step": 3906 + }, + { + "epoch": 0.4109009162974746, + "grad_norm": 2.7086394522324007, + "learning_rate": 3.2376856257174596e-06, + "loss": 1.0044, + "step": 3907 + }, + { + "epoch": 0.41100608673932193, + "grad_norm": 2.2145844261459966, + "learning_rate": 3.236887936027261e-06, + "loss": 1.0186, + "step": 3908 + }, + { + "epoch": 0.41111125718116925, + "grad_norm": 2.144833660377574, + "learning_rate": 3.2360901641683335e-06, + "loss": 0.9982, + "step": 3909 + }, + { + "epoch": 0.4112164276230166, + "grad_norm": 3.0103082487238346, + "learning_rate": 3.2352923102296336e-06, + "loss": 0.981, + "step": 3910 + }, + { + "epoch": 0.41132159806486385, + "grad_norm": 3.4747483035986786, + "learning_rate": 3.23449437430013e-06, + "loss": 1.0245, + "step": 3911 + }, + { + "epoch": 0.41142676850671117, + "grad_norm": 3.1896133858883626, + "learning_rate": 3.233696356468797e-06, + "loss": 0.9647, + "step": 3912 + }, + { + "epoch": 0.4115319389485585, + "grad_norm": 2.630457285599166, + "learning_rate": 3.2328982568246213e-06, + "loss": 1.0004, + "step": 3913 + }, + { + "epoch": 0.4116371093904058, + "grad_norm": 2.523456894015238, + "learning_rate": 3.2321000754565972e-06, + "loss": 1.0134, + "step": 3914 + }, + { + "epoch": 0.41174227983225314, + "grad_norm": 2.1853258787221623, + "learning_rate": 3.2313018124537273e-06, + "loss": 0.9849, + "step": 3915 + }, + { + "epoch": 0.41184745027410047, + "grad_norm": 2.207439047305829, + "learning_rate": 3.230503467905024e-06, + "loss": 0.9982, + "step": 3916 + }, + { + "epoch": 0.4119526207159478, + "grad_norm": 2.0123219656783427, + "learning_rate": 3.229705041899509e-06, + "loss": 0.9751, + "step": 3917 + }, + { + "epoch": 0.4120577911577951, + "grad_norm": 2.4532617794552687, + "learning_rate": 3.228906534526214e-06, + "loss": 1.0182, + "step": 3918 + }, + { + "epoch": 0.41216296159964244, + "grad_norm": 3.1167065793191897, + "learning_rate": 3.228107945874177e-06, + "loss": 0.9758, + "step": 3919 + }, + { + "epoch": 0.41226813204148977, + "grad_norm": 2.01605320694452, + "learning_rate": 3.2273092760324488e-06, + "loss": 1.0439, + "step": 3920 + }, + { + "epoch": 0.41237330248333703, + "grad_norm": 2.0503930931336134, + "learning_rate": 3.2265105250900857e-06, + "loss": 1.0149, + "step": 3921 + }, + { + "epoch": 0.41247847292518436, + "grad_norm": 2.535946058767984, + "learning_rate": 3.225711693136156e-06, + "loss": 1.0071, + "step": 3922 + }, + { + "epoch": 0.4125836433670317, + "grad_norm": 3.0494247439611684, + "learning_rate": 3.224912780259736e-06, + "loss": 1.0153, + "step": 3923 + }, + { + "epoch": 0.412688813808879, + "grad_norm": 1.9610941081321251, + "learning_rate": 3.2241137865499073e-06, + "loss": 1.0368, + "step": 3924 + }, + { + "epoch": 0.41279398425072633, + "grad_norm": 2.187054374833785, + "learning_rate": 3.2233147120957674e-06, + "loss": 0.9997, + "step": 3925 + }, + { + "epoch": 0.41289915469257366, + "grad_norm": 2.032475210558614, + "learning_rate": 3.2225155569864187e-06, + "loss": 0.9925, + "step": 3926 + }, + { + "epoch": 0.413004325134421, + "grad_norm": 2.778151381349566, + "learning_rate": 3.221716321310972e-06, + "loss": 0.9881, + "step": 3927 + }, + { + "epoch": 0.4131094955762683, + "grad_norm": 2.84516595462591, + 
"learning_rate": 3.2209170051585486e-06, + "loss": 1.0279, + "step": 3928 + }, + { + "epoch": 0.41321466601811563, + "grad_norm": 2.706803603589453, + "learning_rate": 3.2201176086182796e-06, + "loss": 1.0087, + "step": 3929 + }, + { + "epoch": 0.41331983645996295, + "grad_norm": 2.3853452577702137, + "learning_rate": 3.2193181317793034e-06, + "loss": 0.9627, + "step": 3930 + }, + { + "epoch": 0.4134250069018102, + "grad_norm": 2.8455662869523275, + "learning_rate": 3.2185185747307675e-06, + "loss": 1.0292, + "step": 3931 + }, + { + "epoch": 0.41353017734365755, + "grad_norm": 2.861868010646851, + "learning_rate": 3.217718937561829e-06, + "loss": 1.0368, + "step": 3932 + }, + { + "epoch": 0.41363534778550487, + "grad_norm": 2.665200169651637, + "learning_rate": 3.2169192203616525e-06, + "loss": 1.0477, + "step": 3933 + }, + { + "epoch": 0.4137405182273522, + "grad_norm": 2.7937901564994334, + "learning_rate": 3.2161194232194144e-06, + "loss": 0.9551, + "step": 3934 + }, + { + "epoch": 0.4138456886691995, + "grad_norm": 2.488699321040164, + "learning_rate": 3.2153195462242968e-06, + "loss": 1.0131, + "step": 3935 + }, + { + "epoch": 0.41395085911104684, + "grad_norm": 1.861341029966568, + "learning_rate": 3.2145195894654947e-06, + "loss": 1.0209, + "step": 3936 + }, + { + "epoch": 0.41405602955289417, + "grad_norm": 2.814340851769455, + "learning_rate": 3.2137195530322067e-06, + "loss": 1.0021, + "step": 3937 + }, + { + "epoch": 0.4141611999947415, + "grad_norm": 3.4877888465072187, + "learning_rate": 3.2129194370136447e-06, + "loss": 0.9719, + "step": 3938 + }, + { + "epoch": 0.4142663704365888, + "grad_norm": 1.8405494233280453, + "learning_rate": 3.2121192414990273e-06, + "loss": 0.9498, + "step": 3939 + }, + { + "epoch": 0.41437154087843614, + "grad_norm": 2.2875747831185893, + "learning_rate": 3.2113189665775812e-06, + "loss": 1.0207, + "step": 3940 + }, + { + "epoch": 0.4144767113202834, + "grad_norm": 1.7886487845468368, + "learning_rate": 3.2105186123385457e-06, + "loss": 0.9669, + "step": 3941 + }, + { + "epoch": 0.41458188176213073, + "grad_norm": 3.5285401524389775, + "learning_rate": 3.209718178871165e-06, + "loss": 1.0066, + "step": 3942 + }, + { + "epoch": 0.41468705220397806, + "grad_norm": 2.8080439416391902, + "learning_rate": 3.2089176662646946e-06, + "loss": 1.0376, + "step": 3943 + }, + { + "epoch": 0.4147922226458254, + "grad_norm": 2.8063762324719694, + "learning_rate": 3.2081170746083972e-06, + "loss": 0.9974, + "step": 3944 + }, + { + "epoch": 0.4148973930876727, + "grad_norm": 2.7926830931363478, + "learning_rate": 3.2073164039915457e-06, + "loss": 1.0089, + "step": 3945 + }, + { + "epoch": 0.41500256352952003, + "grad_norm": 3.0491045661913265, + "learning_rate": 3.20651565450342e-06, + "loss": 0.97, + "step": 3946 + }, + { + "epoch": 0.41510773397136735, + "grad_norm": 3.0426107615622855, + "learning_rate": 3.20571482623331e-06, + "loss": 1.0066, + "step": 3947 + }, + { + "epoch": 0.4152129044132147, + "grad_norm": 2.396107694194826, + "learning_rate": 3.2049139192705146e-06, + "loss": 1.0248, + "step": 3948 + }, + { + "epoch": 0.415318074855062, + "grad_norm": 2.149480243498675, + "learning_rate": 3.2041129337043415e-06, + "loss": 1.0269, + "step": 3949 + }, + { + "epoch": 0.4154232452969093, + "grad_norm": 2.619422758873159, + "learning_rate": 3.203311869624107e-06, + "loss": 1.0147, + "step": 3950 + }, + { + "epoch": 0.4155284157387566, + "grad_norm": 2.575801899112643, + "learning_rate": 3.202510727119135e-06, + "loss": 1.0062, + "step": 3951 + }, + { + 
"epoch": 0.4156335861806039, + "grad_norm": 2.2109556260255743, + "learning_rate": 3.20170950627876e-06, + "loss": 0.9984, + "step": 3952 + }, + { + "epoch": 0.41573875662245124, + "grad_norm": 2.664759138194936, + "learning_rate": 3.2009082071923243e-06, + "loss": 0.9412, + "step": 3953 + }, + { + "epoch": 0.41584392706429857, + "grad_norm": 3.5647874059605105, + "learning_rate": 3.2001068299491777e-06, + "loss": 1.0679, + "step": 3954 + }, + { + "epoch": 0.4159490975061459, + "grad_norm": 1.9762308009149232, + "learning_rate": 3.1993053746386817e-06, + "loss": 0.9786, + "step": 3955 + }, + { + "epoch": 0.4160542679479932, + "grad_norm": 2.685694396514915, + "learning_rate": 3.1985038413502035e-06, + "loss": 0.9744, + "step": 3956 + }, + { + "epoch": 0.41615943838984054, + "grad_norm": 1.788013679923769, + "learning_rate": 3.197702230173121e-06, + "loss": 0.9708, + "step": 3957 + }, + { + "epoch": 0.41626460883168787, + "grad_norm": 2.7969444556997947, + "learning_rate": 3.19690054119682e-06, + "loss": 1.0122, + "step": 3958 + }, + { + "epoch": 0.4163697792735352, + "grad_norm": 3.247388808518861, + "learning_rate": 3.1960987745106954e-06, + "loss": 1.023, + "step": 3959 + }, + { + "epoch": 0.4164749497153825, + "grad_norm": 2.891195304951478, + "learning_rate": 3.195296930204149e-06, + "loss": 1.0265, + "step": 3960 + }, + { + "epoch": 0.4165801201572298, + "grad_norm": 2.3527631916312184, + "learning_rate": 3.1944950083665943e-06, + "loss": 0.9563, + "step": 3961 + }, + { + "epoch": 0.4166852905990771, + "grad_norm": 2.1596860621844423, + "learning_rate": 3.1936930090874506e-06, + "loss": 0.9826, + "step": 3962 + }, + { + "epoch": 0.41679046104092443, + "grad_norm": 2.6617240165685665, + "learning_rate": 3.192890932456148e-06, + "loss": 1.0048, + "step": 3963 + }, + { + "epoch": 0.41689563148277176, + "grad_norm": 2.092563955479962, + "learning_rate": 3.1920887785621233e-06, + "loss": 1.0019, + "step": 3964 + }, + { + "epoch": 0.4170008019246191, + "grad_norm": 2.2736516404112326, + "learning_rate": 3.1912865474948242e-06, + "loss": 1.0307, + "step": 3965 + }, + { + "epoch": 0.4171059723664664, + "grad_norm": 3.3937785745994526, + "learning_rate": 3.190484239343705e-06, + "loss": 1.0387, + "step": 3966 + }, + { + "epoch": 0.41721114280831373, + "grad_norm": 3.086439637922143, + "learning_rate": 3.1896818541982288e-06, + "loss": 0.9966, + "step": 3967 + }, + { + "epoch": 0.41731631325016105, + "grad_norm": 2.161092398681452, + "learning_rate": 3.188879392147869e-06, + "loss": 0.9979, + "step": 3968 + }, + { + "epoch": 0.4174214836920084, + "grad_norm": 2.732897747501538, + "learning_rate": 3.188076853282105e-06, + "loss": 0.9888, + "step": 3969 + }, + { + "epoch": 0.4175266541338557, + "grad_norm": 2.3755042681989584, + "learning_rate": 3.187274237690426e-06, + "loss": 0.9807, + "step": 3970 + }, + { + "epoch": 0.417631824575703, + "grad_norm": 3.236466063215594, + "learning_rate": 3.1864715454623312e-06, + "loss": 0.9789, + "step": 3971 + }, + { + "epoch": 0.4177369950175503, + "grad_norm": 1.9463819496777226, + "learning_rate": 3.1856687766873262e-06, + "loss": 0.9797, + "step": 3972 + }, + { + "epoch": 0.4178421654593976, + "grad_norm": 2.2662145233490234, + "learning_rate": 3.184865931454926e-06, + "loss": 1.0288, + "step": 3973 + }, + { + "epoch": 0.41794733590124494, + "grad_norm": 1.866662882697477, + "learning_rate": 3.184063009854655e-06, + "loss": 1.0038, + "step": 3974 + }, + { + "epoch": 0.41805250634309227, + "grad_norm": 2.650269702092644, + "learning_rate": 
3.183260011976044e-06, + "loss": 1.0019, + "step": 3975 + }, + { + "epoch": 0.4181576767849396, + "grad_norm": 2.8276642060293913, + "learning_rate": 3.1824569379086334e-06, + "loss": 1.0599, + "step": 3976 + }, + { + "epoch": 0.4182628472267869, + "grad_norm": 3.5709056704281057, + "learning_rate": 3.1816537877419735e-06, + "loss": 1.0059, + "step": 3977 + }, + { + "epoch": 0.41836801766863424, + "grad_norm": 2.071198284904167, + "learning_rate": 3.180850561565621e-06, + "loss": 1.0026, + "step": 3978 + }, + { + "epoch": 0.41847318811048156, + "grad_norm": 2.9106218215179527, + "learning_rate": 3.180047259469141e-06, + "loss": 1.0052, + "step": 3979 + }, + { + "epoch": 0.4185783585523289, + "grad_norm": 3.6515550019028513, + "learning_rate": 3.1792438815421094e-06, + "loss": 1.0022, + "step": 3980 + }, + { + "epoch": 0.4186835289941762, + "grad_norm": 2.730798220509796, + "learning_rate": 3.178440427874109e-06, + "loss": 1.0587, + "step": 3981 + }, + { + "epoch": 0.4187886994360235, + "grad_norm": 1.887706779651145, + "learning_rate": 3.1776368985547296e-06, + "loss": 0.9977, + "step": 3982 + }, + { + "epoch": 0.4188938698778708, + "grad_norm": 3.609708212124221, + "learning_rate": 3.176833293673572e-06, + "loss": 1.001, + "step": 3983 + }, + { + "epoch": 0.41899904031971813, + "grad_norm": 2.8850263626073764, + "learning_rate": 3.1760296133202444e-06, + "loss": 1.0177, + "step": 3984 + }, + { + "epoch": 0.41910421076156545, + "grad_norm": 2.131234221005343, + "learning_rate": 3.175225857584364e-06, + "loss": 1.0055, + "step": 3985 + }, + { + "epoch": 0.4192093812034128, + "grad_norm": 2.991084252493002, + "learning_rate": 3.174422026555554e-06, + "loss": 1.0366, + "step": 3986 + }, + { + "epoch": 0.4193145516452601, + "grad_norm": 2.6654269600235274, + "learning_rate": 3.17361812032345e-06, + "loss": 0.9736, + "step": 3987 + }, + { + "epoch": 0.4194197220871074, + "grad_norm": 2.6426415681720643, + "learning_rate": 3.172814138977692e-06, + "loss": 0.9885, + "step": 3988 + }, + { + "epoch": 0.41952489252895475, + "grad_norm": 2.684767657076028, + "learning_rate": 3.172010082607932e-06, + "loss": 0.9982, + "step": 3989 + }, + { + "epoch": 0.4196300629708021, + "grad_norm": 2.891495817722396, + "learning_rate": 3.1712059513038264e-06, + "loss": 1.016, + "step": 3990 + }, + { + "epoch": 0.4197352334126494, + "grad_norm": 1.9233695211634947, + "learning_rate": 3.1704017451550435e-06, + "loss": 0.9776, + "step": 3991 + }, + { + "epoch": 0.41984040385449667, + "grad_norm": 2.8276668356522165, + "learning_rate": 3.169597464251258e-06, + "loss": 0.958, + "step": 3992 + }, + { + "epoch": 0.419945574296344, + "grad_norm": 2.2819287728424387, + "learning_rate": 3.1687931086821534e-06, + "loss": 0.9661, + "step": 3993 + }, + { + "epoch": 0.4200507447381913, + "grad_norm": 1.9335515420518428, + "learning_rate": 3.1679886785374227e-06, + "loss": 1.0086, + "step": 3994 + }, + { + "epoch": 0.42015591518003864, + "grad_norm": 2.703224110233739, + "learning_rate": 3.1671841739067645e-06, + "loss": 0.987, + "step": 3995 + }, + { + "epoch": 0.42026108562188597, + "grad_norm": 2.366911505986237, + "learning_rate": 3.1663795948798883e-06, + "loss": 1.0131, + "step": 3996 + }, + { + "epoch": 0.4203662560637333, + "grad_norm": 2.1981340850481317, + "learning_rate": 3.1655749415465098e-06, + "loss": 1.0267, + "step": 3997 + }, + { + "epoch": 0.4204714265055806, + "grad_norm": 2.2878927859138662, + "learning_rate": 3.164770213996356e-06, + "loss": 0.9874, + "step": 3998 + }, + { + "epoch": 0.42057659694742794, + 
"grad_norm": 2.797600970455762, + "learning_rate": 3.1639654123191586e-06, + "loss": 1.0073, + "step": 3999 + }, + { + "epoch": 0.42068176738927526, + "grad_norm": 2.587687977989116, + "learning_rate": 3.1631605366046604e-06, + "loss": 0.9942, + "step": 4000 + }, + { + "epoch": 0.4207869378311226, + "grad_norm": 2.8519043232197503, + "learning_rate": 3.1623555869426105e-06, + "loss": 1.0209, + "step": 4001 + }, + { + "epoch": 0.42089210827296986, + "grad_norm": 2.7590859333924436, + "learning_rate": 3.161550563422768e-06, + "loss": 1.0017, + "step": 4002 + }, + { + "epoch": 0.4209972787148172, + "grad_norm": 2.422336890671471, + "learning_rate": 3.160745466134898e-06, + "loss": 1.0184, + "step": 4003 + }, + { + "epoch": 0.4211024491566645, + "grad_norm": 2.632748801794489, + "learning_rate": 3.1599402951687745e-06, + "loss": 1.0328, + "step": 4004 + }, + { + "epoch": 0.42120761959851183, + "grad_norm": 2.6624952216264592, + "learning_rate": 3.1591350506141834e-06, + "loss": 0.9996, + "step": 4005 + }, + { + "epoch": 0.42131279004035915, + "grad_norm": 2.181289439115408, + "learning_rate": 3.1583297325609117e-06, + "loss": 0.9675, + "step": 4006 + }, + { + "epoch": 0.4214179604822065, + "grad_norm": 1.9476495215981124, + "learning_rate": 3.1575243410987627e-06, + "loss": 0.9822, + "step": 4007 + }, + { + "epoch": 0.4215231309240538, + "grad_norm": 2.363361592022842, + "learning_rate": 3.1567188763175417e-06, + "loss": 0.9819, + "step": 4008 + }, + { + "epoch": 0.4216283013659011, + "grad_norm": 2.8584886252924653, + "learning_rate": 3.155913338307064e-06, + "loss": 1.0167, + "step": 4009 + }, + { + "epoch": 0.42173347180774845, + "grad_norm": 2.1134205793143463, + "learning_rate": 3.1551077271571533e-06, + "loss": 1.0171, + "step": 4010 + }, + { + "epoch": 0.4218386422495958, + "grad_norm": 3.1621683092136545, + "learning_rate": 3.154302042957642e-06, + "loss": 1.0072, + "step": 4011 + }, + { + "epoch": 0.42194381269144304, + "grad_norm": 3.3120782048978805, + "learning_rate": 3.153496285798371e-06, + "loss": 0.9971, + "step": 4012 + }, + { + "epoch": 0.42204898313329037, + "grad_norm": 2.0326302294261644, + "learning_rate": 3.152690455769186e-06, + "loss": 1.0114, + "step": 4013 + }, + { + "epoch": 0.4221541535751377, + "grad_norm": 2.0910034446035146, + "learning_rate": 3.1518845529599464e-06, + "loss": 0.995, + "step": 4014 + }, + { + "epoch": 0.422259324016985, + "grad_norm": 2.7438097015848326, + "learning_rate": 3.1510785774605148e-06, + "loss": 1.0412, + "step": 4015 + }, + { + "epoch": 0.42236449445883234, + "grad_norm": 1.9719647600817969, + "learning_rate": 3.1502725293607635e-06, + "loss": 0.9841, + "step": 4016 + }, + { + "epoch": 0.42246966490067966, + "grad_norm": 2.5438613247357154, + "learning_rate": 3.1494664087505743e-06, + "loss": 1.002, + "step": 4017 + }, + { + "epoch": 0.422574835342527, + "grad_norm": 2.3918717555442424, + "learning_rate": 3.1486602157198347e-06, + "loss": 0.986, + "step": 4018 + }, + { + "epoch": 0.4226800057843743, + "grad_norm": 2.6686455761422594, + "learning_rate": 3.1478539503584427e-06, + "loss": 0.9643, + "step": 4019 + }, + { + "epoch": 0.42278517622622164, + "grad_norm": 2.966460155715894, + "learning_rate": 3.147047612756302e-06, + "loss": 1.0212, + "step": 4020 + }, + { + "epoch": 0.42289034666806896, + "grad_norm": 1.8811689133408926, + "learning_rate": 3.1462412030033264e-06, + "loss": 1.0067, + "step": 4021 + }, + { + "epoch": 0.42299551710991623, + "grad_norm": 2.425415315035811, + "learning_rate": 3.1454347211894364e-06, + "loss": 
1.031, + "step": 4022 + }, + { + "epoch": 0.42310068755176355, + "grad_norm": 2.0840405211523074, + "learning_rate": 3.144628167404561e-06, + "loss": 0.98, + "step": 4023 + }, + { + "epoch": 0.4232058579936109, + "grad_norm": 2.1726194511797052, + "learning_rate": 3.1438215417386377e-06, + "loss": 0.9629, + "step": 4024 + }, + { + "epoch": 0.4233110284354582, + "grad_norm": 2.3856593069057714, + "learning_rate": 3.1430148442816112e-06, + "loss": 0.9939, + "step": 4025 + }, + { + "epoch": 0.4234161988773055, + "grad_norm": 3.3977729251669184, + "learning_rate": 3.1422080751234342e-06, + "loss": 1.0636, + "step": 4026 + }, + { + "epoch": 0.42352136931915285, + "grad_norm": 1.9288928690265361, + "learning_rate": 3.1414012343540667e-06, + "loss": 0.9848, + "step": 4027 + }, + { + "epoch": 0.4236265397610002, + "grad_norm": 1.9840004899690646, + "learning_rate": 3.1405943220634805e-06, + "loss": 0.9988, + "step": 4028 + }, + { + "epoch": 0.4237317102028475, + "grad_norm": 2.442923625799874, + "learning_rate": 3.1397873383416503e-06, + "loss": 1.0168, + "step": 4029 + }, + { + "epoch": 0.4238368806446948, + "grad_norm": 2.739607689137626, + "learning_rate": 3.138980283278562e-06, + "loss": 1.0001, + "step": 4030 + }, + { + "epoch": 0.42394205108654215, + "grad_norm": 1.8732837933056634, + "learning_rate": 3.1381731569642093e-06, + "loss": 1.0149, + "step": 4031 + }, + { + "epoch": 0.4240472215283895, + "grad_norm": 2.1861276788817694, + "learning_rate": 3.137365959488591e-06, + "loss": 1.0425, + "step": 4032 + }, + { + "epoch": 0.42415239197023674, + "grad_norm": 2.227889990859692, + "learning_rate": 3.1365586909417177e-06, + "loss": 1.0049, + "step": 4033 + }, + { + "epoch": 0.42425756241208407, + "grad_norm": 2.401704867671759, + "learning_rate": 3.1357513514136044e-06, + "loss": 1.0011, + "step": 4034 + }, + { + "epoch": 0.4243627328539314, + "grad_norm": 2.572582774921797, + "learning_rate": 3.1349439409942778e-06, + "loss": 1.0292, + "step": 4035 + }, + { + "epoch": 0.4244679032957787, + "grad_norm": 2.464307334674958, + "learning_rate": 3.1341364597737684e-06, + "loss": 1.0012, + "step": 4036 + }, + { + "epoch": 0.42457307373762604, + "grad_norm": 2.1963762256824078, + "learning_rate": 3.133328907842118e-06, + "loss": 0.9715, + "step": 4037 + }, + { + "epoch": 0.42467824417947336, + "grad_norm": 3.0073856142249267, + "learning_rate": 3.132521285289375e-06, + "loss": 0.9854, + "step": 4038 + }, + { + "epoch": 0.4247834146213207, + "grad_norm": 2.7319358749557625, + "learning_rate": 3.1317135922055954e-06, + "loss": 1.0202, + "step": 4039 + }, + { + "epoch": 0.424888585063168, + "grad_norm": 1.7350134734136733, + "learning_rate": 3.130905828680842e-06, + "loss": 0.9956, + "step": 4040 + }, + { + "epoch": 0.42499375550501534, + "grad_norm": 2.3777384845962257, + "learning_rate": 3.1300979948051874e-06, + "loss": 1.0045, + "step": 4041 + }, + { + "epoch": 0.42509892594686266, + "grad_norm": 2.601237373112988, + "learning_rate": 3.1292900906687123e-06, + "loss": 0.9874, + "step": 4042 + }, + { + "epoch": 0.42520409638870993, + "grad_norm": 2.2851187750948054, + "learning_rate": 3.128482116361503e-06, + "loss": 0.9632, + "step": 4043 + }, + { + "epoch": 0.42530926683055725, + "grad_norm": 1.7435113684957382, + "learning_rate": 3.1276740719736565e-06, + "loss": 1.0032, + "step": 4044 + }, + { + "epoch": 0.4254144372724046, + "grad_norm": 2.623135138337169, + "learning_rate": 3.126865957595274e-06, + "loss": 1.0326, + "step": 4045 + }, + { + "epoch": 0.4255196077142519, + "grad_norm": 
3.0108763917408683, + "learning_rate": 3.126057773316469e-06, + "loss": 0.9891, + "step": 4046 + }, + { + "epoch": 0.4256247781560992, + "grad_norm": 2.580885149154926, + "learning_rate": 3.1252495192273575e-06, + "loss": 1.0105, + "step": 4047 + }, + { + "epoch": 0.42572994859794655, + "grad_norm": 2.1650235060129694, + "learning_rate": 3.1244411954180677e-06, + "loss": 1.033, + "step": 4048 + }, + { + "epoch": 0.4258351190397939, + "grad_norm": 2.756079053013681, + "learning_rate": 3.1236328019787338e-06, + "loss": 0.9757, + "step": 4049 + }, + { + "epoch": 0.4259402894816412, + "grad_norm": 2.9476755695727452, + "learning_rate": 3.1228243389994976e-06, + "loss": 0.9734, + "step": 4050 + }, + { + "epoch": 0.4260454599234885, + "grad_norm": 1.816970545742394, + "learning_rate": 3.1220158065705104e-06, + "loss": 0.9716, + "step": 4051 + }, + { + "epoch": 0.42615063036533585, + "grad_norm": 2.1446993232730724, + "learning_rate": 3.1212072047819276e-06, + "loss": 0.9774, + "step": 4052 + }, + { + "epoch": 0.4262558008071831, + "grad_norm": 3.315132448487326, + "learning_rate": 3.1203985337239174e-06, + "loss": 1.0018, + "step": 4053 + }, + { + "epoch": 0.42636097124903044, + "grad_norm": 2.3941591823008643, + "learning_rate": 3.1195897934866504e-06, + "loss": 0.9595, + "step": 4054 + }, + { + "epoch": 0.42646614169087776, + "grad_norm": 2.0757761997376627, + "learning_rate": 3.118780984160307e-06, + "loss": 0.9928, + "step": 4055 + }, + { + "epoch": 0.4265713121327251, + "grad_norm": 1.8275624288227292, + "learning_rate": 3.1179721058350787e-06, + "loss": 1.0441, + "step": 4056 + }, + { + "epoch": 0.4266764825745724, + "grad_norm": 2.5156642086205534, + "learning_rate": 3.117163158601159e-06, + "loss": 1.0065, + "step": 4057 + }, + { + "epoch": 0.42678165301641974, + "grad_norm": 2.851734935278207, + "learning_rate": 3.1163541425487535e-06, + "loss": 0.9887, + "step": 4058 + }, + { + "epoch": 0.42688682345826706, + "grad_norm": 2.4798006522437617, + "learning_rate": 3.1155450577680725e-06, + "loss": 1.0047, + "step": 4059 + }, + { + "epoch": 0.4269919939001144, + "grad_norm": 1.5625163069318968, + "learning_rate": 3.1147359043493365e-06, + "loss": 0.96, + "step": 4060 + }, + { + "epoch": 0.4270971643419617, + "grad_norm": 2.125796790580701, + "learning_rate": 3.113926682382771e-06, + "loss": 0.9713, + "step": 4061 + }, + { + "epoch": 0.42720233478380903, + "grad_norm": 2.7077037498810754, + "learning_rate": 3.113117391958612e-06, + "loss": 0.995, + "step": 4062 + }, + { + "epoch": 0.4273075052256563, + "grad_norm": 2.611706521027208, + "learning_rate": 3.1123080331671015e-06, + "loss": 1.0046, + "step": 4063 + }, + { + "epoch": 0.4274126756675036, + "grad_norm": 1.909431132438331, + "learning_rate": 3.111498606098487e-06, + "loss": 1.0073, + "step": 4064 + }, + { + "epoch": 0.42751784610935095, + "grad_norm": 2.7980952079855665, + "learning_rate": 3.110689110843029e-06, + "loss": 1.0122, + "step": 4065 + }, + { + "epoch": 0.4276230165511983, + "grad_norm": 2.1381354002981037, + "learning_rate": 3.10987954749099e-06, + "loss": 0.9777, + "step": 4066 + }, + { + "epoch": 0.4277281869930456, + "grad_norm": 3.062933496654773, + "learning_rate": 3.1090699161326442e-06, + "loss": 1.0339, + "step": 4067 + }, + { + "epoch": 0.4278333574348929, + "grad_norm": 2.555658076041788, + "learning_rate": 3.108260216858272e-06, + "loss": 1.0222, + "step": 4068 + }, + { + "epoch": 0.42793852787674025, + "grad_norm": 2.924322622102227, + "learning_rate": 3.10745044975816e-06, + "loss": 0.9848, + "step": 4069 + 
}, + { + "epoch": 0.4280436983185876, + "grad_norm": 2.7919031185411525, + "learning_rate": 3.1066406149226046e-06, + "loss": 1.026, + "step": 4070 + }, + { + "epoch": 0.4281488687604349, + "grad_norm": 2.3674479918799087, + "learning_rate": 3.105830712441907e-06, + "loss": 1.0281, + "step": 4071 + }, + { + "epoch": 0.4282540392022822, + "grad_norm": 2.258912241147774, + "learning_rate": 3.1050207424063793e-06, + "loss": 0.9928, + "step": 4072 + }, + { + "epoch": 0.4283592096441295, + "grad_norm": 2.831893269330568, + "learning_rate": 3.1042107049063385e-06, + "loss": 1.011, + "step": 4073 + }, + { + "epoch": 0.4284643800859768, + "grad_norm": 2.9136787478525115, + "learning_rate": 3.103400600032111e-06, + "loss": 0.9885, + "step": 4074 + }, + { + "epoch": 0.42856955052782414, + "grad_norm": 2.5104840959586308, + "learning_rate": 3.1025904278740286e-06, + "loss": 1.0101, + "step": 4075 + }, + { + "epoch": 0.42867472096967146, + "grad_norm": 2.831638306335408, + "learning_rate": 3.1017801885224332e-06, + "loss": 0.9507, + "step": 4076 + }, + { + "epoch": 0.4287798914115188, + "grad_norm": 2.2190444522869814, + "learning_rate": 3.1009698820676714e-06, + "loss": 0.9653, + "step": 4077 + }, + { + "epoch": 0.4288850618533661, + "grad_norm": 3.176813249530208, + "learning_rate": 3.100159508600099e-06, + "loss": 0.9871, + "step": 4078 + }, + { + "epoch": 0.42899023229521344, + "grad_norm": 3.3715464732818154, + "learning_rate": 3.0993490682100797e-06, + "loss": 1.0664, + "step": 4079 + }, + { + "epoch": 0.42909540273706076, + "grad_norm": 2.187534155940015, + "learning_rate": 3.0985385609879832e-06, + "loss": 1.0389, + "step": 4080 + }, + { + "epoch": 0.4292005731789081, + "grad_norm": 1.6249601238371671, + "learning_rate": 3.097727987024187e-06, + "loss": 1.0206, + "step": 4081 + }, + { + "epoch": 0.4293057436207554, + "grad_norm": 2.028832037009087, + "learning_rate": 3.096917346409078e-06, + "loss": 1.0082, + "step": 4082 + }, + { + "epoch": 0.42941091406260273, + "grad_norm": 2.5626702289369585, + "learning_rate": 3.0961066392330475e-06, + "loss": 1.0013, + "step": 4083 + }, + { + "epoch": 0.42951608450445, + "grad_norm": 2.6483344911125952, + "learning_rate": 3.0952958655864957e-06, + "loss": 0.9299, + "step": 4084 + }, + { + "epoch": 0.4296212549462973, + "grad_norm": 2.240406701709471, + "learning_rate": 3.0944850255598307e-06, + "loss": 1.0073, + "step": 4085 + }, + { + "epoch": 0.42972642538814465, + "grad_norm": 3.2212875181907155, + "learning_rate": 3.0936741192434673e-06, + "loss": 1.0506, + "step": 4086 + }, + { + "epoch": 0.429831595829992, + "grad_norm": 2.8050004911994466, + "learning_rate": 3.0928631467278275e-06, + "loss": 1.045, + "step": 4087 + }, + { + "epoch": 0.4299367662718393, + "grad_norm": 2.3637943800552037, + "learning_rate": 3.0920521081033418e-06, + "loss": 1.0016, + "step": 4088 + }, + { + "epoch": 0.4300419367136866, + "grad_norm": 2.4022684297375942, + "learning_rate": 3.091241003460446e-06, + "loss": 1.0081, + "step": 4089 + }, + { + "epoch": 0.43014710715553395, + "grad_norm": 2.6213335210562643, + "learning_rate": 3.0904298328895865e-06, + "loss": 1.0066, + "step": 4090 + }, + { + "epoch": 0.43025227759738127, + "grad_norm": 2.405980637873229, + "learning_rate": 3.089618596481213e-06, + "loss": 0.9793, + "step": 4091 + }, + { + "epoch": 0.4303574480392286, + "grad_norm": 2.452115188274156, + "learning_rate": 3.088807294325786e-06, + "loss": 0.93, + "step": 4092 + }, + { + "epoch": 0.4304626184810759, + "grad_norm": 2.0884019140145065, + "learning_rate": 
3.0879959265137722e-06, + "loss": 0.9783, + "step": 4093 + }, + { + "epoch": 0.4305677889229232, + "grad_norm": 2.9202655391972714, + "learning_rate": 3.0871844931356437e-06, + "loss": 0.9959, + "step": 4094 + }, + { + "epoch": 0.4306729593647705, + "grad_norm": 2.0360566333873136, + "learning_rate": 3.0863729942818835e-06, + "loss": 0.985, + "step": 4095 + }, + { + "epoch": 0.43077812980661784, + "grad_norm": 2.203833572789431, + "learning_rate": 3.08556143004298e-06, + "loss": 1.0127, + "step": 4096 + }, + { + "epoch": 0.43088330024846516, + "grad_norm": 3.060914354625758, + "learning_rate": 3.0847498005094277e-06, + "loss": 1.0552, + "step": 4097 + }, + { + "epoch": 0.4309884706903125, + "grad_norm": 2.127677108709688, + "learning_rate": 3.0839381057717295e-06, + "loss": 1.027, + "step": 4098 + }, + { + "epoch": 0.4310936411321598, + "grad_norm": 2.185558039100322, + "learning_rate": 3.083126345920397e-06, + "loss": 0.9747, + "step": 4099 + }, + { + "epoch": 0.43119881157400713, + "grad_norm": 2.502125904858561, + "learning_rate": 3.082314521045947e-06, + "loss": 0.9718, + "step": 4100 + }, + { + "epoch": 0.43130398201585446, + "grad_norm": 2.249386844041367, + "learning_rate": 3.081502631238904e-06, + "loss": 1.0284, + "step": 4101 + }, + { + "epoch": 0.4314091524577018, + "grad_norm": 2.510215362561653, + "learning_rate": 3.080690676589801e-06, + "loss": 1.0115, + "step": 4102 + }, + { + "epoch": 0.4315143228995491, + "grad_norm": 2.540632786506748, + "learning_rate": 3.0798786571891754e-06, + "loss": 1.0009, + "step": 4103 + }, + { + "epoch": 0.4316194933413964, + "grad_norm": 3.268212533873219, + "learning_rate": 3.0790665731275764e-06, + "loss": 1.0034, + "step": 4104 + }, + { + "epoch": 0.4317246637832437, + "grad_norm": 2.2631828866412564, + "learning_rate": 3.0782544244955546e-06, + "loss": 0.986, + "step": 4105 + }, + { + "epoch": 0.431829834225091, + "grad_norm": 2.8101017497715945, + "learning_rate": 3.077442211383674e-06, + "loss": 1.0227, + "step": 4106 + }, + { + "epoch": 0.43193500466693835, + "grad_norm": 2.590129436859189, + "learning_rate": 3.0766299338825003e-06, + "loss": 0.9566, + "step": 4107 + }, + { + "epoch": 0.4320401751087857, + "grad_norm": 2.6443948940324655, + "learning_rate": 3.0758175920826096e-06, + "loss": 0.9694, + "step": 4108 + }, + { + "epoch": 0.432145345550633, + "grad_norm": 3.141552378052775, + "learning_rate": 3.075005186074584e-06, + "loss": 1.0031, + "step": 4109 + }, + { + "epoch": 0.4322505159924803, + "grad_norm": 3.8498143175202366, + "learning_rate": 3.0741927159490133e-06, + "loss": 1.042, + "step": 4110 + }, + { + "epoch": 0.43235568643432765, + "grad_norm": 1.586336922468649, + "learning_rate": 3.073380181796495e-06, + "loss": 0.9673, + "step": 4111 + }, + { + "epoch": 0.43246085687617497, + "grad_norm": 3.3046763352664676, + "learning_rate": 3.072567583707632e-06, + "loss": 0.9643, + "step": 4112 + }, + { + "epoch": 0.4325660273180223, + "grad_norm": 3.225434065889587, + "learning_rate": 3.071754921773035e-06, + "loss": 1.0441, + "step": 4113 + }, + { + "epoch": 0.43267119775986956, + "grad_norm": 2.4386638186184277, + "learning_rate": 3.070942196083323e-06, + "loss": 0.9961, + "step": 4114 + }, + { + "epoch": 0.4327763682017169, + "grad_norm": 2.2998360002188307, + "learning_rate": 3.070129406729121e-06, + "loss": 1.0012, + "step": 4115 + }, + { + "epoch": 0.4328815386435642, + "grad_norm": 2.2206282124156593, + "learning_rate": 3.0693165538010617e-06, + "loss": 1.001, + "step": 4116 + }, + { + "epoch": 0.43298670908541154, + 
"grad_norm": 2.616939405371355, + "learning_rate": 3.0685036373897833e-06, + "loss": 0.9975, + "step": 4117 + }, + { + "epoch": 0.43309187952725886, + "grad_norm": 2.1392510552663477, + "learning_rate": 3.0676906575859335e-06, + "loss": 1.0204, + "step": 4118 + }, + { + "epoch": 0.4331970499691062, + "grad_norm": 2.4770512512838523, + "learning_rate": 3.0668776144801653e-06, + "loss": 1.0116, + "step": 4119 + }, + { + "epoch": 0.4333022204109535, + "grad_norm": 2.7619726953432227, + "learning_rate": 3.0660645081631396e-06, + "loss": 1.0338, + "step": 4120 + }, + { + "epoch": 0.43340739085280083, + "grad_norm": 3.6075982772582718, + "learning_rate": 3.0652513387255227e-06, + "loss": 0.9864, + "step": 4121 + }, + { + "epoch": 0.43351256129464816, + "grad_norm": 2.0025698189454344, + "learning_rate": 3.064438106257992e-06, + "loss": 1.0167, + "step": 4122 + }, + { + "epoch": 0.4336177317364955, + "grad_norm": 2.4436854317627286, + "learning_rate": 3.063624810851227e-06, + "loss": 1.0045, + "step": 4123 + }, + { + "epoch": 0.43372290217834275, + "grad_norm": 2.485396787287376, + "learning_rate": 3.0628114525959175e-06, + "loss": 1.001, + "step": 4124 + }, + { + "epoch": 0.4338280726201901, + "grad_norm": 3.1025874629215706, + "learning_rate": 3.0619980315827585e-06, + "loss": 1.0026, + "step": 4125 + }, + { + "epoch": 0.4339332430620374, + "grad_norm": 3.037225307324419, + "learning_rate": 3.061184547902454e-06, + "loss": 0.9949, + "step": 4126 + }, + { + "epoch": 0.4340384135038847, + "grad_norm": 2.8257071700615133, + "learning_rate": 3.060371001645713e-06, + "loss": 1.0168, + "step": 4127 + }, + { + "epoch": 0.43414358394573205, + "grad_norm": 1.4769811886831412, + "learning_rate": 3.0595573929032513e-06, + "loss": 0.9953, + "step": 4128 + }, + { + "epoch": 0.43424875438757937, + "grad_norm": 2.3056756386496047, + "learning_rate": 3.0587437217657937e-06, + "loss": 0.9979, + "step": 4129 + }, + { + "epoch": 0.4343539248294267, + "grad_norm": 2.530987884033079, + "learning_rate": 3.057929988324071e-06, + "loss": 1.0052, + "step": 4130 + }, + { + "epoch": 0.434459095271274, + "grad_norm": 2.669533462808313, + "learning_rate": 3.0571161926688204e-06, + "loss": 1.0023, + "step": 4131 + }, + { + "epoch": 0.43456426571312134, + "grad_norm": 2.0814334987484417, + "learning_rate": 3.056302334890786e-06, + "loss": 0.9791, + "step": 4132 + }, + { + "epoch": 0.43466943615496867, + "grad_norm": 2.8696601264581654, + "learning_rate": 3.0554884150807208e-06, + "loss": 1.0174, + "step": 4133 + }, + { + "epoch": 0.43477460659681594, + "grad_norm": 2.2470344072413533, + "learning_rate": 3.0546744333293815e-06, + "loss": 0.974, + "step": 4134 + }, + { + "epoch": 0.43487977703866326, + "grad_norm": 2.6293475640576642, + "learning_rate": 3.053860389727534e-06, + "loss": 1.0178, + "step": 4135 + }, + { + "epoch": 0.4349849474805106, + "grad_norm": 2.023015252677748, + "learning_rate": 3.0530462843659504e-06, + "loss": 0.991, + "step": 4136 + }, + { + "epoch": 0.4350901179223579, + "grad_norm": 3.028146066432831, + "learning_rate": 3.0522321173354095e-06, + "loss": 1.0114, + "step": 4137 + }, + { + "epoch": 0.43519528836420523, + "grad_norm": 2.198855179319069, + "learning_rate": 3.0514178887266986e-06, + "loss": 1.0585, + "step": 4138 + }, + { + "epoch": 0.43530045880605256, + "grad_norm": 1.9696831755080155, + "learning_rate": 3.0506035986306095e-06, + "loss": 1.0022, + "step": 4139 + }, + { + "epoch": 0.4354056292478999, + "grad_norm": 3.374841364809197, + "learning_rate": 3.0497892471379415e-06, + "loss": 
1.0142, + "step": 4140 + }, + { + "epoch": 0.4355107996897472, + "grad_norm": 1.8557258077383334, + "learning_rate": 3.0489748343395013e-06, + "loss": 0.9776, + "step": 4141 + }, + { + "epoch": 0.43561597013159453, + "grad_norm": 2.47059315827509, + "learning_rate": 3.0481603603261023e-06, + "loss": 1.0122, + "step": 4142 + }, + { + "epoch": 0.43572114057344186, + "grad_norm": 2.7641667162449974, + "learning_rate": 3.0473458251885658e-06, + "loss": 0.9913, + "step": 4143 + }, + { + "epoch": 0.4358263110152892, + "grad_norm": 1.910330796679931, + "learning_rate": 3.0465312290177166e-06, + "loss": 1.0036, + "step": 4144 + }, + { + "epoch": 0.43593148145713645, + "grad_norm": 2.118224274596519, + "learning_rate": 3.0457165719043906e-06, + "loss": 1.0305, + "step": 4145 + }, + { + "epoch": 0.4360366518989838, + "grad_norm": 3.610473332887411, + "learning_rate": 3.0449018539394274e-06, + "loss": 1.0083, + "step": 4146 + }, + { + "epoch": 0.4361418223408311, + "grad_norm": 2.3957683382072847, + "learning_rate": 3.044087075213675e-06, + "loss": 0.9799, + "step": 4147 + }, + { + "epoch": 0.4362469927826784, + "grad_norm": 2.6043783680389865, + "learning_rate": 3.0432722358179873e-06, + "loss": 1.0203, + "step": 4148 + }, + { + "epoch": 0.43635216322452575, + "grad_norm": 2.0797439186751654, + "learning_rate": 3.0424573358432243e-06, + "loss": 0.9899, + "step": 4149 + }, + { + "epoch": 0.43645733366637307, + "grad_norm": 2.5250166716374918, + "learning_rate": 3.041642375380254e-06, + "loss": 1.0332, + "step": 4150 + }, + { + "epoch": 0.4365625041082204, + "grad_norm": 2.4929052295492213, + "learning_rate": 3.0408273545199517e-06, + "loss": 1.0466, + "step": 4151 + }, + { + "epoch": 0.4366676745500677, + "grad_norm": 1.9109696456025282, + "learning_rate": 3.0400122733531984e-06, + "loss": 0.9933, + "step": 4152 + }, + { + "epoch": 0.43677284499191504, + "grad_norm": 1.9084755489133232, + "learning_rate": 3.039197131970881e-06, + "loss": 0.986, + "step": 4153 + }, + { + "epoch": 0.43687801543376237, + "grad_norm": 2.669040697159479, + "learning_rate": 3.0383819304638953e-06, + "loss": 1.0244, + "step": 4154 + }, + { + "epoch": 0.43698318587560964, + "grad_norm": 2.3914620652281022, + "learning_rate": 3.0375666689231424e-06, + "loss": 1.0093, + "step": 4155 + }, + { + "epoch": 0.43708835631745696, + "grad_norm": 3.002326215365401, + "learning_rate": 3.0367513474395293e-06, + "loss": 1.0092, + "step": 4156 + }, + { + "epoch": 0.4371935267593043, + "grad_norm": 2.611694901531462, + "learning_rate": 3.035935966103972e-06, + "loss": 0.9976, + "step": 4157 + }, + { + "epoch": 0.4372986972011516, + "grad_norm": 3.2906065240695264, + "learning_rate": 3.0351205250073897e-06, + "loss": 1.0235, + "step": 4158 + }, + { + "epoch": 0.43740386764299893, + "grad_norm": 2.7111569861223654, + "learning_rate": 3.034305024240713e-06, + "loss": 0.9998, + "step": 4159 + }, + { + "epoch": 0.43750903808484626, + "grad_norm": 2.8104041489630553, + "learning_rate": 3.0334894638948753e-06, + "loss": 0.9441, + "step": 4160 + }, + { + "epoch": 0.4376142085266936, + "grad_norm": 2.088309040465824, + "learning_rate": 3.0326738440608176e-06, + "loss": 1.0236, + "step": 4161 + }, + { + "epoch": 0.4377193789685409, + "grad_norm": 2.4797799924774955, + "learning_rate": 3.031858164829489e-06, + "loss": 0.9917, + "step": 4162 + }, + { + "epoch": 0.43782454941038823, + "grad_norm": 2.2176787604079258, + "learning_rate": 3.0310424262918437e-06, + "loss": 0.9509, + "step": 4163 + }, + { + "epoch": 0.43792971985223555, + "grad_norm": 
2.2461741633185994, + "learning_rate": 3.030226628538843e-06, + "loss": 0.9834, + "step": 4164 + }, + { + "epoch": 0.4380348902940828, + "grad_norm": 3.2683363657888775, + "learning_rate": 3.0294107716614522e-06, + "loss": 0.9626, + "step": 4165 + }, + { + "epoch": 0.43814006073593015, + "grad_norm": 2.101521859677799, + "learning_rate": 3.0285948557506497e-06, + "loss": 1.0173, + "step": 4166 + }, + { + "epoch": 0.4382452311777775, + "grad_norm": 2.5072346713043774, + "learning_rate": 3.027778880897413e-06, + "loss": 0.9717, + "step": 4167 + }, + { + "epoch": 0.4383504016196248, + "grad_norm": 2.886458825011365, + "learning_rate": 3.026962847192732e-06, + "loss": 0.9941, + "step": 4168 + }, + { + "epoch": 0.4384555720614721, + "grad_norm": 2.250039487973679, + "learning_rate": 3.0261467547276e-06, + "loss": 0.999, + "step": 4169 + }, + { + "epoch": 0.43856074250331945, + "grad_norm": 3.166565115166139, + "learning_rate": 3.0253306035930173e-06, + "loss": 0.9735, + "step": 4170 + }, + { + "epoch": 0.43866591294516677, + "grad_norm": 2.798487567270618, + "learning_rate": 3.024514393879992e-06, + "loss": 1.0038, + "step": 4171 + }, + { + "epoch": 0.4387710833870141, + "grad_norm": 2.7668838586778435, + "learning_rate": 3.0236981256795367e-06, + "loss": 1.0429, + "step": 4172 + }, + { + "epoch": 0.4388762538288614, + "grad_norm": 2.3144035959674554, + "learning_rate": 3.022881799082672e-06, + "loss": 0.9873, + "step": 4173 + }, + { + "epoch": 0.43898142427070874, + "grad_norm": 2.492637324980567, + "learning_rate": 3.022065414180425e-06, + "loss": 0.994, + "step": 4174 + }, + { + "epoch": 0.439086594712556, + "grad_norm": 2.551661508974668, + "learning_rate": 3.021248971063829e-06, + "loss": 1.0159, + "step": 4175 + }, + { + "epoch": 0.43919176515440334, + "grad_norm": 2.0455759141104584, + "learning_rate": 3.0204324698239236e-06, + "loss": 1.0531, + "step": 4176 + }, + { + "epoch": 0.43929693559625066, + "grad_norm": 2.798083383983236, + "learning_rate": 3.019615910551755e-06, + "loss": 0.9885, + "step": 4177 + }, + { + "epoch": 0.439402106038098, + "grad_norm": 1.9753015864782129, + "learning_rate": 3.0187992933383754e-06, + "loss": 0.981, + "step": 4178 + }, + { + "epoch": 0.4395072764799453, + "grad_norm": 2.0935177190552077, + "learning_rate": 3.017982618274844e-06, + "loss": 0.9449, + "step": 4179 + }, + { + "epoch": 0.43961244692179263, + "grad_norm": 2.3446418568178315, + "learning_rate": 3.0171658854522274e-06, + "loss": 1.0141, + "step": 4180 + }, + { + "epoch": 0.43971761736363996, + "grad_norm": 1.3397303520976527, + "learning_rate": 3.016349094961597e-06, + "loss": 1.0155, + "step": 4181 + }, + { + "epoch": 0.4398227878054873, + "grad_norm": 2.58181226066027, + "learning_rate": 3.015532246894031e-06, + "loss": 1.0046, + "step": 4182 + }, + { + "epoch": 0.4399279582473346, + "grad_norm": 2.4829085003143923, + "learning_rate": 3.0147153413406154e-06, + "loss": 1.0414, + "step": 4183 + }, + { + "epoch": 0.44003312868918193, + "grad_norm": 3.42576868411267, + "learning_rate": 3.013898378392441e-06, + "loss": 1.0368, + "step": 4184 + }, + { + "epoch": 0.4401382991310292, + "grad_norm": 2.252207757095223, + "learning_rate": 3.0130813581406044e-06, + "loss": 1.0187, + "step": 4185 + }, + { + "epoch": 0.4402434695728765, + "grad_norm": 2.5342374508667707, + "learning_rate": 3.0122642806762114e-06, + "loss": 0.9934, + "step": 4186 + }, + { + "epoch": 0.44034864001472385, + "grad_norm": 2.8671681025292246, + "learning_rate": 3.0114471460903714e-06, + "loss": 0.9852, + "step": 4187 + }, + { 
+ "epoch": 0.44045381045657117, + "grad_norm": 2.5693510866729388, + "learning_rate": 3.0106299544742013e-06, + "loss": 1.0354, + "step": 4188 + }, + { + "epoch": 0.4405589808984185, + "grad_norm": 3.364495917648472, + "learning_rate": 3.0098127059188247e-06, + "loss": 1.0339, + "step": 4189 + }, + { + "epoch": 0.4406641513402658, + "grad_norm": 2.7990251477607146, + "learning_rate": 3.0089954005153706e-06, + "loss": 1.0116, + "step": 4190 + }, + { + "epoch": 0.44076932178211314, + "grad_norm": 1.7727662574506695, + "learning_rate": 3.0081780383549763e-06, + "loss": 1.012, + "step": 4191 + }, + { + "epoch": 0.44087449222396047, + "grad_norm": 3.2222272812715245, + "learning_rate": 3.0073606195287825e-06, + "loss": 0.9633, + "step": 4192 + }, + { + "epoch": 0.4409796626658078, + "grad_norm": 3.6290145756150722, + "learning_rate": 3.0065431441279386e-06, + "loss": 1.0499, + "step": 4193 + }, + { + "epoch": 0.4410848331076551, + "grad_norm": 3.1982957510410515, + "learning_rate": 3.005725612243599e-06, + "loss": 0.9487, + "step": 4194 + }, + { + "epoch": 0.4411900035495024, + "grad_norm": 2.8434896830214123, + "learning_rate": 3.0049080239669243e-06, + "loss": 1.0009, + "step": 4195 + }, + { + "epoch": 0.4412951739913497, + "grad_norm": 2.523909980989135, + "learning_rate": 3.0040903793890834e-06, + "loss": 1.0032, + "step": 4196 + }, + { + "epoch": 0.44140034443319703, + "grad_norm": 2.523114131380918, + "learning_rate": 3.003272678601249e-06, + "loss": 1.0288, + "step": 4197 + }, + { + "epoch": 0.44150551487504436, + "grad_norm": 2.8908811657988247, + "learning_rate": 3.002454921694602e-06, + "loss": 1.0062, + "step": 4198 + }, + { + "epoch": 0.4416106853168917, + "grad_norm": 2.7991620994148376, + "learning_rate": 3.0016371087603274e-06, + "loss": 1.0093, + "step": 4199 + }, + { + "epoch": 0.441715855758739, + "grad_norm": 2.6659000123176773, + "learning_rate": 3.0008192398896185e-06, + "loss": 1.0206, + "step": 4200 + }, + { + "epoch": 0.44182102620058633, + "grad_norm": 2.2057443650750863, + "learning_rate": 3.0000013151736747e-06, + "loss": 1.0263, + "step": 4201 + }, + { + "epoch": 0.44192619664243366, + "grad_norm": 2.7169424176961896, + "learning_rate": 2.999183334703699e-06, + "loss": 1.0041, + "step": 4202 + }, + { + "epoch": 0.442031367084281, + "grad_norm": 2.950358292631194, + "learning_rate": 2.9983652985709037e-06, + "loss": 0.9955, + "step": 4203 + }, + { + "epoch": 0.4421365375261283, + "grad_norm": 2.806264087299609, + "learning_rate": 2.9975472068665063e-06, + "loss": 1.0283, + "step": 4204 + }, + { + "epoch": 0.44224170796797563, + "grad_norm": 2.711842000947828, + "learning_rate": 2.9967290596817308e-06, + "loss": 1.0311, + "step": 4205 + }, + { + "epoch": 0.4423468784098229, + "grad_norm": 2.7709733812366664, + "learning_rate": 2.9959108571078056e-06, + "loss": 0.9947, + "step": 4206 + }, + { + "epoch": 0.4424520488516702, + "grad_norm": 3.2885009923549515, + "learning_rate": 2.995092599235968e-06, + "loss": 1.0179, + "step": 4207 + }, + { + "epoch": 0.44255721929351755, + "grad_norm": 2.955161655367558, + "learning_rate": 2.994274286157459e-06, + "loss": 0.9709, + "step": 4208 + }, + { + "epoch": 0.44266238973536487, + "grad_norm": 2.953888729382429, + "learning_rate": 2.9934559179635282e-06, + "loss": 1.0252, + "step": 4209 + }, + { + "epoch": 0.4427675601772122, + "grad_norm": 3.003042556969862, + "learning_rate": 2.992637494745429e-06, + "loss": 0.9944, + "step": 4210 + }, + { + "epoch": 0.4428727306190595, + "grad_norm": 2.7657547018738637, + "learning_rate": 
2.9918190165944217e-06, + "loss": 0.9603, + "step": 4211 + }, + { + "epoch": 0.44297790106090684, + "grad_norm": 2.0161297281440613, + "learning_rate": 2.991000483601774e-06, + "loss": 1.0033, + "step": 4212 + }, + { + "epoch": 0.44308307150275417, + "grad_norm": 2.023154902922774, + "learning_rate": 2.9901818958587587e-06, + "loss": 0.9801, + "step": 4213 + }, + { + "epoch": 0.4431882419446015, + "grad_norm": 1.8645927902550326, + "learning_rate": 2.9893632534566534e-06, + "loss": 1.0054, + "step": 4214 + }, + { + "epoch": 0.4432934123864488, + "grad_norm": 3.5301225857507417, + "learning_rate": 2.9885445564867438e-06, + "loss": 0.9769, + "step": 4215 + }, + { + "epoch": 0.4433985828282961, + "grad_norm": 2.8515371337141744, + "learning_rate": 2.9877258050403214e-06, + "loss": 0.9968, + "step": 4216 + }, + { + "epoch": 0.4435037532701434, + "grad_norm": 2.404740251256232, + "learning_rate": 2.9869069992086825e-06, + "loss": 0.9942, + "step": 4217 + }, + { + "epoch": 0.44360892371199073, + "grad_norm": 2.046523753796662, + "learning_rate": 2.9860881390831303e-06, + "loss": 1.0065, + "step": 4218 + }, + { + "epoch": 0.44371409415383806, + "grad_norm": 3.103804612920373, + "learning_rate": 2.985269224754975e-06, + "loss": 1.061, + "step": 4219 + }, + { + "epoch": 0.4438192645956854, + "grad_norm": 2.221775249113264, + "learning_rate": 2.9844502563155324e-06, + "loss": 0.9729, + "step": 4220 + }, + { + "epoch": 0.4439244350375327, + "grad_norm": 2.36487818709531, + "learning_rate": 2.9836312338561223e-06, + "loss": 1.0165, + "step": 4221 + }, + { + "epoch": 0.44402960547938003, + "grad_norm": 2.5320619109662053, + "learning_rate": 2.9828121574680717e-06, + "loss": 1.0447, + "step": 4222 + }, + { + "epoch": 0.44413477592122735, + "grad_norm": 2.2988633433443515, + "learning_rate": 2.9819930272427162e-06, + "loss": 0.9668, + "step": 4223 + }, + { + "epoch": 0.4442399463630747, + "grad_norm": 2.9595824678582545, + "learning_rate": 2.981173843271393e-06, + "loss": 1.015, + "step": 4224 + }, + { + "epoch": 0.444345116804922, + "grad_norm": 2.950867108730787, + "learning_rate": 2.9803546056454487e-06, + "loss": 1.0077, + "step": 4225 + }, + { + "epoch": 0.44445028724676927, + "grad_norm": 1.6132738036228578, + "learning_rate": 2.9795353144562344e-06, + "loss": 0.9943, + "step": 4226 + }, + { + "epoch": 0.4445554576886166, + "grad_norm": 2.7813805140120333, + "learning_rate": 2.978715969795108e-06, + "loss": 0.9776, + "step": 4227 + }, + { + "epoch": 0.4446606281304639, + "grad_norm": 2.562516330449887, + "learning_rate": 2.9778965717534314e-06, + "loss": 0.9551, + "step": 4228 + }, + { + "epoch": 0.44476579857231124, + "grad_norm": 3.1737333083955583, + "learning_rate": 2.9770771204225744e-06, + "loss": 0.9925, + "step": 4229 + }, + { + "epoch": 0.44487096901415857, + "grad_norm": 1.8507674535018093, + "learning_rate": 2.9762576158939127e-06, + "loss": 1.0234, + "step": 4230 + }, + { + "epoch": 0.4449761394560059, + "grad_norm": 2.6261455818000408, + "learning_rate": 2.975438058258827e-06, + "loss": 1.0195, + "step": 4231 + }, + { + "epoch": 0.4450813098978532, + "grad_norm": 1.8245806335822785, + "learning_rate": 2.974618447608705e-06, + "loss": 0.9718, + "step": 4232 + }, + { + "epoch": 0.44518648033970054, + "grad_norm": 2.2479753999337815, + "learning_rate": 2.9737987840349393e-06, + "loss": 0.9852, + "step": 4233 + }, + { + "epoch": 0.44529165078154787, + "grad_norm": 2.35768351945659, + "learning_rate": 2.9729790676289276e-06, + "loss": 1.0047, + "step": 4234 + }, + { + "epoch": 
0.4453968212233952, + "grad_norm": 1.6696931548669964, + "learning_rate": 2.972159298482076e-06, + "loss": 0.9698, + "step": 4235 + }, + { + "epoch": 0.44550199166524246, + "grad_norm": 2.0882289069459548, + "learning_rate": 2.971339476685795e-06, + "loss": 0.9712, + "step": 4236 + }, + { + "epoch": 0.4456071621070898, + "grad_norm": 2.093444735376746, + "learning_rate": 2.9705196023315007e-06, + "loss": 1.0109, + "step": 4237 + }, + { + "epoch": 0.4457123325489371, + "grad_norm": 2.5021177606762377, + "learning_rate": 2.9696996755106155e-06, + "loss": 1.0178, + "step": 4238 + }, + { + "epoch": 0.44581750299078443, + "grad_norm": 2.334943268674044, + "learning_rate": 2.968879696314568e-06, + "loss": 0.9909, + "step": 4239 + }, + { + "epoch": 0.44592267343263176, + "grad_norm": 2.3017506240577448, + "learning_rate": 2.968059664834792e-06, + "loss": 1.023, + "step": 4240 + }, + { + "epoch": 0.4460278438744791, + "grad_norm": 2.769841880116001, + "learning_rate": 2.967239581162727e-06, + "loss": 0.9995, + "step": 4241 + }, + { + "epoch": 0.4461330143163264, + "grad_norm": 3.7992104532494775, + "learning_rate": 2.96641944538982e-06, + "loss": 1.0083, + "step": 4242 + }, + { + "epoch": 0.44623818475817373, + "grad_norm": 2.1347949459797393, + "learning_rate": 2.9655992576075198e-06, + "loss": 0.9633, + "step": 4243 + }, + { + "epoch": 0.44634335520002105, + "grad_norm": 2.951928502125178, + "learning_rate": 2.964779017907287e-06, + "loss": 0.9934, + "step": 4244 + }, + { + "epoch": 0.4464485256418684, + "grad_norm": 2.263878251484875, + "learning_rate": 2.9639587263805824e-06, + "loss": 1.0103, + "step": 4245 + }, + { + "epoch": 0.44655369608371565, + "grad_norm": 3.2110964246131064, + "learning_rate": 2.963138383118876e-06, + "loss": 0.9281, + "step": 4246 + }, + { + "epoch": 0.44665886652556297, + "grad_norm": 2.7206342576353966, + "learning_rate": 2.962317988213642e-06, + "loss": 1.0208, + "step": 4247 + }, + { + "epoch": 0.4467640369674103, + "grad_norm": 2.569325435677048, + "learning_rate": 2.961497541756361e-06, + "loss": 0.9949, + "step": 4248 + }, + { + "epoch": 0.4468692074092576, + "grad_norm": 2.0106169690636726, + "learning_rate": 2.960677043838519e-06, + "loss": 1.0168, + "step": 4249 + }, + { + "epoch": 0.44697437785110494, + "grad_norm": 3.047440201325554, + "learning_rate": 2.959856494551608e-06, + "loss": 1.0501, + "step": 4250 + }, + { + "epoch": 0.44707954829295227, + "grad_norm": 2.9897469916545525, + "learning_rate": 2.9590358939871255e-06, + "loss": 1.0254, + "step": 4251 + }, + { + "epoch": 0.4471847187347996, + "grad_norm": 2.0389005489963674, + "learning_rate": 2.9582152422365745e-06, + "loss": 0.9759, + "step": 4252 + }, + { + "epoch": 0.4472898891766469, + "grad_norm": 2.791018390371004, + "learning_rate": 2.957394539391465e-06, + "loss": 1.0193, + "step": 4253 + }, + { + "epoch": 0.44739505961849424, + "grad_norm": 2.3959514096484473, + "learning_rate": 2.956573785543311e-06, + "loss": 1.0402, + "step": 4254 + }, + { + "epoch": 0.44750023006034156, + "grad_norm": 2.952041521825401, + "learning_rate": 2.955752980783633e-06, + "loss": 1.0327, + "step": 4255 + }, + { + "epoch": 0.44760540050218883, + "grad_norm": 2.834122618770755, + "learning_rate": 2.9549321252039577e-06, + "loss": 1.0195, + "step": 4256 + }, + { + "epoch": 0.44771057094403616, + "grad_norm": 2.6232922045756832, + "learning_rate": 2.954111218895816e-06, + "loss": 0.9946, + "step": 4257 + }, + { + "epoch": 0.4478157413858835, + "grad_norm": 2.890337682768083, + "learning_rate": 2.9532902619507465e-06, 
+ "loss": 1.0015, + "step": 4258 + }, + { + "epoch": 0.4479209118277308, + "grad_norm": 2.0462008188888023, + "learning_rate": 2.95246925446029e-06, + "loss": 1.0015, + "step": 4259 + }, + { + "epoch": 0.44802608226957813, + "grad_norm": 2.3153362347848634, + "learning_rate": 2.951648196515998e-06, + "loss": 1.0159, + "step": 4260 + }, + { + "epoch": 0.44813125271142545, + "grad_norm": 3.294187755137225, + "learning_rate": 2.9508270882094227e-06, + "loss": 1.0014, + "step": 4261 + }, + { + "epoch": 0.4482364231532728, + "grad_norm": 2.8942766762795453, + "learning_rate": 2.9500059296321254e-06, + "loss": 1.015, + "step": 4262 + }, + { + "epoch": 0.4483415935951201, + "grad_norm": 2.8362356385644976, + "learning_rate": 2.9491847208756713e-06, + "loss": 1.0182, + "step": 4263 + }, + { + "epoch": 0.4484467640369674, + "grad_norm": 2.016987936853375, + "learning_rate": 2.9483634620316314e-06, + "loss": 0.9975, + "step": 4264 + }, + { + "epoch": 0.44855193447881475, + "grad_norm": 2.3257680119464226, + "learning_rate": 2.947542153191583e-06, + "loss": 0.9592, + "step": 4265 + }, + { + "epoch": 0.4486571049206621, + "grad_norm": 2.7528064005499218, + "learning_rate": 2.946720794447106e-06, + "loss": 1.0271, + "step": 4266 + }, + { + "epoch": 0.44876227536250934, + "grad_norm": 2.4897714526055252, + "learning_rate": 2.945899385889792e-06, + "loss": 0.9719, + "step": 4267 + }, + { + "epoch": 0.44886744580435667, + "grad_norm": 2.8675188349876675, + "learning_rate": 2.9450779276112313e-06, + "loss": 1.0455, + "step": 4268 + }, + { + "epoch": 0.448972616246204, + "grad_norm": 2.63772057013864, + "learning_rate": 2.944256419703025e-06, + "loss": 1.0008, + "step": 4269 + }, + { + "epoch": 0.4490777866880513, + "grad_norm": 2.767101795516205, + "learning_rate": 2.9434348622567773e-06, + "loss": 1.0132, + "step": 4270 + }, + { + "epoch": 0.44918295712989864, + "grad_norm": 2.8503438719512353, + "learning_rate": 2.942613255364097e-06, + "loss": 1.0182, + "step": 4271 + }, + { + "epoch": 0.44928812757174597, + "grad_norm": 3.264311463794955, + "learning_rate": 2.941791599116601e-06, + "loss": 0.9823, + "step": 4272 + }, + { + "epoch": 0.4493932980135933, + "grad_norm": 1.9470160801562246, + "learning_rate": 2.9409698936059083e-06, + "loss": 1.0072, + "step": 4273 + }, + { + "epoch": 0.4494984684554406, + "grad_norm": 2.578131298488729, + "learning_rate": 2.940148138923648e-06, + "loss": 0.9903, + "step": 4274 + }, + { + "epoch": 0.44960363889728794, + "grad_norm": 2.5479584159562036, + "learning_rate": 2.9393263351614503e-06, + "loss": 1.0249, + "step": 4275 + }, + { + "epoch": 0.44970880933913526, + "grad_norm": 2.3256148001185672, + "learning_rate": 2.9385044824109544e-06, + "loss": 1.0325, + "step": 4276 + }, + { + "epoch": 0.44981397978098253, + "grad_norm": 3.1496173601363933, + "learning_rate": 2.9376825807638016e-06, + "loss": 1.0063, + "step": 4277 + }, + { + "epoch": 0.44991915022282986, + "grad_norm": 2.851586489375221, + "learning_rate": 2.936860630311642e-06, + "loss": 1.029, + "step": 4278 + }, + { + "epoch": 0.4500243206646772, + "grad_norm": 3.088679797466413, + "learning_rate": 2.9360386311461276e-06, + "loss": 1.0058, + "step": 4279 + }, + { + "epoch": 0.4501294911065245, + "grad_norm": 2.054451946303782, + "learning_rate": 2.9352165833589188e-06, + "loss": 0.9734, + "step": 4280 + }, + { + "epoch": 0.45023466154837183, + "grad_norm": 2.718591132670405, + "learning_rate": 2.9343944870416798e-06, + "loss": 1.0024, + "step": 4281 + }, + { + "epoch": 0.45033983199021915, + "grad_norm": 
2.746230280528559, + "learning_rate": 2.9335723422860807e-06, + "loss": 0.9915, + "step": 4282 + }, + { + "epoch": 0.4504450024320665, + "grad_norm": 2.537502797059733, + "learning_rate": 2.9327501491837977e-06, + "loss": 0.9917, + "step": 4283 + }, + { + "epoch": 0.4505501728739138, + "grad_norm": 3.1284325108868782, + "learning_rate": 2.931927907826511e-06, + "loss": 1.0431, + "step": 4284 + }, + { + "epoch": 0.4506553433157611, + "grad_norm": 1.812931921725533, + "learning_rate": 2.9311056183059085e-06, + "loss": 0.9778, + "step": 4285 + }, + { + "epoch": 0.45076051375760845, + "grad_norm": 2.851522179977043, + "learning_rate": 2.93028328071368e-06, + "loss": 0.9874, + "step": 4286 + }, + { + "epoch": 0.4508656841994557, + "grad_norm": 2.766819002757065, + "learning_rate": 2.9294608951415225e-06, + "loss": 0.9944, + "step": 4287 + }, + { + "epoch": 0.45097085464130304, + "grad_norm": 3.163681281030067, + "learning_rate": 2.9286384616811397e-06, + "loss": 1.031, + "step": 4288 + }, + { + "epoch": 0.45107602508315037, + "grad_norm": 2.485916557061216, + "learning_rate": 2.927815980424238e-06, + "loss": 1.0086, + "step": 4289 + }, + { + "epoch": 0.4511811955249977, + "grad_norm": 2.109206916955106, + "learning_rate": 2.926993451462532e-06, + "loss": 0.9913, + "step": 4290 + }, + { + "epoch": 0.451286365966845, + "grad_norm": 2.457120848888426, + "learning_rate": 2.926170874887738e-06, + "loss": 0.9919, + "step": 4291 + }, + { + "epoch": 0.45139153640869234, + "grad_norm": 2.303663013614732, + "learning_rate": 2.925348250791582e-06, + "loss": 0.9718, + "step": 4292 + }, + { + "epoch": 0.45149670685053966, + "grad_norm": 3.1484358472259335, + "learning_rate": 2.924525579265791e-06, + "loss": 0.9857, + "step": 4293 + }, + { + "epoch": 0.451601877292387, + "grad_norm": 3.189808105502437, + "learning_rate": 2.9237028604021008e-06, + "loss": 1.0218, + "step": 4294 + }, + { + "epoch": 0.4517070477342343, + "grad_norm": 2.7759170958312116, + "learning_rate": 2.9228800942922497e-06, + "loss": 1.0121, + "step": 4295 + }, + { + "epoch": 0.45181221817608164, + "grad_norm": 2.1847673720857, + "learning_rate": 2.922057281027983e-06, + "loss": 1.0119, + "step": 4296 + }, + { + "epoch": 0.4519173886179289, + "grad_norm": 2.2941536309406096, + "learning_rate": 2.921234420701051e-06, + "loss": 0.9933, + "step": 4297 + }, + { + "epoch": 0.45202255905977623, + "grad_norm": 1.7883355539808496, + "learning_rate": 2.9204115134032086e-06, + "loss": 0.9594, + "step": 4298 + }, + { + "epoch": 0.45212772950162355, + "grad_norm": 2.9394681120602506, + "learning_rate": 2.9195885592262167e-06, + "loss": 1.0014, + "step": 4299 + }, + { + "epoch": 0.4522328999434709, + "grad_norm": 3.0533456324247195, + "learning_rate": 2.9187655582618413e-06, + "loss": 0.9617, + "step": 4300 + }, + { + "epoch": 0.4523380703853182, + "grad_norm": 1.6982849296696754, + "learning_rate": 2.9179425106018532e-06, + "loss": 0.9429, + "step": 4301 + }, + { + "epoch": 0.4524432408271655, + "grad_norm": 3.1396740609931446, + "learning_rate": 2.9171194163380277e-06, + "loss": 1.0107, + "step": 4302 + }, + { + "epoch": 0.45254841126901285, + "grad_norm": 2.811681718875989, + "learning_rate": 2.916296275562147e-06, + "loss": 1.0572, + "step": 4303 + }, + { + "epoch": 0.4526535817108602, + "grad_norm": 3.2640718105886735, + "learning_rate": 2.9154730883659988e-06, + "loss": 1.025, + "step": 4304 + }, + { + "epoch": 0.4527587521527075, + "grad_norm": 2.69588504505022, + "learning_rate": 2.9146498548413725e-06, + "loss": 0.9978, + "step": 4305 + }, + { 
+ "epoch": 0.4528639225945548, + "grad_norm": 2.2342874790888843, + "learning_rate": 2.913826575080067e-06, + "loss": 0.9304, + "step": 4306 + }, + { + "epoch": 0.4529690930364021, + "grad_norm": 3.0565709108799037, + "learning_rate": 2.9130032491738837e-06, + "loss": 1.0373, + "step": 4307 + }, + { + "epoch": 0.4530742634782494, + "grad_norm": 2.853116287639515, + "learning_rate": 2.9121798772146293e-06, + "loss": 1.0201, + "step": 4308 + }, + { + "epoch": 0.45317943392009674, + "grad_norm": 2.3744681471107607, + "learning_rate": 2.911356459294117e-06, + "loss": 0.9437, + "step": 4309 + }, + { + "epoch": 0.45328460436194407, + "grad_norm": 2.164104788250051, + "learning_rate": 2.910532995504163e-06, + "loss": 0.9791, + "step": 4310 + }, + { + "epoch": 0.4533897748037914, + "grad_norm": 1.6203464986260847, + "learning_rate": 2.9097094859365926e-06, + "loss": 0.9675, + "step": 4311 + }, + { + "epoch": 0.4534949452456387, + "grad_norm": 2.9303086296052934, + "learning_rate": 2.90888593068323e-06, + "loss": 1.0381, + "step": 4312 + }, + { + "epoch": 0.45360011568748604, + "grad_norm": 2.0626838173774376, + "learning_rate": 2.908062329835911e-06, + "loss": 0.9685, + "step": 4313 + }, + { + "epoch": 0.45370528612933336, + "grad_norm": 2.2697295104611115, + "learning_rate": 2.9072386834864723e-06, + "loss": 0.9784, + "step": 4314 + }, + { + "epoch": 0.4538104565711807, + "grad_norm": 3.0376685513758077, + "learning_rate": 2.9064149917267565e-06, + "loss": 1.0353, + "step": 4315 + }, + { + "epoch": 0.453915627013028, + "grad_norm": 2.657090557042862, + "learning_rate": 2.905591254648612e-06, + "loss": 0.9427, + "step": 4316 + }, + { + "epoch": 0.4540207974548753, + "grad_norm": 2.3929879751726872, + "learning_rate": 2.904767472343892e-06, + "loss": 1.0099, + "step": 4317 + }, + { + "epoch": 0.4541259678967226, + "grad_norm": 2.695393831318368, + "learning_rate": 2.9039436449044543e-06, + "loss": 1.0132, + "step": 4318 + }, + { + "epoch": 0.45423113833856993, + "grad_norm": 2.1469397224850417, + "learning_rate": 2.903119772422162e-06, + "loss": 0.9917, + "step": 4319 + }, + { + "epoch": 0.45433630878041725, + "grad_norm": 1.9890341493391972, + "learning_rate": 2.902295854988884e-06, + "loss": 1.0114, + "step": 4320 + }, + { + "epoch": 0.4544414792222646, + "grad_norm": 2.6430862936585617, + "learning_rate": 2.901471892696493e-06, + "loss": 1.0101, + "step": 4321 + }, + { + "epoch": 0.4545466496641119, + "grad_norm": 2.0147983041213253, + "learning_rate": 2.900647885636867e-06, + "loss": 1.0124, + "step": 4322 + }, + { + "epoch": 0.4546518201059592, + "grad_norm": 2.1685192867151057, + "learning_rate": 2.899823833901889e-06, + "loss": 1.0, + "step": 4323 + }, + { + "epoch": 0.45475699054780655, + "grad_norm": 2.332868281748152, + "learning_rate": 2.8989997375834485e-06, + "loss": 1.0148, + "step": 4324 + }, + { + "epoch": 0.4548621609896539, + "grad_norm": 1.3686107234017948, + "learning_rate": 2.8981755967734377e-06, + "loss": 0.9342, + "step": 4325 + }, + { + "epoch": 0.4549673314315012, + "grad_norm": 2.087757270716191, + "learning_rate": 2.8973514115637534e-06, + "loss": 0.9817, + "step": 4326 + }, + { + "epoch": 0.4550725018733485, + "grad_norm": 2.271350804832439, + "learning_rate": 2.8965271820463016e-06, + "loss": 1.0087, + "step": 4327 + }, + { + "epoch": 0.4551776723151958, + "grad_norm": 1.643573599294807, + "learning_rate": 2.895702908312987e-06, + "loss": 1.0125, + "step": 4328 + }, + { + "epoch": 0.4552828427570431, + "grad_norm": 2.554616326554526, + "learning_rate": 
2.8948785904557262e-06, + "loss": 0.9899, + "step": 4329 + }, + { + "epoch": 0.45538801319889044, + "grad_norm": 2.91621589953988, + "learning_rate": 2.8940542285664337e-06, + "loss": 1.0189, + "step": 4330 + }, + { + "epoch": 0.45549318364073776, + "grad_norm": 1.9347991717626811, + "learning_rate": 2.8932298227370335e-06, + "loss": 0.966, + "step": 4331 + }, + { + "epoch": 0.4555983540825851, + "grad_norm": 3.5268488608131996, + "learning_rate": 2.8924053730594536e-06, + "loss": 0.9827, + "step": 4332 + }, + { + "epoch": 0.4557035245244324, + "grad_norm": 3.3032634100912515, + "learning_rate": 2.8915808796256268e-06, + "loss": 1.0037, + "step": 4333 + }, + { + "epoch": 0.45580869496627974, + "grad_norm": 2.594478528315312, + "learning_rate": 2.89075634252749e-06, + "loss": 0.9854, + "step": 4334 + }, + { + "epoch": 0.45591386540812706, + "grad_norm": 2.142538714749134, + "learning_rate": 2.8899317618569843e-06, + "loss": 1.0015, + "step": 4335 + }, + { + "epoch": 0.4560190358499744, + "grad_norm": 2.336631372632609, + "learning_rate": 2.889107137706059e-06, + "loss": 1.0257, + "step": 4336 + }, + { + "epoch": 0.4561242062918217, + "grad_norm": 3.5495497144192676, + "learning_rate": 2.8882824701666657e-06, + "loss": 1.0268, + "step": 4337 + }, + { + "epoch": 0.456229376733669, + "grad_norm": 1.8932999054002906, + "learning_rate": 2.88745775933076e-06, + "loss": 0.9928, + "step": 4338 + }, + { + "epoch": 0.4563345471755163, + "grad_norm": 3.0148596426527643, + "learning_rate": 2.8866330052903042e-06, + "loss": 1.024, + "step": 4339 + }, + { + "epoch": 0.4564397176173636, + "grad_norm": 2.0856993515410323, + "learning_rate": 2.885808208137265e-06, + "loss": 0.9969, + "step": 4340 + }, + { + "epoch": 0.45654488805921095, + "grad_norm": 2.095948181530454, + "learning_rate": 2.8849833679636137e-06, + "loss": 0.9824, + "step": 4341 + }, + { + "epoch": 0.4566500585010583, + "grad_norm": 2.5170281229756384, + "learning_rate": 2.8841584848613254e-06, + "loss": 1.002, + "step": 4342 + }, + { + "epoch": 0.4567552289429056, + "grad_norm": 2.1378769455831304, + "learning_rate": 2.883333558922383e-06, + "loss": 1.0271, + "step": 4343 + }, + { + "epoch": 0.4568603993847529, + "grad_norm": 2.3821344983844495, + "learning_rate": 2.88250859023877e-06, + "loss": 0.9948, + "step": 4344 + }, + { + "epoch": 0.45696556982660025, + "grad_norm": 2.5487726722527357, + "learning_rate": 2.8816835789024783e-06, + "loss": 1.0075, + "step": 4345 + }, + { + "epoch": 0.4570707402684476, + "grad_norm": 3.0550451214956023, + "learning_rate": 2.880858525005502e-06, + "loss": 1.0063, + "step": 4346 + }, + { + "epoch": 0.4571759107102949, + "grad_norm": 2.798668732759541, + "learning_rate": 2.880033428639842e-06, + "loss": 0.9896, + "step": 4347 + }, + { + "epoch": 0.45728108115214217, + "grad_norm": 2.980102984068875, + "learning_rate": 2.8792082898975028e-06, + "loss": 1.03, + "step": 4348 + }, + { + "epoch": 0.4573862515939895, + "grad_norm": 1.9173203998180592, + "learning_rate": 2.8783831088704923e-06, + "loss": 1.001, + "step": 4349 + }, + { + "epoch": 0.4574914220358368, + "grad_norm": 2.2818375318772843, + "learning_rate": 2.877557885650827e-06, + "loss": 1.0082, + "step": 4350 + }, + { + "epoch": 0.45759659247768414, + "grad_norm": 2.3754880401750844, + "learning_rate": 2.876732620330524e-06, + "loss": 1.0056, + "step": 4351 + }, + { + "epoch": 0.45770176291953146, + "grad_norm": 2.6543669930498495, + "learning_rate": 2.8759073130016073e-06, + "loss": 0.9431, + "step": 4352 + }, + { + "epoch": 0.4578069333613788, + 
"grad_norm": 2.6939766890582453, + "learning_rate": 2.8750819637561045e-06, + "loss": 0.9748, + "step": 4353 + }, + { + "epoch": 0.4579121038032261, + "grad_norm": 1.9175919955114296, + "learning_rate": 2.87425657268605e-06, + "loss": 1.0176, + "step": 4354 + }, + { + "epoch": 0.45801727424507344, + "grad_norm": 2.439681288815302, + "learning_rate": 2.873431139883479e-06, + "loss": 0.9385, + "step": 4355 + }, + { + "epoch": 0.45812244468692076, + "grad_norm": 2.8212284142415505, + "learning_rate": 2.872605665440436e-06, + "loss": 0.9857, + "step": 4356 + }, + { + "epoch": 0.4582276151287681, + "grad_norm": 2.1470640317341196, + "learning_rate": 2.8717801494489673e-06, + "loss": 0.9626, + "step": 4357 + }, + { + "epoch": 0.45833278557061535, + "grad_norm": 2.2492255805625514, + "learning_rate": 2.8709545920011233e-06, + "loss": 1.0123, + "step": 4358 + }, + { + "epoch": 0.4584379560124627, + "grad_norm": 2.0002199257891906, + "learning_rate": 2.8701289931889602e-06, + "loss": 1.0467, + "step": 4359 + }, + { + "epoch": 0.45854312645431, + "grad_norm": 2.981959655065841, + "learning_rate": 2.8693033531045395e-06, + "loss": 0.953, + "step": 4360 + }, + { + "epoch": 0.4586482968961573, + "grad_norm": 3.851394505625027, + "learning_rate": 2.868477671839926e-06, + "loss": 1.0175, + "step": 4361 + }, + { + "epoch": 0.45875346733800465, + "grad_norm": 2.31415083896268, + "learning_rate": 2.867651949487189e-06, + "loss": 1.0186, + "step": 4362 + }, + { + "epoch": 0.458858637779852, + "grad_norm": 3.8867757736496027, + "learning_rate": 2.8668261861384045e-06, + "loss": 1.0274, + "step": 4363 + }, + { + "epoch": 0.4589638082216993, + "grad_norm": 2.1989997123351555, + "learning_rate": 2.8660003818856506e-06, + "loss": 0.9769, + "step": 4364 + }, + { + "epoch": 0.4590689786635466, + "grad_norm": 2.0392697344733546, + "learning_rate": 2.865174536821011e-06, + "loss": 0.9493, + "step": 4365 + }, + { + "epoch": 0.45917414910539395, + "grad_norm": 2.4063204089586097, + "learning_rate": 2.864348651036574e-06, + "loss": 0.9577, + "step": 4366 + }, + { + "epoch": 0.45927931954724127, + "grad_norm": 2.285261233874211, + "learning_rate": 2.8635227246244306e-06, + "loss": 0.9948, + "step": 4367 + }, + { + "epoch": 0.45938448998908854, + "grad_norm": 2.2549083660343383, + "learning_rate": 2.8626967576766808e-06, + "loss": 1.024, + "step": 4368 + }, + { + "epoch": 0.45948966043093586, + "grad_norm": 2.408292400780605, + "learning_rate": 2.8618707502854243e-06, + "loss": 0.9919, + "step": 4369 + }, + { + "epoch": 0.4595948308727832, + "grad_norm": 2.147791593115681, + "learning_rate": 2.8610447025427685e-06, + "loss": 0.9478, + "step": 4370 + }, + { + "epoch": 0.4597000013146305, + "grad_norm": 2.0763303161056292, + "learning_rate": 2.8602186145408235e-06, + "loss": 1.0489, + "step": 4371 + }, + { + "epoch": 0.45980517175647784, + "grad_norm": 2.849591364090468, + "learning_rate": 2.859392486371705e-06, + "loss": 0.989, + "step": 4372 + }, + { + "epoch": 0.45991034219832516, + "grad_norm": 2.003930986515399, + "learning_rate": 2.858566318127532e-06, + "loss": 0.9761, + "step": 4373 + }, + { + "epoch": 0.4600155126401725, + "grad_norm": 1.9945701603167976, + "learning_rate": 2.8577401099004285e-06, + "loss": 1.0204, + "step": 4374 + }, + { + "epoch": 0.4601206830820198, + "grad_norm": 3.075579516233079, + "learning_rate": 2.856913861782525e-06, + "loss": 1.0004, + "step": 4375 + }, + { + "epoch": 0.46022585352386713, + "grad_norm": 2.351283254441647, + "learning_rate": 2.856087573865952e-06, + "loss": 1.0144, + 
"step": 4376 + }, + { + "epoch": 0.46033102396571446, + "grad_norm": 3.339158339951501, + "learning_rate": 2.8552612462428497e-06, + "loss": 1.006, + "step": 4377 + }, + { + "epoch": 0.4604361944075617, + "grad_norm": 2.309435332828068, + "learning_rate": 2.854434879005357e-06, + "loss": 1.0221, + "step": 4378 + }, + { + "epoch": 0.46054136484940905, + "grad_norm": 3.2211925972738333, + "learning_rate": 2.853608472245624e-06, + "loss": 1.0109, + "step": 4379 + }, + { + "epoch": 0.4606465352912564, + "grad_norm": 2.1892482332044327, + "learning_rate": 2.8527820260557986e-06, + "loss": 1.0088, + "step": 4380 + }, + { + "epoch": 0.4607517057331037, + "grad_norm": 2.410757032252132, + "learning_rate": 2.851955540528036e-06, + "loss": 0.9959, + "step": 4381 + }, + { + "epoch": 0.460856876174951, + "grad_norm": 2.359864321372521, + "learning_rate": 2.8511290157544976e-06, + "loss": 1.0216, + "step": 4382 + }, + { + "epoch": 0.46096204661679835, + "grad_norm": 2.1948898074829355, + "learning_rate": 2.8503024518273455e-06, + "loss": 0.989, + "step": 4383 + }, + { + "epoch": 0.4610672170586457, + "grad_norm": 2.4426656861817997, + "learning_rate": 2.849475848838749e-06, + "loss": 0.9701, + "step": 4384 + }, + { + "epoch": 0.461172387500493, + "grad_norm": 2.5319847009537946, + "learning_rate": 2.84864920688088e-06, + "loss": 1.0233, + "step": 4385 + }, + { + "epoch": 0.4612775579423403, + "grad_norm": 2.133328823397027, + "learning_rate": 2.847822526045917e-06, + "loss": 0.9656, + "step": 4386 + }, + { + "epoch": 0.46138272838418765, + "grad_norm": 2.639337179377181, + "learning_rate": 2.8469958064260405e-06, + "loss": 1.0025, + "step": 4387 + }, + { + "epoch": 0.46148789882603497, + "grad_norm": 2.0901303479901516, + "learning_rate": 2.846169048113435e-06, + "loss": 0.9886, + "step": 4388 + }, + { + "epoch": 0.46159306926788224, + "grad_norm": 2.7143229057209957, + "learning_rate": 2.8453422512002925e-06, + "loss": 1.0069, + "step": 4389 + }, + { + "epoch": 0.46169823970972956, + "grad_norm": 2.2128312082996158, + "learning_rate": 2.844515415778806e-06, + "loss": 1.0177, + "step": 4390 + }, + { + "epoch": 0.4618034101515769, + "grad_norm": 2.1207444779689135, + "learning_rate": 2.843688541941174e-06, + "loss": 1.0304, + "step": 4391 + }, + { + "epoch": 0.4619085805934242, + "grad_norm": 2.6321023824250025, + "learning_rate": 2.8428616297795998e-06, + "loss": 0.9876, + "step": 4392 + }, + { + "epoch": 0.46201375103527154, + "grad_norm": 2.5462159065092242, + "learning_rate": 2.84203467938629e-06, + "loss": 1.0227, + "step": 4393 + }, + { + "epoch": 0.46211892147711886, + "grad_norm": 3.258389125088761, + "learning_rate": 2.8412076908534574e-06, + "loss": 0.983, + "step": 4394 + }, + { + "epoch": 0.4622240919189662, + "grad_norm": 2.3212982515302603, + "learning_rate": 2.840380664273316e-06, + "loss": 0.9358, + "step": 4395 + }, + { + "epoch": 0.4623292623608135, + "grad_norm": 2.4763075667324657, + "learning_rate": 2.839553599738087e-06, + "loss": 1.0202, + "step": 4396 + }, + { + "epoch": 0.46243443280266083, + "grad_norm": 2.0484739974347534, + "learning_rate": 2.838726497339993e-06, + "loss": 1.0134, + "step": 4397 + }, + { + "epoch": 0.46253960324450816, + "grad_norm": 1.9402963476386015, + "learning_rate": 2.8378993571712638e-06, + "loss": 0.9729, + "step": 4398 + }, + { + "epoch": 0.4626447736863554, + "grad_norm": 1.8355127361037, + "learning_rate": 2.8370721793241314e-06, + "loss": 1.0101, + "step": 4399 + }, + { + "epoch": 0.46274994412820275, + "grad_norm": 2.994356739022344, + 
"learning_rate": 2.8362449638908324e-06, + "loss": 1.0008, + "step": 4400 + }, + { + "epoch": 0.4628551145700501, + "grad_norm": 2.910116749399226, + "learning_rate": 2.8354177109636076e-06, + "loss": 0.994, + "step": 4401 + }, + { + "epoch": 0.4629602850118974, + "grad_norm": 2.223042107881437, + "learning_rate": 2.834590420634703e-06, + "loss": 1.016, + "step": 4402 + }, + { + "epoch": 0.4630654554537447, + "grad_norm": 2.6324446425874073, + "learning_rate": 2.8337630929963672e-06, + "loss": 1.0376, + "step": 4403 + }, + { + "epoch": 0.46317062589559205, + "grad_norm": 2.8252122929662153, + "learning_rate": 2.832935728140853e-06, + "loss": 0.9432, + "step": 4404 + }, + { + "epoch": 0.46327579633743937, + "grad_norm": 2.0038921955516105, + "learning_rate": 2.83210832616042e-06, + "loss": 1.0019, + "step": 4405 + }, + { + "epoch": 0.4633809667792867, + "grad_norm": 2.549003738454214, + "learning_rate": 2.8312808871473275e-06, + "loss": 1.0029, + "step": 4406 + }, + { + "epoch": 0.463486137221134, + "grad_norm": 2.2351716118016474, + "learning_rate": 2.830453411193843e-06, + "loss": 0.9699, + "step": 4407 + }, + { + "epoch": 0.46359130766298134, + "grad_norm": 2.8680061904201173, + "learning_rate": 2.829625898392237e-06, + "loss": 1.0212, + "step": 4408 + }, + { + "epoch": 0.4636964781048286, + "grad_norm": 2.0089360810313908, + "learning_rate": 2.828798348834782e-06, + "loss": 1.0066, + "step": 4409 + }, + { + "epoch": 0.46380164854667594, + "grad_norm": 2.127532937133814, + "learning_rate": 2.827970762613757e-06, + "loss": 0.9945, + "step": 4410 + }, + { + "epoch": 0.46390681898852326, + "grad_norm": 2.089558020765553, + "learning_rate": 2.827143139821444e-06, + "loss": 1.0153, + "step": 4411 + }, + { + "epoch": 0.4640119894303706, + "grad_norm": 3.252402836184958, + "learning_rate": 2.82631548055013e-06, + "loss": 0.9513, + "step": 4412 + }, + { + "epoch": 0.4641171598722179, + "grad_norm": 2.135962227586247, + "learning_rate": 2.8254877848921036e-06, + "loss": 0.9607, + "step": 4413 + }, + { + "epoch": 0.46422233031406523, + "grad_norm": 2.41667417366253, + "learning_rate": 2.824660052939662e-06, + "loss": 0.9908, + "step": 4414 + }, + { + "epoch": 0.46432750075591256, + "grad_norm": 2.380362760812115, + "learning_rate": 2.823832284785102e-06, + "loss": 0.9779, + "step": 4415 + }, + { + "epoch": 0.4644326711977599, + "grad_norm": 2.0345880558081944, + "learning_rate": 2.8230044805207275e-06, + "loss": 0.9734, + "step": 4416 + }, + { + "epoch": 0.4645378416396072, + "grad_norm": 2.3167139827947776, + "learning_rate": 2.8221766402388436e-06, + "loss": 0.96, + "step": 4417 + }, + { + "epoch": 0.46464301208145453, + "grad_norm": 2.3878293198539224, + "learning_rate": 2.8213487640317615e-06, + "loss": 0.97, + "step": 4418 + }, + { + "epoch": 0.4647481825233018, + "grad_norm": 2.1973785313368426, + "learning_rate": 2.820520851991796e-06, + "loss": 0.9816, + "step": 4419 + }, + { + "epoch": 0.4648533529651491, + "grad_norm": 3.0125702231483262, + "learning_rate": 2.8196929042112652e-06, + "loss": 1.0055, + "step": 4420 + }, + { + "epoch": 0.46495852340699645, + "grad_norm": 2.3918412139996, + "learning_rate": 2.8188649207824925e-06, + "loss": 0.9632, + "step": 4421 + }, + { + "epoch": 0.4650636938488438, + "grad_norm": 1.9278006868749584, + "learning_rate": 2.8180369017978037e-06, + "loss": 0.9645, + "step": 4422 + }, + { + "epoch": 0.4651688642906911, + "grad_norm": 2.85997981754498, + "learning_rate": 2.8172088473495306e-06, + "loss": 0.9861, + "step": 4423 + }, + { + "epoch": 
0.4652740347325384, + "grad_norm": 2.4986220629370344, + "learning_rate": 2.816380757530006e-06, + "loss": 0.957, + "step": 4424 + }, + { + "epoch": 0.46537920517438575, + "grad_norm": 2.3157951259747316, + "learning_rate": 2.8155526324315704e-06, + "loss": 0.9893, + "step": 4425 + }, + { + "epoch": 0.46548437561623307, + "grad_norm": 2.9956280856636384, + "learning_rate": 2.814724472146564e-06, + "loss": 1.0325, + "step": 4426 + }, + { + "epoch": 0.4655895460580804, + "grad_norm": 2.7269340487252345, + "learning_rate": 2.813896276767334e-06, + "loss": 0.9351, + "step": 4427 + }, + { + "epoch": 0.4656947164999277, + "grad_norm": 2.4606920746541014, + "learning_rate": 2.8130680463862315e-06, + "loss": 1.0123, + "step": 4428 + }, + { + "epoch": 0.465799886941775, + "grad_norm": 2.848249960105425, + "learning_rate": 2.8122397810956086e-06, + "loss": 0.9924, + "step": 4429 + }, + { + "epoch": 0.4659050573836223, + "grad_norm": 2.8446842951304783, + "learning_rate": 2.811411480987825e-06, + "loss": 1.0287, + "step": 4430 + }, + { + "epoch": 0.46601022782546964, + "grad_norm": 2.393650778819736, + "learning_rate": 2.810583146155243e-06, + "loss": 1.0055, + "step": 4431 + }, + { + "epoch": 0.46611539826731696, + "grad_norm": 3.0125679338832065, + "learning_rate": 2.8097547766902273e-06, + "loss": 0.9402, + "step": 4432 + }, + { + "epoch": 0.4662205687091643, + "grad_norm": 2.4891599650912886, + "learning_rate": 2.8089263726851474e-06, + "loss": 0.9905, + "step": 4433 + }, + { + "epoch": 0.4663257391510116, + "grad_norm": 2.6495533894705616, + "learning_rate": 2.8080979342323765e-06, + "loss": 0.9873, + "step": 4434 + }, + { + "epoch": 0.46643090959285893, + "grad_norm": 2.5985892014306375, + "learning_rate": 2.8072694614242935e-06, + "loss": 1.0312, + "step": 4435 + }, + { + "epoch": 0.46653608003470626, + "grad_norm": 2.636695159781973, + "learning_rate": 2.8064409543532776e-06, + "loss": 1.0396, + "step": 4436 + }, + { + "epoch": 0.4666412504765536, + "grad_norm": 3.244510424950399, + "learning_rate": 2.805612413111716e-06, + "loss": 0.9766, + "step": 4437 + }, + { + "epoch": 0.4667464209184009, + "grad_norm": 2.2447570199099154, + "learning_rate": 2.8047838377919952e-06, + "loss": 0.9901, + "step": 4438 + }, + { + "epoch": 0.4668515913602482, + "grad_norm": 2.5976977350088415, + "learning_rate": 2.8039552284865094e-06, + "loss": 0.9973, + "step": 4439 + }, + { + "epoch": 0.4669567618020955, + "grad_norm": 3.2454932031526944, + "learning_rate": 2.8031265852876537e-06, + "loss": 1.0331, + "step": 4440 + }, + { + "epoch": 0.4670619322439428, + "grad_norm": 2.523586811336105, + "learning_rate": 2.8022979082878297e-06, + "loss": 0.9869, + "step": 4441 + }, + { + "epoch": 0.46716710268579015, + "grad_norm": 2.302211346608485, + "learning_rate": 2.801469197579441e-06, + "loss": 1.0341, + "step": 4442 + }, + { + "epoch": 0.46727227312763747, + "grad_norm": 2.348259230268178, + "learning_rate": 2.800640453254894e-06, + "loss": 0.9683, + "step": 4443 + }, + { + "epoch": 0.4673774435694848, + "grad_norm": 2.366754132665982, + "learning_rate": 2.799811675406601e-06, + "loss": 0.9587, + "step": 4444 + }, + { + "epoch": 0.4674826140113321, + "grad_norm": 2.075592911455706, + "learning_rate": 2.7989828641269778e-06, + "loss": 1.0085, + "step": 4445 + }, + { + "epoch": 0.46758778445317944, + "grad_norm": 2.6326964601948877, + "learning_rate": 2.798154019508443e-06, + "loss": 0.9919, + "step": 4446 + }, + { + "epoch": 0.46769295489502677, + "grad_norm": 2.8302081829382835, + "learning_rate": 
2.7973251416434176e-06, + "loss": 1.0534, + "step": 4447 + }, + { + "epoch": 0.4677981253368741, + "grad_norm": 2.639359868066465, + "learning_rate": 2.7964962306243305e-06, + "loss": 1.0037, + "step": 4448 + }, + { + "epoch": 0.4679032957787214, + "grad_norm": 2.292040628888014, + "learning_rate": 2.7956672865436095e-06, + "loss": 1.0244, + "step": 4449 + }, + { + "epoch": 0.4680084662205687, + "grad_norm": 3.4389539874184334, + "learning_rate": 2.794838309493689e-06, + "loss": 1.0291, + "step": 4450 + }, + { + "epoch": 0.468113636662416, + "grad_norm": 4.035449967348666, + "learning_rate": 2.7940092995670075e-06, + "loss": 1.0501, + "step": 4451 + }, + { + "epoch": 0.46821880710426333, + "grad_norm": 2.7831049279874653, + "learning_rate": 2.7931802568560053e-06, + "loss": 1.0194, + "step": 4452 + }, + { + "epoch": 0.46832397754611066, + "grad_norm": 1.8206417464439855, + "learning_rate": 2.792351181453126e-06, + "loss": 0.9495, + "step": 4453 + }, + { + "epoch": 0.468429147987958, + "grad_norm": 1.9541921059032041, + "learning_rate": 2.791522073450819e-06, + "loss": 1.0107, + "step": 4454 + }, + { + "epoch": 0.4685343184298053, + "grad_norm": 2.8892598367431086, + "learning_rate": 2.790692932941537e-06, + "loss": 1.0612, + "step": 4455 + }, + { + "epoch": 0.46863948887165263, + "grad_norm": 3.171614211998949, + "learning_rate": 2.7898637600177343e-06, + "loss": 1.0538, + "step": 4456 + }, + { + "epoch": 0.46874465931349996, + "grad_norm": 2.424092647751607, + "learning_rate": 2.7890345547718707e-06, + "loss": 1.0326, + "step": 4457 + }, + { + "epoch": 0.4688498297553473, + "grad_norm": 1.8602198107845023, + "learning_rate": 2.7882053172964085e-06, + "loss": 0.9663, + "step": 4458 + }, + { + "epoch": 0.4689550001971946, + "grad_norm": 2.3246688695244226, + "learning_rate": 2.787376047683815e-06, + "loss": 0.9459, + "step": 4459 + }, + { + "epoch": 0.4690601706390419, + "grad_norm": 2.762737851787094, + "learning_rate": 2.78654674602656e-06, + "loss": 1.0057, + "step": 4460 + }, + { + "epoch": 0.4691653410808892, + "grad_norm": 2.3224208280852427, + "learning_rate": 2.7857174124171166e-06, + "loss": 0.988, + "step": 4461 + }, + { + "epoch": 0.4692705115227365, + "grad_norm": 1.6891392684017292, + "learning_rate": 2.7848880469479623e-06, + "loss": 1.0095, + "step": 4462 + }, + { + "epoch": 0.46937568196458385, + "grad_norm": 2.3788041616512814, + "learning_rate": 2.7840586497115767e-06, + "loss": 1.0111, + "step": 4463 + }, + { + "epoch": 0.46948085240643117, + "grad_norm": 2.5464866689627996, + "learning_rate": 2.783229220800446e-06, + "loss": 0.9748, + "step": 4464 + }, + { + "epoch": 0.4695860228482785, + "grad_norm": 2.6905461256581114, + "learning_rate": 2.7823997603070573e-06, + "loss": 1.0209, + "step": 4465 + }, + { + "epoch": 0.4696911932901258, + "grad_norm": 2.767917863313156, + "learning_rate": 2.7815702683239002e-06, + "loss": 1.0259, + "step": 4466 + }, + { + "epoch": 0.46979636373197314, + "grad_norm": 2.347477742569025, + "learning_rate": 2.7807407449434726e-06, + "loss": 0.9851, + "step": 4467 + }, + { + "epoch": 0.46990153417382047, + "grad_norm": 1.90764637954435, + "learning_rate": 2.7799111902582697e-06, + "loss": 1.0211, + "step": 4468 + }, + { + "epoch": 0.4700067046156678, + "grad_norm": 2.0075620650917667, + "learning_rate": 2.7790816043607953e-06, + "loss": 0.987, + "step": 4469 + }, + { + "epoch": 0.47011187505751506, + "grad_norm": 2.7682956389063973, + "learning_rate": 2.7782519873435538e-06, + "loss": 0.9998, + "step": 4470 + }, + { + "epoch": 
0.4702170454993624, + "grad_norm": 2.292023839054441, + "learning_rate": 2.7774223392990544e-06, + "loss": 1.0166, + "step": 4471 + }, + { + "epoch": 0.4703222159412097, + "grad_norm": 3.2215154014236353, + "learning_rate": 2.776592660319809e-06, + "loss": 1.0095, + "step": 4472 + }, + { + "epoch": 0.47042738638305703, + "grad_norm": 2.719209058008712, + "learning_rate": 2.775762950498333e-06, + "loss": 0.9651, + "step": 4473 + }, + { + "epoch": 0.47053255682490436, + "grad_norm": 3.210513336735668, + "learning_rate": 2.7749332099271466e-06, + "loss": 1.0112, + "step": 4474 + }, + { + "epoch": 0.4706377272667517, + "grad_norm": 2.6300858424014715, + "learning_rate": 2.7741034386987707e-06, + "loss": 1.0085, + "step": 4475 + }, + { + "epoch": 0.470742897708599, + "grad_norm": 3.14495333627856, + "learning_rate": 2.7732736369057326e-06, + "loss": 0.9815, + "step": 4476 + }, + { + "epoch": 0.47084806815044633, + "grad_norm": 2.5526520052973973, + "learning_rate": 2.772443804640561e-06, + "loss": 0.9963, + "step": 4477 + }, + { + "epoch": 0.47095323859229365, + "grad_norm": 1.9617055562355878, + "learning_rate": 2.771613941995789e-06, + "loss": 1.0017, + "step": 4478 + }, + { + "epoch": 0.471058409034141, + "grad_norm": 2.688578495945201, + "learning_rate": 2.7707840490639517e-06, + "loss": 0.9938, + "step": 4479 + }, + { + "epoch": 0.47116357947598825, + "grad_norm": 2.4345239857502046, + "learning_rate": 2.76995412593759e-06, + "loss": 0.996, + "step": 4480 + }, + { + "epoch": 0.47126874991783557, + "grad_norm": 3.6250029270038393, + "learning_rate": 2.769124172709246e-06, + "loss": 0.9917, + "step": 4481 + }, + { + "epoch": 0.4713739203596829, + "grad_norm": 2.2005515373005156, + "learning_rate": 2.7682941894714664e-06, + "loss": 0.9691, + "step": 4482 + }, + { + "epoch": 0.4714790908015302, + "grad_norm": 2.3295932635087224, + "learning_rate": 2.7674641763168003e-06, + "loss": 1.017, + "step": 4483 + }, + { + "epoch": 0.47158426124337754, + "grad_norm": 3.0234424271551417, + "learning_rate": 2.7666341333378005e-06, + "loss": 0.9685, + "step": 4484 + }, + { + "epoch": 0.47168943168522487, + "grad_norm": 2.486438862191606, + "learning_rate": 2.7658040606270244e-06, + "loss": 0.998, + "step": 4485 + }, + { + "epoch": 0.4717946021270722, + "grad_norm": 2.675116684244302, + "learning_rate": 2.7649739582770297e-06, + "loss": 1.0076, + "step": 4486 + }, + { + "epoch": 0.4718997725689195, + "grad_norm": 2.7072141444203908, + "learning_rate": 2.7641438263803804e-06, + "loss": 1.0033, + "step": 4487 + }, + { + "epoch": 0.47200494301076684, + "grad_norm": 2.5861797482973516, + "learning_rate": 2.7633136650296434e-06, + "loss": 0.9744, + "step": 4488 + }, + { + "epoch": 0.47211011345261417, + "grad_norm": 2.224720554082226, + "learning_rate": 2.762483474317387e-06, + "loss": 1.0217, + "step": 4489 + }, + { + "epoch": 0.47221528389446144, + "grad_norm": 2.245262509992629, + "learning_rate": 2.7616532543361834e-06, + "loss": 0.9912, + "step": 4490 + }, + { + "epoch": 0.47232045433630876, + "grad_norm": 2.7604293363117023, + "learning_rate": 2.7608230051786094e-06, + "loss": 0.9939, + "step": 4491 + }, + { + "epoch": 0.4724256247781561, + "grad_norm": 2.4111390221536224, + "learning_rate": 2.7599927269372453e-06, + "loss": 0.9487, + "step": 4492 + }, + { + "epoch": 0.4725307952200034, + "grad_norm": 1.891935956225179, + "learning_rate": 2.759162419704671e-06, + "loss": 0.9603, + "step": 4493 + }, + { + "epoch": 0.47263596566185073, + "grad_norm": 1.8123928584801083, + "learning_rate": 
2.7583320835734743e-06, + "loss": 0.9951, + "step": 4494 + }, + { + "epoch": 0.47274113610369806, + "grad_norm": 2.3993983584972343, + "learning_rate": 2.757501718636244e-06, + "loss": 0.9939, + "step": 4495 + }, + { + "epoch": 0.4728463065455454, + "grad_norm": 2.0288832292698746, + "learning_rate": 2.7566713249855715e-06, + "loss": 1.0161, + "step": 4496 + }, + { + "epoch": 0.4729514769873927, + "grad_norm": 2.323445513583333, + "learning_rate": 2.7558409027140525e-06, + "loss": 0.9886, + "step": 4497 + }, + { + "epoch": 0.47305664742924003, + "grad_norm": 2.68445355500332, + "learning_rate": 2.7550104519142846e-06, + "loss": 0.9494, + "step": 4498 + }, + { + "epoch": 0.47316181787108735, + "grad_norm": 2.640136937856992, + "learning_rate": 2.754179972678871e-06, + "loss": 1.0562, + "step": 4499 + }, + { + "epoch": 0.4732669883129346, + "grad_norm": 3.324792657639246, + "learning_rate": 2.753349465100415e-06, + "loss": 0.9979, + "step": 4500 + }, + { + "epoch": 0.47337215875478195, + "grad_norm": 2.810264675470383, + "learning_rate": 2.7525189292715264e-06, + "loss": 1.0266, + "step": 4501 + }, + { + "epoch": 0.47347732919662927, + "grad_norm": 2.003647833018789, + "learning_rate": 2.751688365284816e-06, + "loss": 1.0359, + "step": 4502 + }, + { + "epoch": 0.4735824996384766, + "grad_norm": 2.7806320532702697, + "learning_rate": 2.7508577732328975e-06, + "loss": 1.0402, + "step": 4503 + }, + { + "epoch": 0.4736876700803239, + "grad_norm": 1.713725247330321, + "learning_rate": 2.750027153208388e-06, + "loss": 1.0154, + "step": 4504 + }, + { + "epoch": 0.47379284052217124, + "grad_norm": 2.8013510184260864, + "learning_rate": 2.7491965053039084e-06, + "loss": 1.026, + "step": 4505 + }, + { + "epoch": 0.47389801096401857, + "grad_norm": 2.7346431525668917, + "learning_rate": 2.7483658296120828e-06, + "loss": 1.0076, + "step": 4506 + }, + { + "epoch": 0.4740031814058659, + "grad_norm": 2.805919154601316, + "learning_rate": 2.747535126225538e-06, + "loss": 0.9873, + "step": 4507 + }, + { + "epoch": 0.4741083518477132, + "grad_norm": 2.950564613008157, + "learning_rate": 2.746704395236904e-06, + "loss": 1.0062, + "step": 4508 + }, + { + "epoch": 0.47421352228956054, + "grad_norm": 2.751482814652587, + "learning_rate": 2.745873636738813e-06, + "loss": 0.9653, + "step": 4509 + }, + { + "epoch": 0.47431869273140786, + "grad_norm": 2.5665028667710517, + "learning_rate": 2.7450428508239024e-06, + "loss": 0.9883, + "step": 4510 + }, + { + "epoch": 0.47442386317325513, + "grad_norm": 3.3938183374193205, + "learning_rate": 2.7442120375848096e-06, + "loss": 0.9968, + "step": 4511 + }, + { + "epoch": 0.47452903361510246, + "grad_norm": 2.1396244469160774, + "learning_rate": 2.7433811971141772e-06, + "loss": 0.964, + "step": 4512 + }, + { + "epoch": 0.4746342040569498, + "grad_norm": 2.34919376628943, + "learning_rate": 2.742550329504651e-06, + "loss": 0.9981, + "step": 4513 + }, + { + "epoch": 0.4747393744987971, + "grad_norm": 2.9534207402307864, + "learning_rate": 2.7417194348488786e-06, + "loss": 0.9802, + "step": 4514 + }, + { + "epoch": 0.47484454494064443, + "grad_norm": 3.164085329529094, + "learning_rate": 2.7408885132395117e-06, + "loss": 1.0135, + "step": 4515 + }, + { + "epoch": 0.47494971538249176, + "grad_norm": 2.2533801639022406, + "learning_rate": 2.7400575647692046e-06, + "loss": 0.9697, + "step": 4516 + }, + { + "epoch": 0.4750548858243391, + "grad_norm": 2.362830237698888, + "learning_rate": 2.7392265895306142e-06, + "loss": 1.0129, + "step": 4517 + }, + { + "epoch": 
0.4751600562661864, + "grad_norm": 3.088808161086836, + "learning_rate": 2.7383955876164004e-06, + "loss": 0.9999, + "step": 4518 + }, + { + "epoch": 0.47526522670803373, + "grad_norm": 3.1253457323179767, + "learning_rate": 2.737564559119227e-06, + "loss": 1.0031, + "step": 4519 + }, + { + "epoch": 0.47537039714988105, + "grad_norm": 2.366491108309749, + "learning_rate": 2.7367335041317593e-06, + "loss": 1.0039, + "step": 4520 + }, + { + "epoch": 0.4754755675917283, + "grad_norm": 2.451828923555723, + "learning_rate": 2.7359024227466668e-06, + "loss": 0.9815, + "step": 4521 + }, + { + "epoch": 0.47558073803357565, + "grad_norm": 2.0943618962936945, + "learning_rate": 2.735071315056622e-06, + "loss": 0.985, + "step": 4522 + }, + { + "epoch": 0.47568590847542297, + "grad_norm": 2.978757177852028, + "learning_rate": 2.734240181154299e-06, + "loss": 1.0017, + "step": 4523 + }, + { + "epoch": 0.4757910789172703, + "grad_norm": 2.880549752555551, + "learning_rate": 2.733409021132377e-06, + "loss": 0.9694, + "step": 4524 + }, + { + "epoch": 0.4758962493591176, + "grad_norm": 1.9750573431598037, + "learning_rate": 2.7325778350835353e-06, + "loss": 0.9964, + "step": 4525 + }, + { + "epoch": 0.47600141980096494, + "grad_norm": 2.672025240947568, + "learning_rate": 2.7317466231004584e-06, + "loss": 0.9814, + "step": 4526 + }, + { + "epoch": 0.47610659024281227, + "grad_norm": 2.397919630397351, + "learning_rate": 2.7309153852758335e-06, + "loss": 1.0414, + "step": 4527 + }, + { + "epoch": 0.4762117606846596, + "grad_norm": 3.589638718925268, + "learning_rate": 2.730084121702348e-06, + "loss": 0.9689, + "step": 4528 + }, + { + "epoch": 0.4763169311265069, + "grad_norm": 2.383301359474508, + "learning_rate": 2.7292528324726963e-06, + "loss": 0.9526, + "step": 4529 + }, + { + "epoch": 0.47642210156835424, + "grad_norm": 1.7141669642552233, + "learning_rate": 2.7284215176795724e-06, + "loss": 0.9945, + "step": 4530 + }, + { + "epoch": 0.4765272720102015, + "grad_norm": 2.2151545678563083, + "learning_rate": 2.7275901774156753e-06, + "loss": 1.0156, + "step": 4531 + }, + { + "epoch": 0.47663244245204883, + "grad_norm": 2.362263151552294, + "learning_rate": 2.726758811773706e-06, + "loss": 0.9864, + "step": 4532 + }, + { + "epoch": 0.47673761289389616, + "grad_norm": 1.719772591675931, + "learning_rate": 2.725927420846367e-06, + "loss": 0.9876, + "step": 4533 + }, + { + "epoch": 0.4768427833357435, + "grad_norm": 2.803143954819945, + "learning_rate": 2.7250960047263658e-06, + "loss": 1.0204, + "step": 4534 + }, + { + "epoch": 0.4769479537775908, + "grad_norm": 1.9487171453914527, + "learning_rate": 2.7242645635064107e-06, + "loss": 0.9932, + "step": 4535 + }, + { + "epoch": 0.47705312421943813, + "grad_norm": 2.208339642159797, + "learning_rate": 2.7234330972792157e-06, + "loss": 0.919, + "step": 4536 + }, + { + "epoch": 0.47715829466128545, + "grad_norm": 2.2756389370959207, + "learning_rate": 2.7226016061374934e-06, + "loss": 1.0314, + "step": 4537 + }, + { + "epoch": 0.4772634651031328, + "grad_norm": 3.099273899280179, + "learning_rate": 2.7217700901739637e-06, + "loss": 1.0085, + "step": 4538 + }, + { + "epoch": 0.4773686355449801, + "grad_norm": 2.6724241504223354, + "learning_rate": 2.720938549481346e-06, + "loss": 0.9733, + "step": 4539 + }, + { + "epoch": 0.4774738059868274, + "grad_norm": 2.502055605870524, + "learning_rate": 2.720106984152364e-06, + "loss": 0.9623, + "step": 4540 + }, + { + "epoch": 0.4775789764286747, + "grad_norm": 2.0664420087785245, + "learning_rate": 2.719275394279743e-06, 
+ "loss": 1.0268, + "step": 4541 + }, + { + "epoch": 0.477684146870522, + "grad_norm": 1.8769297433106866, + "learning_rate": 2.718443779956212e-06, + "loss": 1.0, + "step": 4542 + }, + { + "epoch": 0.47778931731236934, + "grad_norm": 2.458314650760072, + "learning_rate": 2.7176121412745026e-06, + "loss": 1.0313, + "step": 4543 + }, + { + "epoch": 0.47789448775421667, + "grad_norm": 2.630804381758795, + "learning_rate": 2.7167804783273495e-06, + "loss": 0.9763, + "step": 4544 + }, + { + "epoch": 0.477999658196064, + "grad_norm": 2.9835995375477418, + "learning_rate": 2.715948791207489e-06, + "loss": 0.9961, + "step": 4545 + }, + { + "epoch": 0.4781048286379113, + "grad_norm": 2.5063558471282446, + "learning_rate": 2.7151170800076603e-06, + "loss": 1.0074, + "step": 4546 + }, + { + "epoch": 0.47820999907975864, + "grad_norm": 2.827999520063029, + "learning_rate": 2.714285344820606e-06, + "loss": 0.9995, + "step": 4547 + }, + { + "epoch": 0.47831516952160597, + "grad_norm": 2.8368404828358442, + "learning_rate": 2.7134535857390714e-06, + "loss": 0.9858, + "step": 4548 + }, + { + "epoch": 0.4784203399634533, + "grad_norm": 2.807519023318031, + "learning_rate": 2.7126218028558037e-06, + "loss": 0.9945, + "step": 4549 + }, + { + "epoch": 0.4785255104053006, + "grad_norm": 2.3253069567766764, + "learning_rate": 2.711789996263554e-06, + "loss": 0.9841, + "step": 4550 + }, + { + "epoch": 0.4786306808471479, + "grad_norm": 2.400222728107831, + "learning_rate": 2.7109581660550733e-06, + "loss": 0.992, + "step": 4551 + }, + { + "epoch": 0.4787358512889952, + "grad_norm": 1.8434853526832968, + "learning_rate": 2.710126312323119e-06, + "loss": 0.9798, + "step": 4552 + }, + { + "epoch": 0.47884102173084253, + "grad_norm": 2.339371298345462, + "learning_rate": 2.7092944351604482e-06, + "loss": 0.9666, + "step": 4553 + }, + { + "epoch": 0.47894619217268986, + "grad_norm": 2.0644879283853754, + "learning_rate": 2.7084625346598232e-06, + "loss": 0.9738, + "step": 4554 + }, + { + "epoch": 0.4790513626145372, + "grad_norm": 3.0372309396395987, + "learning_rate": 2.707630610914005e-06, + "loss": 0.9611, + "step": 4555 + }, + { + "epoch": 0.4791565330563845, + "grad_norm": 2.7578247233652973, + "learning_rate": 2.706798664015761e-06, + "loss": 1.0389, + "step": 4556 + }, + { + "epoch": 0.47926170349823183, + "grad_norm": 3.0623439608759, + "learning_rate": 2.70596669405786e-06, + "loss": 1.0041, + "step": 4557 + }, + { + "epoch": 0.47936687394007915, + "grad_norm": 2.4941925421930433, + "learning_rate": 2.7051347011330716e-06, + "loss": 0.9829, + "step": 4558 + }, + { + "epoch": 0.4794720443819265, + "grad_norm": 3.0490408838472103, + "learning_rate": 2.704302685334171e-06, + "loss": 1.0366, + "step": 4559 + }, + { + "epoch": 0.4795772148237738, + "grad_norm": 2.3813045311909553, + "learning_rate": 2.7034706467539335e-06, + "loss": 0.9829, + "step": 4560 + }, + { + "epoch": 0.47968238526562107, + "grad_norm": 2.845538555861447, + "learning_rate": 2.702638585485139e-06, + "loss": 0.9978, + "step": 4561 + }, + { + "epoch": 0.4797875557074684, + "grad_norm": 3.2438971299974955, + "learning_rate": 2.701806501620568e-06, + "loss": 1.0011, + "step": 4562 + }, + { + "epoch": 0.4798927261493157, + "grad_norm": 2.3425851673049585, + "learning_rate": 2.700974395253004e-06, + "loss": 1.0052, + "step": 4563 + }, + { + "epoch": 0.47999789659116304, + "grad_norm": 2.8884679088128324, + "learning_rate": 2.7001422664752338e-06, + "loss": 0.969, + "step": 4564 + }, + { + "epoch": 0.48010306703301037, + "grad_norm": 
2.7955695336255615, + "learning_rate": 2.699310115380046e-06, + "loss": 1.0197, + "step": 4565 + }, + { + "epoch": 0.4802082374748577, + "grad_norm": 3.190960632752079, + "learning_rate": 2.6984779420602324e-06, + "loss": 1.0072, + "step": 4566 + }, + { + "epoch": 0.480313407916705, + "grad_norm": 2.5976595875210506, + "learning_rate": 2.697645746608586e-06, + "loss": 0.9822, + "step": 4567 + }, + { + "epoch": 0.48041857835855234, + "grad_norm": 2.0169697804572664, + "learning_rate": 2.6968135291179036e-06, + "loss": 0.9938, + "step": 4568 + }, + { + "epoch": 0.48052374880039966, + "grad_norm": 2.9736885221988794, + "learning_rate": 2.6959812896809843e-06, + "loss": 1.0062, + "step": 4569 + }, + { + "epoch": 0.480628919242247, + "grad_norm": 1.4651386680966676, + "learning_rate": 2.6951490283906285e-06, + "loss": 0.9801, + "step": 4570 + }, + { + "epoch": 0.4807340896840943, + "grad_norm": 2.0786679973928197, + "learning_rate": 2.6943167453396397e-06, + "loss": 1.0099, + "step": 4571 + }, + { + "epoch": 0.4808392601259416, + "grad_norm": 2.3231766743941784, + "learning_rate": 2.6934844406208243e-06, + "loss": 0.9628, + "step": 4572 + }, + { + "epoch": 0.4809444305677889, + "grad_norm": 2.358885296144915, + "learning_rate": 2.6926521143269914e-06, + "loss": 1.016, + "step": 4573 + }, + { + "epoch": 0.48104960100963623, + "grad_norm": 2.4699479211845197, + "learning_rate": 2.6918197665509506e-06, + "loss": 0.9918, + "step": 4574 + }, + { + "epoch": 0.48115477145148355, + "grad_norm": 2.3154126074282173, + "learning_rate": 2.690987397385516e-06, + "loss": 1.0321, + "step": 4575 + }, + { + "epoch": 0.4812599418933309, + "grad_norm": 2.6809121807728493, + "learning_rate": 2.690155006923503e-06, + "loss": 1.0223, + "step": 4576 + }, + { + "epoch": 0.4813651123351782, + "grad_norm": 2.8409825087081644, + "learning_rate": 2.6893225952577294e-06, + "loss": 1.0256, + "step": 4577 + }, + { + "epoch": 0.4814702827770255, + "grad_norm": 2.235796045311807, + "learning_rate": 2.688490162481015e-06, + "loss": 1.0431, + "step": 4578 + }, + { + "epoch": 0.48157545321887285, + "grad_norm": 2.3738201066374747, + "learning_rate": 2.6876577086861844e-06, + "loss": 1.0046, + "step": 4579 + }, + { + "epoch": 0.4816806236607202, + "grad_norm": 2.2521790012523413, + "learning_rate": 2.686825233966061e-06, + "loss": 1.0065, + "step": 4580 + }, + { + "epoch": 0.4817857941025675, + "grad_norm": 2.680999712329763, + "learning_rate": 2.6859927384134727e-06, + "loss": 0.9777, + "step": 4581 + }, + { + "epoch": 0.48189096454441477, + "grad_norm": 2.4999906862595678, + "learning_rate": 2.685160222121249e-06, + "loss": 1.0089, + "step": 4582 + }, + { + "epoch": 0.4819961349862621, + "grad_norm": 1.5800457189959451, + "learning_rate": 2.6843276851822233e-06, + "loss": 0.9998, + "step": 4583 + }, + { + "epoch": 0.4821013054281094, + "grad_norm": 2.6596489372561725, + "learning_rate": 2.6834951276892273e-06, + "loss": 0.9915, + "step": 4584 + }, + { + "epoch": 0.48220647586995674, + "grad_norm": 2.701721201218792, + "learning_rate": 2.6826625497351e-06, + "loss": 0.9542, + "step": 4585 + }, + { + "epoch": 0.48231164631180407, + "grad_norm": 2.755401995048588, + "learning_rate": 2.681829951412679e-06, + "loss": 1.0366, + "step": 4586 + }, + { + "epoch": 0.4824168167536514, + "grad_norm": 2.6480098597209576, + "learning_rate": 2.6809973328148058e-06, + "loss": 0.9671, + "step": 4587 + }, + { + "epoch": 0.4825219871954987, + "grad_norm": 3.376948552074513, + "learning_rate": 2.6801646940343245e-06, + "loss": 1.0068, + "step": 
4588 + }, + { + "epoch": 0.48262715763734604, + "grad_norm": 2.6687667258804013, + "learning_rate": 2.67933203516408e-06, + "loss": 0.9519, + "step": 4589 + }, + { + "epoch": 0.48273232807919336, + "grad_norm": 1.648325858154635, + "learning_rate": 2.6784993562969207e-06, + "loss": 1.0098, + "step": 4590 + }, + { + "epoch": 0.4828374985210407, + "grad_norm": 2.22475778779079, + "learning_rate": 2.6776666575256965e-06, + "loss": 1.0006, + "step": 4591 + }, + { + "epoch": 0.48294266896288796, + "grad_norm": 2.3818188632021093, + "learning_rate": 2.676833938943259e-06, + "loss": 1.0092, + "step": 4592 + }, + { + "epoch": 0.4830478394047353, + "grad_norm": 2.408034896184432, + "learning_rate": 2.6760012006424647e-06, + "loss": 0.998, + "step": 4593 + }, + { + "epoch": 0.4831530098465826, + "grad_norm": 1.8677903443913986, + "learning_rate": 2.6751684427161684e-06, + "loss": 0.9938, + "step": 4594 + }, + { + "epoch": 0.48325818028842993, + "grad_norm": 3.2959095169625856, + "learning_rate": 2.6743356652572304e-06, + "loss": 0.9984, + "step": 4595 + }, + { + "epoch": 0.48336335073027725, + "grad_norm": 3.1404363237116604, + "learning_rate": 2.673502868358512e-06, + "loss": 1.0293, + "step": 4596 + }, + { + "epoch": 0.4834685211721246, + "grad_norm": 2.5720636875148197, + "learning_rate": 2.6726700521128757e-06, + "loss": 1.0043, + "step": 4597 + }, + { + "epoch": 0.4835736916139719, + "grad_norm": 2.363912157861966, + "learning_rate": 2.671837216613187e-06, + "loss": 0.974, + "step": 4598 + }, + { + "epoch": 0.4836788620558192, + "grad_norm": 2.618997975520033, + "learning_rate": 2.6710043619523128e-06, + "loss": 1.0115, + "step": 4599 + }, + { + "epoch": 0.48378403249766655, + "grad_norm": 2.2993164157541632, + "learning_rate": 2.6701714882231256e-06, + "loss": 0.9936, + "step": 4600 + }, + { + "epoch": 0.4838892029395139, + "grad_norm": 2.7590745627058864, + "learning_rate": 2.669338595518494e-06, + "loss": 0.9784, + "step": 4601 + }, + { + "epoch": 0.48399437338136114, + "grad_norm": 2.7995862197909447, + "learning_rate": 2.6685056839312944e-06, + "loss": 0.976, + "step": 4602 + }, + { + "epoch": 0.48409954382320847, + "grad_norm": 2.571103128838185, + "learning_rate": 2.667672753554402e-06, + "loss": 0.9439, + "step": 4603 + }, + { + "epoch": 0.4842047142650558, + "grad_norm": 1.8408434670986793, + "learning_rate": 2.6668398044806946e-06, + "loss": 0.9928, + "step": 4604 + }, + { + "epoch": 0.4843098847069031, + "grad_norm": 2.135680773980519, + "learning_rate": 2.666006836803054e-06, + "loss": 1.0166, + "step": 4605 + }, + { + "epoch": 0.48441505514875044, + "grad_norm": 2.2802759432661186, + "learning_rate": 2.6651738506143603e-06, + "loss": 1.0261, + "step": 4606 + }, + { + "epoch": 0.48452022559059776, + "grad_norm": 2.220392952522567, + "learning_rate": 2.6643408460075e-06, + "loss": 1.0148, + "step": 4607 + }, + { + "epoch": 0.4846253960324451, + "grad_norm": 2.5358133229575097, + "learning_rate": 2.663507823075358e-06, + "loss": 0.9698, + "step": 4608 + }, + { + "epoch": 0.4847305664742924, + "grad_norm": 2.375319857666318, + "learning_rate": 2.662674781910824e-06, + "loss": 0.965, + "step": 4609 + }, + { + "epoch": 0.48483573691613974, + "grad_norm": 2.213027783818853, + "learning_rate": 2.6618417226067877e-06, + "loss": 1.0248, + "step": 4610 + }, + { + "epoch": 0.48494090735798706, + "grad_norm": 2.148489365280874, + "learning_rate": 2.6610086452561423e-06, + "loss": 0.9983, + "step": 4611 + }, + { + "epoch": 0.48504607779983433, + "grad_norm": 2.5409516705940627, + 
"learning_rate": 2.6601755499517826e-06, + "loss": 0.9994, + "step": 4612 + }, + { + "epoch": 0.48515124824168165, + "grad_norm": 2.363278403171672, + "learning_rate": 2.6593424367866042e-06, + "loss": 1.0018, + "step": 4613 + }, + { + "epoch": 0.485256418683529, + "grad_norm": 2.1133831998111945, + "learning_rate": 2.658509305853507e-06, + "loss": 1.0488, + "step": 4614 + }, + { + "epoch": 0.4853615891253763, + "grad_norm": 2.42236880112027, + "learning_rate": 2.6576761572453903e-06, + "loss": 0.9917, + "step": 4615 + }, + { + "epoch": 0.4854667595672236, + "grad_norm": 2.772025975679734, + "learning_rate": 2.6568429910551574e-06, + "loss": 1.026, + "step": 4616 + }, + { + "epoch": 0.48557193000907095, + "grad_norm": 2.5143222232504487, + "learning_rate": 2.6560098073757122e-06, + "loss": 0.9878, + "step": 4617 + }, + { + "epoch": 0.4856771004509183, + "grad_norm": 2.073400372306121, + "learning_rate": 2.6551766062999624e-06, + "loss": 0.9997, + "step": 4618 + }, + { + "epoch": 0.4857822708927656, + "grad_norm": 1.9864972586645486, + "learning_rate": 2.6543433879208147e-06, + "loss": 1.0091, + "step": 4619 + }, + { + "epoch": 0.4858874413346129, + "grad_norm": 2.751681846656289, + "learning_rate": 2.6535101523311806e-06, + "loss": 0.9822, + "step": 4620 + }, + { + "epoch": 0.48599261177646025, + "grad_norm": 2.313188102376116, + "learning_rate": 2.6526768996239725e-06, + "loss": 0.9828, + "step": 4621 + }, + { + "epoch": 0.4860977822183075, + "grad_norm": 1.8303821979796089, + "learning_rate": 2.651843629892103e-06, + "loss": 1.0083, + "step": 4622 + }, + { + "epoch": 0.48620295266015484, + "grad_norm": 1.6838185182278091, + "learning_rate": 2.65101034322849e-06, + "loss": 1.0225, + "step": 4623 + }, + { + "epoch": 0.48630812310200217, + "grad_norm": 2.3240841743762193, + "learning_rate": 2.6501770397260503e-06, + "loss": 0.9677, + "step": 4624 + }, + { + "epoch": 0.4864132935438495, + "grad_norm": 2.0762876706178477, + "learning_rate": 2.6493437194777038e-06, + "loss": 0.9767, + "step": 4625 + }, + { + "epoch": 0.4865184639856968, + "grad_norm": 2.7261027597048373, + "learning_rate": 2.648510382576373e-06, + "loss": 1.0191, + "step": 4626 + }, + { + "epoch": 0.48662363442754414, + "grad_norm": 2.5618337156491235, + "learning_rate": 2.647677029114981e-06, + "loss": 0.9701, + "step": 4627 + }, + { + "epoch": 0.48672880486939146, + "grad_norm": 2.703008948964744, + "learning_rate": 2.6468436591864534e-06, + "loss": 1.0215, + "step": 4628 + }, + { + "epoch": 0.4868339753112388, + "grad_norm": 2.0399542166728675, + "learning_rate": 2.6460102728837156e-06, + "loss": 1.0245, + "step": 4629 + }, + { + "epoch": 0.4869391457530861, + "grad_norm": 1.561628488644531, + "learning_rate": 2.6451768702996987e-06, + "loss": 0.9581, + "step": 4630 + }, + { + "epoch": 0.48704431619493344, + "grad_norm": 1.8892703105994535, + "learning_rate": 2.644343451527333e-06, + "loss": 0.9982, + "step": 4631 + }, + { + "epoch": 0.48714948663678076, + "grad_norm": 2.6663357784839676, + "learning_rate": 2.6435100166595516e-06, + "loss": 0.9515, + "step": 4632 + }, + { + "epoch": 0.48725465707862803, + "grad_norm": 2.71454659482501, + "learning_rate": 2.6426765657892883e-06, + "loss": 1.0505, + "step": 4633 + }, + { + "epoch": 0.48735982752047535, + "grad_norm": 2.4346295714541246, + "learning_rate": 2.64184309900948e-06, + "loss": 0.9909, + "step": 4634 + }, + { + "epoch": 0.4874649979623227, + "grad_norm": 2.884081836500342, + "learning_rate": 2.641009616413064e-06, + "loss": 1.0068, + "step": 4635 + }, + { + "epoch": 
0.48757016840417, + "grad_norm": 3.1422402377931924, + "learning_rate": 2.6401761180929798e-06, + "loss": 1.0288, + "step": 4636 + }, + { + "epoch": 0.4876753388460173, + "grad_norm": 2.524963864157816, + "learning_rate": 2.63934260414217e-06, + "loss": 1.0381, + "step": 4637 + }, + { + "epoch": 0.48778050928786465, + "grad_norm": 2.009876109861969, + "learning_rate": 2.638509074653577e-06, + "loss": 0.9859, + "step": 4638 + }, + { + "epoch": 0.487885679729712, + "grad_norm": 2.0009976732169847, + "learning_rate": 2.637675529720147e-06, + "loss": 0.9932, + "step": 4639 + }, + { + "epoch": 0.4879908501715593, + "grad_norm": 2.0651436252017388, + "learning_rate": 2.6368419694348248e-06, + "loss": 1.0164, + "step": 4640 + }, + { + "epoch": 0.4880960206134066, + "grad_norm": 2.498095061638486, + "learning_rate": 2.6360083938905612e-06, + "loss": 1.0162, + "step": 4641 + }, + { + "epoch": 0.48820119105525395, + "grad_norm": 2.305734098685978, + "learning_rate": 2.635174803180305e-06, + "loss": 1.0235, + "step": 4642 + }, + { + "epoch": 0.4883063614971012, + "grad_norm": 1.9716369355731187, + "learning_rate": 2.6343411973970075e-06, + "loss": 0.9755, + "step": 4643 + }, + { + "epoch": 0.48841153193894854, + "grad_norm": 2.453378278184514, + "learning_rate": 2.633507576633623e-06, + "loss": 1.0, + "step": 4644 + }, + { + "epoch": 0.48851670238079586, + "grad_norm": 2.868869437829872, + "learning_rate": 2.632673940983106e-06, + "loss": 1.0469, + "step": 4645 + }, + { + "epoch": 0.4886218728226432, + "grad_norm": 3.2665412006936227, + "learning_rate": 2.631840290538415e-06, + "loss": 0.998, + "step": 4646 + }, + { + "epoch": 0.4887270432644905, + "grad_norm": 2.280933184203751, + "learning_rate": 2.6310066253925067e-06, + "loss": 0.9999, + "step": 4647 + }, + { + "epoch": 0.48883221370633784, + "grad_norm": 2.389682732235324, + "learning_rate": 2.6301729456383425e-06, + "loss": 0.9938, + "step": 4648 + }, + { + "epoch": 0.48893738414818516, + "grad_norm": 2.257361752513528, + "learning_rate": 2.629339251368884e-06, + "loss": 1.0198, + "step": 4649 + }, + { + "epoch": 0.4890425545900325, + "grad_norm": 3.1433460780930336, + "learning_rate": 2.6285055426770935e-06, + "loss": 1.022, + "step": 4650 + }, + { + "epoch": 0.4891477250318798, + "grad_norm": 2.1429693200460904, + "learning_rate": 2.627671819655937e-06, + "loss": 0.999, + "step": 4651 + }, + { + "epoch": 0.48925289547372713, + "grad_norm": 2.3878543730972206, + "learning_rate": 2.62683808239838e-06, + "loss": 1.0023, + "step": 4652 + }, + { + "epoch": 0.4893580659155744, + "grad_norm": 2.7691239385645683, + "learning_rate": 2.6260043309973925e-06, + "loss": 0.9745, + "step": 4653 + }, + { + "epoch": 0.4894632363574217, + "grad_norm": 2.499584360819442, + "learning_rate": 2.625170565545943e-06, + "loss": 1.0039, + "step": 4654 + }, + { + "epoch": 0.48956840679926905, + "grad_norm": 2.3633614479755547, + "learning_rate": 2.624336786137003e-06, + "loss": 0.9862, + "step": 4655 + }, + { + "epoch": 0.4896735772411164, + "grad_norm": 1.7313247618731031, + "learning_rate": 2.6235029928635457e-06, + "loss": 1.0106, + "step": 4656 + }, + { + "epoch": 0.4897787476829637, + "grad_norm": 2.176190189031968, + "learning_rate": 2.6226691858185454e-06, + "loss": 0.9691, + "step": 4657 + }, + { + "epoch": 0.489883918124811, + "grad_norm": 3.0580118331682637, + "learning_rate": 2.621835365094978e-06, + "loss": 1.0148, + "step": 4658 + }, + { + "epoch": 0.48998908856665835, + "grad_norm": 2.3238272613013864, + "learning_rate": 2.6210015307858207e-06, + "loss": 
0.9906, + "step": 4659 + }, + { + "epoch": 0.4900942590085057, + "grad_norm": 2.6038315918171047, + "learning_rate": 2.620167682984052e-06, + "loss": 0.982, + "step": 4660 + }, + { + "epoch": 0.490199429450353, + "grad_norm": 3.173624125525723, + "learning_rate": 2.6193338217826536e-06, + "loss": 1.0282, + "step": 4661 + }, + { + "epoch": 0.4903045998922003, + "grad_norm": 2.4485387128876868, + "learning_rate": 2.6184999472746076e-06, + "loss": 1.019, + "step": 4662 + }, + { + "epoch": 0.4904097703340476, + "grad_norm": 2.2273055407932096, + "learning_rate": 2.6176660595528967e-06, + "loss": 1.0285, + "step": 4663 + }, + { + "epoch": 0.4905149407758949, + "grad_norm": 2.3699531308839004, + "learning_rate": 2.616832158710506e-06, + "loss": 0.988, + "step": 4664 + }, + { + "epoch": 0.49062011121774224, + "grad_norm": 2.5297910043499283, + "learning_rate": 2.6159982448404227e-06, + "loss": 1.0217, + "step": 4665 + }, + { + "epoch": 0.49072528165958956, + "grad_norm": 2.29822026488786, + "learning_rate": 2.615164318035633e-06, + "loss": 0.9885, + "step": 4666 + }, + { + "epoch": 0.4908304521014369, + "grad_norm": 2.3420596691918796, + "learning_rate": 2.6143303783891278e-06, + "loss": 1.0152, + "step": 4667 + }, + { + "epoch": 0.4909356225432842, + "grad_norm": 2.6066387189345885, + "learning_rate": 2.6134964259938967e-06, + "loss": 1.0215, + "step": 4668 + }, + { + "epoch": 0.49104079298513154, + "grad_norm": 2.579302269115743, + "learning_rate": 2.6126624609429325e-06, + "loss": 0.9935, + "step": 4669 + }, + { + "epoch": 0.49114596342697886, + "grad_norm": 3.1504338937696814, + "learning_rate": 2.611828483329229e-06, + "loss": 0.9786, + "step": 4670 + }, + { + "epoch": 0.4912511338688262, + "grad_norm": 2.3543913083014365, + "learning_rate": 2.6109944932457813e-06, + "loss": 0.9647, + "step": 4671 + }, + { + "epoch": 0.4913563043106735, + "grad_norm": 2.586644471709751, + "learning_rate": 2.610160490785584e-06, + "loss": 1.0352, + "step": 4672 + }, + { + "epoch": 0.4914614747525208, + "grad_norm": 2.135602362463421, + "learning_rate": 2.6093264760416377e-06, + "loss": 1.0012, + "step": 4673 + }, + { + "epoch": 0.4915666451943681, + "grad_norm": 2.8951974224499795, + "learning_rate": 2.6084924491069397e-06, + "loss": 0.989, + "step": 4674 + }, + { + "epoch": 0.4916718156362154, + "grad_norm": 2.670407042084373, + "learning_rate": 2.60765841007449e-06, + "loss": 0.9966, + "step": 4675 + }, + { + "epoch": 0.49177698607806275, + "grad_norm": 2.5070143385941073, + "learning_rate": 2.606824359037292e-06, + "loss": 0.9903, + "step": 4676 + }, + { + "epoch": 0.4918821565199101, + "grad_norm": 3.1780468314666246, + "learning_rate": 2.605990296088348e-06, + "loss": 0.9961, + "step": 4677 + }, + { + "epoch": 0.4919873269617574, + "grad_norm": 3.0212424807578233, + "learning_rate": 2.6051562213206633e-06, + "loss": 1.0063, + "step": 4678 + }, + { + "epoch": 0.4920924974036047, + "grad_norm": 2.1752773833274732, + "learning_rate": 2.604322134827242e-06, + "loss": 0.9692, + "step": 4679 + }, + { + "epoch": 0.49219766784545205, + "grad_norm": 2.101331063752568, + "learning_rate": 2.603488036701093e-06, + "loss": 0.9853, + "step": 4680 + }, + { + "epoch": 0.49230283828729937, + "grad_norm": 2.7504154040985385, + "learning_rate": 2.6026539270352234e-06, + "loss": 0.977, + "step": 4681 + }, + { + "epoch": 0.4924080087291467, + "grad_norm": 2.283095151367684, + "learning_rate": 2.6018198059226433e-06, + "loss": 0.9963, + "step": 4682 + }, + { + "epoch": 0.492513179170994, + "grad_norm": 3.6836502509688196, + 
"learning_rate": 2.600985673456364e-06, + "loss": 1.0127, + "step": 4683 + }, + { + "epoch": 0.4926183496128413, + "grad_norm": 1.7196183166216403, + "learning_rate": 2.6001515297293982e-06, + "loss": 0.96, + "step": 4684 + }, + { + "epoch": 0.4927235200546886, + "grad_norm": 2.0308139618206105, + "learning_rate": 2.599317374834759e-06, + "loss": 0.9981, + "step": 4685 + }, + { + "epoch": 0.49282869049653594, + "grad_norm": 2.7245202798321766, + "learning_rate": 2.59848320886546e-06, + "loss": 1.0274, + "step": 4686 + }, + { + "epoch": 0.49293386093838326, + "grad_norm": 2.2582233947427786, + "learning_rate": 2.597649031914519e-06, + "loss": 1.031, + "step": 4687 + }, + { + "epoch": 0.4930390313802306, + "grad_norm": 2.875499760161925, + "learning_rate": 2.5968148440749526e-06, + "loss": 1.0076, + "step": 4688 + }, + { + "epoch": 0.4931442018220779, + "grad_norm": 2.3873631179045076, + "learning_rate": 2.595980645439778e-06, + "loss": 1.0003, + "step": 4689 + }, + { + "epoch": 0.49324937226392523, + "grad_norm": 1.832107108457019, + "learning_rate": 2.595146436102016e-06, + "loss": 0.9479, + "step": 4690 + }, + { + "epoch": 0.49335454270577256, + "grad_norm": 2.1390781829339893, + "learning_rate": 2.5943122161546874e-06, + "loss": 0.9757, + "step": 4691 + }, + { + "epoch": 0.4934597131476199, + "grad_norm": 2.678011051923976, + "learning_rate": 2.593477985690815e-06, + "loss": 1.0185, + "step": 4692 + }, + { + "epoch": 0.4935648835894672, + "grad_norm": 2.655131659501105, + "learning_rate": 2.59264374480342e-06, + "loss": 0.9632, + "step": 4693 + }, + { + "epoch": 0.4936700540313145, + "grad_norm": 2.387020654145177, + "learning_rate": 2.5918094935855275e-06, + "loss": 0.9935, + "step": 4694 + }, + { + "epoch": 0.4937752244731618, + "grad_norm": 2.484922013212365, + "learning_rate": 2.5909752321301633e-06, + "loss": 0.9588, + "step": 4695 + }, + { + "epoch": 0.4938803949150091, + "grad_norm": 3.105536483019693, + "learning_rate": 2.590140960530355e-06, + "loss": 0.9992, + "step": 4696 + }, + { + "epoch": 0.49398556535685645, + "grad_norm": 2.4101649578596445, + "learning_rate": 2.589306678879129e-06, + "loss": 1.039, + "step": 4697 + }, + { + "epoch": 0.4940907357987038, + "grad_norm": 2.383606369665489, + "learning_rate": 2.5884723872695138e-06, + "loss": 1.0335, + "step": 4698 + }, + { + "epoch": 0.4941959062405511, + "grad_norm": 2.5503747168717874, + "learning_rate": 2.587638085794541e-06, + "loss": 1.0034, + "step": 4699 + }, + { + "epoch": 0.4943010766823984, + "grad_norm": 2.0251324980808287, + "learning_rate": 2.5868037745472408e-06, + "loss": 0.9547, + "step": 4700 + }, + { + "epoch": 0.49440624712424575, + "grad_norm": 2.2254913031928596, + "learning_rate": 2.5859694536206455e-06, + "loss": 1.018, + "step": 4701 + }, + { + "epoch": 0.49451141756609307, + "grad_norm": 1.7896906696794816, + "learning_rate": 2.5851351231077876e-06, + "loss": 0.9859, + "step": 4702 + }, + { + "epoch": 0.4946165880079404, + "grad_norm": 2.6959960617755168, + "learning_rate": 2.5843007831017024e-06, + "loss": 1.0311, + "step": 4703 + }, + { + "epoch": 0.49472175844978766, + "grad_norm": 3.497101772230584, + "learning_rate": 2.583466433695425e-06, + "loss": 0.98, + "step": 4704 + }, + { + "epoch": 0.494826928891635, + "grad_norm": 2.220969128258192, + "learning_rate": 2.5826320749819917e-06, + "loss": 1.0059, + "step": 4705 + }, + { + "epoch": 0.4949320993334823, + "grad_norm": 2.477231282570078, + "learning_rate": 2.5817977070544408e-06, + "loss": 1.0255, + "step": 4706 + }, + { + "epoch": 
0.49503726977532964, + "grad_norm": 2.43917889374586, + "learning_rate": 2.5809633300058095e-06, + "loss": 1.0001, + "step": 4707 + }, + { + "epoch": 0.49514244021717696, + "grad_norm": 3.203459546979018, + "learning_rate": 2.580128943929139e-06, + "loss": 0.9777, + "step": 4708 + }, + { + "epoch": 0.4952476106590243, + "grad_norm": 2.072545796219728, + "learning_rate": 2.579294548917467e-06, + "loss": 0.9962, + "step": 4709 + }, + { + "epoch": 0.4953527811008716, + "grad_norm": 2.5943743382810465, + "learning_rate": 2.578460145063838e-06, + "loss": 0.9941, + "step": 4710 + }, + { + "epoch": 0.49545795154271893, + "grad_norm": 1.9553371503993335, + "learning_rate": 2.577625732461293e-06, + "loss": 0.9708, + "step": 4711 + }, + { + "epoch": 0.49556312198456626, + "grad_norm": 2.5527175195599345, + "learning_rate": 2.576791311202876e-06, + "loss": 1.0314, + "step": 4712 + }, + { + "epoch": 0.4956682924264136, + "grad_norm": 2.7301034938935858, + "learning_rate": 2.575956881381631e-06, + "loss": 1.0063, + "step": 4713 + }, + { + "epoch": 0.49577346286826085, + "grad_norm": 2.3128243301857165, + "learning_rate": 2.575122443090604e-06, + "loss": 1.0161, + "step": 4714 + }, + { + "epoch": 0.4958786333101082, + "grad_norm": 2.1154594017025357, + "learning_rate": 2.574287996422841e-06, + "loss": 1.0245, + "step": 4715 + }, + { + "epoch": 0.4959838037519555, + "grad_norm": 2.2840660853868533, + "learning_rate": 2.573453541471389e-06, + "loss": 1.0142, + "step": 4716 + }, + { + "epoch": 0.4960889741938028, + "grad_norm": 2.5091203779297104, + "learning_rate": 2.572619078329297e-06, + "loss": 0.9737, + "step": 4717 + }, + { + "epoch": 0.49619414463565015, + "grad_norm": 2.004908508620825, + "learning_rate": 2.571784607089613e-06, + "loss": 0.9905, + "step": 4718 + }, + { + "epoch": 0.49629931507749747, + "grad_norm": 2.525170667494257, + "learning_rate": 2.570950127845388e-06, + "loss": 0.9913, + "step": 4719 + }, + { + "epoch": 0.4964044855193448, + "grad_norm": 2.3933251418961414, + "learning_rate": 2.5701156406896726e-06, + "loss": 1.0001, + "step": 4720 + }, + { + "epoch": 0.4965096559611921, + "grad_norm": 2.7182770035408166, + "learning_rate": 2.5692811457155186e-06, + "loss": 0.9676, + "step": 4721 + }, + { + "epoch": 0.49661482640303944, + "grad_norm": 2.8066431382335386, + "learning_rate": 2.5684466430159794e-06, + "loss": 0.9872, + "step": 4722 + }, + { + "epoch": 0.49671999684488677, + "grad_norm": 2.255340942233427, + "learning_rate": 2.5676121326841063e-06, + "loss": 0.9676, + "step": 4723 + }, + { + "epoch": 0.49682516728673404, + "grad_norm": 2.0906594545737267, + "learning_rate": 2.566777614812956e-06, + "loss": 0.9905, + "step": 4724 + }, + { + "epoch": 0.49693033772858136, + "grad_norm": 3.0950706282178184, + "learning_rate": 2.5659430894955827e-06, + "loss": 1.0, + "step": 4725 + }, + { + "epoch": 0.4970355081704287, + "grad_norm": 2.7642796610888065, + "learning_rate": 2.5651085568250426e-06, + "loss": 0.9555, + "step": 4726 + }, + { + "epoch": 0.497140678612276, + "grad_norm": 2.584998095593244, + "learning_rate": 2.564274016894393e-06, + "loss": 0.987, + "step": 4727 + }, + { + "epoch": 0.49724584905412333, + "grad_norm": 2.688374289525621, + "learning_rate": 2.5634394697966915e-06, + "loss": 0.9615, + "step": 4728 + }, + { + "epoch": 0.49735101949597066, + "grad_norm": 2.9732686786827056, + "learning_rate": 2.562604915624996e-06, + "loss": 0.9954, + "step": 4729 + }, + { + "epoch": 0.497456189937818, + "grad_norm": 2.6909679945625635, + "learning_rate": 2.5617703544723653e-06, 
+ "loss": 0.9819, + "step": 4730 + }, + { + "epoch": 0.4975613603796653, + "grad_norm": 2.397135959093128, + "learning_rate": 2.5609357864318614e-06, + "loss": 1.0235, + "step": 4731 + }, + { + "epoch": 0.49766653082151263, + "grad_norm": 2.7889907201615185, + "learning_rate": 2.5601012115965425e-06, + "loss": 1.0192, + "step": 4732 + }, + { + "epoch": 0.49777170126335996, + "grad_norm": 2.4282752934234417, + "learning_rate": 2.559266630059473e-06, + "loss": 1.0139, + "step": 4733 + }, + { + "epoch": 0.4978768717052072, + "grad_norm": 2.8902978095243874, + "learning_rate": 2.5584320419137127e-06, + "loss": 1.0143, + "step": 4734 + }, + { + "epoch": 0.49798204214705455, + "grad_norm": 3.286409074951358, + "learning_rate": 2.5575974472523273e-06, + "loss": 1.0451, + "step": 4735 + }, + { + "epoch": 0.4980872125889019, + "grad_norm": 2.009314598243189, + "learning_rate": 2.556762846168378e-06, + "loss": 0.9562, + "step": 4736 + }, + { + "epoch": 0.4981923830307492, + "grad_norm": 1.832895791185105, + "learning_rate": 2.5559282387549305e-06, + "loss": 1.0244, + "step": 4737 + }, + { + "epoch": 0.4982975534725965, + "grad_norm": 2.1324340102401798, + "learning_rate": 2.5550936251050503e-06, + "loss": 1.0173, + "step": 4738 + }, + { + "epoch": 0.49840272391444385, + "grad_norm": 2.608030569296082, + "learning_rate": 2.5542590053118022e-06, + "loss": 0.986, + "step": 4739 + }, + { + "epoch": 0.49850789435629117, + "grad_norm": 2.7802760127585473, + "learning_rate": 2.553424379468254e-06, + "loss": 1.0234, + "step": 4740 + }, + { + "epoch": 0.4986130647981385, + "grad_norm": 2.7173683170964082, + "learning_rate": 2.5525897476674722e-06, + "loss": 1.0201, + "step": 4741 + }, + { + "epoch": 0.4987182352399858, + "grad_norm": 2.67038694261878, + "learning_rate": 2.5517551100025257e-06, + "loss": 1.004, + "step": 4742 + }, + { + "epoch": 0.49882340568183314, + "grad_norm": 2.9249794319362192, + "learning_rate": 2.550920466566483e-06, + "loss": 1.011, + "step": 4743 + }, + { + "epoch": 0.49892857612368047, + "grad_norm": 2.6543517747008405, + "learning_rate": 2.5500858174524105e-06, + "loss": 0.9933, + "step": 4744 + }, + { + "epoch": 0.49903374656552774, + "grad_norm": 2.3829909949524537, + "learning_rate": 2.5492511627533816e-06, + "loss": 1.0197, + "step": 4745 + }, + { + "epoch": 0.49913891700737506, + "grad_norm": 2.850982046315611, + "learning_rate": 2.548416502562465e-06, + "loss": 0.9597, + "step": 4746 + }, + { + "epoch": 0.4992440874492224, + "grad_norm": 2.1710344138345365, + "learning_rate": 2.5475818369727328e-06, + "loss": 0.9962, + "step": 4747 + }, + { + "epoch": 0.4993492578910697, + "grad_norm": 2.3265309831025265, + "learning_rate": 2.5467471660772557e-06, + "loss": 1.0114, + "step": 4748 + }, + { + "epoch": 0.49945442833291703, + "grad_norm": 2.701708578526532, + "learning_rate": 2.545912489969107e-06, + "loss": 1.0285, + "step": 4749 + }, + { + "epoch": 0.49955959877476436, + "grad_norm": 2.650784964191637, + "learning_rate": 2.5450778087413588e-06, + "loss": 0.9867, + "step": 4750 + }, + { + "epoch": 0.4996647692166117, + "grad_norm": 2.5087138546199355, + "learning_rate": 2.5442431224870847e-06, + "loss": 1.0158, + "step": 4751 + }, + { + "epoch": 0.499769939658459, + "grad_norm": 2.995804939311976, + "learning_rate": 2.5434084312993582e-06, + "loss": 1.0099, + "step": 4752 + }, + { + "epoch": 0.49987511010030633, + "grad_norm": 2.6588509171871553, + "learning_rate": 2.542573735271255e-06, + "loss": 1.0047, + "step": 4753 + }, + { + "epoch": 0.49998028054215365, + "grad_norm": 
1.8191787178519514, + "learning_rate": 2.5417390344958494e-06, + "loss": 1.0107, + "step": 4754 + }, + { + "epoch": 0.5000854509840009, + "grad_norm": 1.8754132743779173, + "learning_rate": 2.5409043290662173e-06, + "loss": 0.988, + "step": 4755 + }, + { + "epoch": 0.5001906214258482, + "grad_norm": 2.631056408985051, + "learning_rate": 2.5400696190754347e-06, + "loss": 1.0357, + "step": 4756 + }, + { + "epoch": 0.5002957918676956, + "grad_norm": 2.032133232558368, + "learning_rate": 2.5392349046165783e-06, + "loss": 0.9912, + "step": 4757 + }, + { + "epoch": 0.5004009623095429, + "grad_norm": 2.0586532168086187, + "learning_rate": 2.5384001857827256e-06, + "loss": 0.9694, + "step": 4758 + }, + { + "epoch": 0.5005061327513902, + "grad_norm": 2.644307665365883, + "learning_rate": 2.5375654626669537e-06, + "loss": 0.9973, + "step": 4759 + }, + { + "epoch": 0.5006113031932375, + "grad_norm": 1.773857858460592, + "learning_rate": 2.5367307353623404e-06, + "loss": 0.9653, + "step": 4760 + }, + { + "epoch": 0.5007164736350849, + "grad_norm": 2.3144669620091585, + "learning_rate": 2.5358960039619653e-06, + "loss": 0.9818, + "step": 4761 + }, + { + "epoch": 0.5008216440769322, + "grad_norm": 3.4633187371850513, + "learning_rate": 2.535061268558906e-06, + "loss": 1.0327, + "step": 4762 + }, + { + "epoch": 0.5009268145187795, + "grad_norm": 1.7464824159525518, + "learning_rate": 2.5342265292462437e-06, + "loss": 0.9835, + "step": 4763 + }, + { + "epoch": 0.5010319849606268, + "grad_norm": 2.5356175046255336, + "learning_rate": 2.5333917861170576e-06, + "loss": 0.9872, + "step": 4764 + }, + { + "epoch": 0.5011371554024742, + "grad_norm": 2.4355449363964503, + "learning_rate": 2.5325570392644282e-06, + "loss": 1.0068, + "step": 4765 + }, + { + "epoch": 0.5012423258443215, + "grad_norm": 2.415671397242718, + "learning_rate": 2.531722288781436e-06, + "loss": 0.9525, + "step": 4766 + }, + { + "epoch": 0.5013474962861688, + "grad_norm": 3.179290978551529, + "learning_rate": 2.5308875347611613e-06, + "loss": 1.0196, + "step": 4767 + }, + { + "epoch": 0.5014526667280161, + "grad_norm": 2.0301483199514765, + "learning_rate": 2.530052777296687e-06, + "loss": 1.0312, + "step": 4768 + }, + { + "epoch": 0.5015578371698635, + "grad_norm": 1.7988911344950873, + "learning_rate": 2.5292180164810944e-06, + "loss": 0.9735, + "step": 4769 + }, + { + "epoch": 0.5016630076117107, + "grad_norm": 2.91048308505893, + "learning_rate": 2.5283832524074657e-06, + "loss": 1.0213, + "step": 4770 + }, + { + "epoch": 0.501768178053558, + "grad_norm": 2.4229460616322136, + "learning_rate": 2.5275484851688848e-06, + "loss": 0.9999, + "step": 4771 + }, + { + "epoch": 0.5018733484954053, + "grad_norm": 2.4418059020235003, + "learning_rate": 2.5267137148584335e-06, + "loss": 0.9837, + "step": 4772 + }, + { + "epoch": 0.5019785189372526, + "grad_norm": 2.3825700805364, + "learning_rate": 2.5258789415691947e-06, + "loss": 1.039, + "step": 4773 + }, + { + "epoch": 0.5020836893791, + "grad_norm": 2.5170794669290286, + "learning_rate": 2.5250441653942536e-06, + "loss": 0.9986, + "step": 4774 + }, + { + "epoch": 0.5021888598209473, + "grad_norm": 2.894554710829124, + "learning_rate": 2.5242093864266936e-06, + "loss": 0.9848, + "step": 4775 + }, + { + "epoch": 0.5022940302627946, + "grad_norm": 2.3297677080374064, + "learning_rate": 2.5233746047595984e-06, + "loss": 0.9898, + "step": 4776 + }, + { + "epoch": 0.502399200704642, + "grad_norm": 2.7131245724405604, + "learning_rate": 2.5225398204860534e-06, + "loss": 1.0009, + "step": 4777 + }, + 
{ + "epoch": 0.5025043711464893, + "grad_norm": 2.264411534263541, + "learning_rate": 2.5217050336991434e-06, + "loss": 0.9784, + "step": 4778 + }, + { + "epoch": 0.5026095415883366, + "grad_norm": 2.2425268597601833, + "learning_rate": 2.520870244491954e-06, + "loss": 0.9914, + "step": 4779 + }, + { + "epoch": 0.5027147120301839, + "grad_norm": 1.5512671176052935, + "learning_rate": 2.5200354529575693e-06, + "loss": 0.9783, + "step": 4780 + }, + { + "epoch": 0.5028198824720312, + "grad_norm": 2.4180849734923515, + "learning_rate": 2.5192006591890767e-06, + "loss": 1.0115, + "step": 4781 + }, + { + "epoch": 0.5029250529138786, + "grad_norm": 2.0161358166931813, + "learning_rate": 2.5183658632795614e-06, + "loss": 1.011, + "step": 4782 + }, + { + "epoch": 0.5030302233557259, + "grad_norm": 2.9226670044367222, + "learning_rate": 2.5175310653221092e-06, + "loss": 0.9739, + "step": 4783 + }, + { + "epoch": 0.5031353937975732, + "grad_norm": 2.421297281935595, + "learning_rate": 2.5166962654098075e-06, + "loss": 1.0089, + "step": 4784 + }, + { + "epoch": 0.5032405642394205, + "grad_norm": 1.820279687289599, + "learning_rate": 2.515861463635742e-06, + "loss": 1.0076, + "step": 4785 + }, + { + "epoch": 0.5033457346812679, + "grad_norm": 2.13363221480922, + "learning_rate": 2.515026660093e-06, + "loss": 1.0352, + "step": 4786 + }, + { + "epoch": 0.5034509051231152, + "grad_norm": 2.2192281908305596, + "learning_rate": 2.51419185487467e-06, + "loss": 0.9947, + "step": 4787 + }, + { + "epoch": 0.5035560755649625, + "grad_norm": 2.6940645632774833, + "learning_rate": 2.5133570480738367e-06, + "loss": 0.9647, + "step": 4788 + }, + { + "epoch": 0.5036612460068098, + "grad_norm": 2.92639520094335, + "learning_rate": 2.5125222397835893e-06, + "loss": 0.9956, + "step": 4789 + }, + { + "epoch": 0.503766416448657, + "grad_norm": 1.796576628883353, + "learning_rate": 2.511687430097014e-06, + "loss": 0.9822, + "step": 4790 + }, + { + "epoch": 0.5038715868905044, + "grad_norm": 2.66563998878901, + "learning_rate": 2.5108526191072e-06, + "loss": 0.9973, + "step": 4791 + }, + { + "epoch": 0.5039767573323517, + "grad_norm": 1.8895115699475404, + "learning_rate": 2.5100178069072347e-06, + "loss": 0.9552, + "step": 4792 + }, + { + "epoch": 0.504081927774199, + "grad_norm": 3.798229950568707, + "learning_rate": 2.5091829935902063e-06, + "loss": 0.9752, + "step": 4793 + }, + { + "epoch": 0.5041870982160463, + "grad_norm": 2.8809636374230667, + "learning_rate": 2.508348179249203e-06, + "loss": 0.983, + "step": 4794 + }, + { + "epoch": 0.5042922686578937, + "grad_norm": 2.5746810183718436, + "learning_rate": 2.5075133639773126e-06, + "loss": 0.9668, + "step": 4795 + }, + { + "epoch": 0.504397439099741, + "grad_norm": 1.8118205407198673, + "learning_rate": 2.506678547867623e-06, + "loss": 0.999, + "step": 4796 + }, + { + "epoch": 0.5045026095415883, + "grad_norm": 1.5885999010984382, + "learning_rate": 2.5058437310132244e-06, + "loss": 0.9747, + "step": 4797 + }, + { + "epoch": 0.5046077799834356, + "grad_norm": 2.1825556187323114, + "learning_rate": 2.5050089135072044e-06, + "loss": 0.9754, + "step": 4798 + }, + { + "epoch": 0.504712950425283, + "grad_norm": 2.1535118451086257, + "learning_rate": 2.5041740954426508e-06, + "loss": 1.0055, + "step": 4799 + }, + { + "epoch": 0.5048181208671303, + "grad_norm": 2.1050204876713177, + "learning_rate": 2.5033392769126543e-06, + "loss": 1.0276, + "step": 4800 + }, + { + "epoch": 0.5049232913089776, + "grad_norm": 2.139583763580259, + "learning_rate": 2.5025044580103012e-06, + 
"loss": 1.0135, + "step": 4801 + }, + { + "epoch": 0.5050284617508249, + "grad_norm": 2.1492851865676337, + "learning_rate": 2.5016696388286827e-06, + "loss": 1.014, + "step": 4802 + }, + { + "epoch": 0.5051336321926723, + "grad_norm": 2.1397490291747014, + "learning_rate": 2.5008348194608855e-06, + "loss": 0.991, + "step": 4803 + }, + { + "epoch": 0.5052388026345196, + "grad_norm": 1.9983819330325248, + "learning_rate": 2.5e-06, + "loss": 1.009, + "step": 4804 + }, + { + "epoch": 0.5053439730763669, + "grad_norm": 2.5956563261204013, + "learning_rate": 2.499165180539115e-06, + "loss": 0.9449, + "step": 4805 + }, + { + "epoch": 0.5054491435182142, + "grad_norm": 2.1548335567068073, + "learning_rate": 2.498330361171318e-06, + "loss": 0.9516, + "step": 4806 + }, + { + "epoch": 0.5055543139600616, + "grad_norm": 2.73508525509217, + "learning_rate": 2.497495541989699e-06, + "loss": 0.9602, + "step": 4807 + }, + { + "epoch": 0.5056594844019089, + "grad_norm": 2.3580994201959253, + "learning_rate": 2.496660723087347e-06, + "loss": 0.9723, + "step": 4808 + }, + { + "epoch": 0.5057646548437562, + "grad_norm": 3.0042626219250823, + "learning_rate": 2.49582590455735e-06, + "loss": 0.9715, + "step": 4809 + }, + { + "epoch": 0.5058698252856035, + "grad_norm": 2.3224530834303185, + "learning_rate": 2.494991086492797e-06, + "loss": 1.0099, + "step": 4810 + }, + { + "epoch": 0.5059749957274507, + "grad_norm": 2.5954016653855962, + "learning_rate": 2.4941562689867755e-06, + "loss": 1.0442, + "step": 4811 + }, + { + "epoch": 0.5060801661692981, + "grad_norm": 2.561011878419228, + "learning_rate": 2.493321452132377e-06, + "loss": 0.9306, + "step": 4812 + }, + { + "epoch": 0.5061853366111454, + "grad_norm": 2.6220115240229416, + "learning_rate": 2.4924866360226883e-06, + "loss": 0.9854, + "step": 4813 + }, + { + "epoch": 0.5062905070529927, + "grad_norm": 2.529146248465547, + "learning_rate": 2.4916518207507977e-06, + "loss": 1.0399, + "step": 4814 + }, + { + "epoch": 0.50639567749484, + "grad_norm": 2.308532895773023, + "learning_rate": 2.490817006409794e-06, + "loss": 0.974, + "step": 4815 + }, + { + "epoch": 0.5065008479366874, + "grad_norm": 2.8751837068490085, + "learning_rate": 2.4899821930927653e-06, + "loss": 0.9367, + "step": 4816 + }, + { + "epoch": 0.5066060183785347, + "grad_norm": 1.8788646946917793, + "learning_rate": 2.4891473808928006e-06, + "loss": 0.9854, + "step": 4817 + }, + { + "epoch": 0.506711188820382, + "grad_norm": 2.1679354845155387, + "learning_rate": 2.488312569902987e-06, + "loss": 0.9615, + "step": 4818 + }, + { + "epoch": 0.5068163592622293, + "grad_norm": 2.3508798049934403, + "learning_rate": 2.487477760216412e-06, + "loss": 1.0156, + "step": 4819 + }, + { + "epoch": 0.5069215297040767, + "grad_norm": 1.7607835727712602, + "learning_rate": 2.4866429519261646e-06, + "loss": 0.994, + "step": 4820 + }, + { + "epoch": 0.507026700145924, + "grad_norm": 3.0523263238936593, + "learning_rate": 2.4858081451253306e-06, + "loss": 0.975, + "step": 4821 + }, + { + "epoch": 0.5071318705877713, + "grad_norm": 2.3526180310210454, + "learning_rate": 2.4849733399070002e-06, + "loss": 0.9872, + "step": 4822 + }, + { + "epoch": 0.5072370410296186, + "grad_norm": 2.263463303823174, + "learning_rate": 2.484138536364259e-06, + "loss": 0.9537, + "step": 4823 + }, + { + "epoch": 0.507342211471466, + "grad_norm": 2.1671169893440982, + "learning_rate": 2.4833037345901937e-06, + "loss": 0.9872, + "step": 4824 + }, + { + "epoch": 0.5074473819133133, + "grad_norm": 2.73902112314044, + "learning_rate": 
2.482468934677891e-06, + "loss": 1.0494, + "step": 4825 + }, + { + "epoch": 0.5075525523551606, + "grad_norm": 2.6521521906637107, + "learning_rate": 2.481634136720439e-06, + "loss": 0.9943, + "step": 4826 + }, + { + "epoch": 0.5076577227970079, + "grad_norm": 3.004342981120889, + "learning_rate": 2.4807993408109237e-06, + "loss": 0.9704, + "step": 4827 + }, + { + "epoch": 0.5077628932388553, + "grad_norm": 3.205741832679773, + "learning_rate": 2.479964547042431e-06, + "loss": 0.9874, + "step": 4828 + }, + { + "epoch": 0.5078680636807026, + "grad_norm": 3.0608467513431106, + "learning_rate": 2.479129755508047e-06, + "loss": 1.0236, + "step": 4829 + }, + { + "epoch": 0.5079732341225499, + "grad_norm": 2.2772013134452402, + "learning_rate": 2.478294966300857e-06, + "loss": 0.993, + "step": 4830 + }, + { + "epoch": 0.5080784045643971, + "grad_norm": 2.143379541365547, + "learning_rate": 2.477460179513947e-06, + "loss": 0.9831, + "step": 4831 + }, + { + "epoch": 0.5081835750062444, + "grad_norm": 2.086617872315913, + "learning_rate": 2.4766253952404024e-06, + "loss": 0.9932, + "step": 4832 + }, + { + "epoch": 0.5082887454480918, + "grad_norm": 2.6508833715078386, + "learning_rate": 2.4757906135733077e-06, + "loss": 1.0029, + "step": 4833 + }, + { + "epoch": 0.5083939158899391, + "grad_norm": 2.8727794559719513, + "learning_rate": 2.4749558346057464e-06, + "loss": 1.0128, + "step": 4834 + }, + { + "epoch": 0.5084990863317864, + "grad_norm": 2.1203097915270797, + "learning_rate": 2.4741210584308053e-06, + "loss": 0.9912, + "step": 4835 + }, + { + "epoch": 0.5086042567736337, + "grad_norm": 2.6936248414531594, + "learning_rate": 2.4732862851415674e-06, + "loss": 0.9887, + "step": 4836 + }, + { + "epoch": 0.5087094272154811, + "grad_norm": 2.388064075424561, + "learning_rate": 2.472451514831116e-06, + "loss": 0.9818, + "step": 4837 + }, + { + "epoch": 0.5088145976573284, + "grad_norm": 2.4879710422290318, + "learning_rate": 2.4716167475925356e-06, + "loss": 1.0028, + "step": 4838 + }, + { + "epoch": 0.5089197680991757, + "grad_norm": 2.35770735993766, + "learning_rate": 2.470781983518906e-06, + "loss": 1.0071, + "step": 4839 + }, + { + "epoch": 0.509024938541023, + "grad_norm": 2.0072570921567605, + "learning_rate": 2.4699472227033137e-06, + "loss": 1.0177, + "step": 4840 + }, + { + "epoch": 0.5091301089828704, + "grad_norm": 2.926954873115335, + "learning_rate": 2.46911246523884e-06, + "loss": 0.9914, + "step": 4841 + }, + { + "epoch": 0.5092352794247177, + "grad_norm": 1.8985162910533802, + "learning_rate": 2.4682777112185657e-06, + "loss": 0.96, + "step": 4842 + }, + { + "epoch": 0.509340449866565, + "grad_norm": 2.961749775540693, + "learning_rate": 2.467442960735573e-06, + "loss": 1.011, + "step": 4843 + }, + { + "epoch": 0.5094456203084123, + "grad_norm": 2.828236340352394, + "learning_rate": 2.466608213882943e-06, + "loss": 1.0186, + "step": 4844 + }, + { + "epoch": 0.5095507907502597, + "grad_norm": 2.933167988889152, + "learning_rate": 2.4657734707537567e-06, + "loss": 0.997, + "step": 4845 + }, + { + "epoch": 0.509655961192107, + "grad_norm": 2.4675557778050945, + "learning_rate": 2.464938731441095e-06, + "loss": 0.947, + "step": 4846 + }, + { + "epoch": 0.5097611316339543, + "grad_norm": 2.084400197522081, + "learning_rate": 2.464103996038036e-06, + "loss": 0.9936, + "step": 4847 + }, + { + "epoch": 0.5098663020758016, + "grad_norm": 2.5000064897079946, + "learning_rate": 2.46326926463766e-06, + "loss": 0.9836, + "step": 4848 + }, + { + "epoch": 0.509971472517649, + "grad_norm": 
2.4556021731452953, + "learning_rate": 2.462434537333047e-06, + "loss": 0.9781, + "step": 4849 + }, + { + "epoch": 0.5100766429594963, + "grad_norm": 3.10178764217554, + "learning_rate": 2.461599814217275e-06, + "loss": 0.9933, + "step": 4850 + }, + { + "epoch": 0.5101818134013435, + "grad_norm": 2.50622015636115, + "learning_rate": 2.4607650953834225e-06, + "loss": 1.0053, + "step": 4851 + }, + { + "epoch": 0.5102869838431908, + "grad_norm": 2.9146926707614864, + "learning_rate": 2.459930380924566e-06, + "loss": 1.0261, + "step": 4852 + }, + { + "epoch": 0.5103921542850381, + "grad_norm": 2.2712260753133715, + "learning_rate": 2.459095670933783e-06, + "loss": 1.0111, + "step": 4853 + }, + { + "epoch": 0.5104973247268855, + "grad_norm": 2.0427151120190445, + "learning_rate": 2.458260965504151e-06, + "loss": 1.0178, + "step": 4854 + }, + { + "epoch": 0.5106024951687328, + "grad_norm": 2.615519242206928, + "learning_rate": 2.457426264728746e-06, + "loss": 1.0209, + "step": 4855 + }, + { + "epoch": 0.5107076656105801, + "grad_norm": 2.8016701953883216, + "learning_rate": 2.4565915687006426e-06, + "loss": 1.0498, + "step": 4856 + }, + { + "epoch": 0.5108128360524274, + "grad_norm": 1.909826636778149, + "learning_rate": 2.4557568775129157e-06, + "loss": 0.9668, + "step": 4857 + }, + { + "epoch": 0.5109180064942748, + "grad_norm": 2.0558742173386637, + "learning_rate": 2.454922191258642e-06, + "loss": 0.9879, + "step": 4858 + }, + { + "epoch": 0.5110231769361221, + "grad_norm": 2.0257159398608646, + "learning_rate": 2.454087510030894e-06, + "loss": 0.9859, + "step": 4859 + }, + { + "epoch": 0.5111283473779694, + "grad_norm": 1.7754121266644842, + "learning_rate": 2.453252833922745e-06, + "loss": 0.9699, + "step": 4860 + }, + { + "epoch": 0.5112335178198167, + "grad_norm": 2.4413398534310375, + "learning_rate": 2.4524181630272685e-06, + "loss": 1.0299, + "step": 4861 + }, + { + "epoch": 0.5113386882616641, + "grad_norm": 2.715928878533507, + "learning_rate": 2.451583497437535e-06, + "loss": 0.9741, + "step": 4862 + }, + { + "epoch": 0.5114438587035114, + "grad_norm": 2.3218910000415365, + "learning_rate": 2.450748837246619e-06, + "loss": 1.0065, + "step": 4863 + }, + { + "epoch": 0.5115490291453587, + "grad_norm": 2.2782925199881214, + "learning_rate": 2.44991418254759e-06, + "loss": 0.9806, + "step": 4864 + }, + { + "epoch": 0.511654199587206, + "grad_norm": 2.7149913688207135, + "learning_rate": 2.449079533433519e-06, + "loss": 1.0209, + "step": 4865 + }, + { + "epoch": 0.5117593700290534, + "grad_norm": 2.3390153312282185, + "learning_rate": 2.448244889997475e-06, + "loss": 0.999, + "step": 4866 + }, + { + "epoch": 0.5118645404709007, + "grad_norm": 3.0557721293095157, + "learning_rate": 2.447410252332528e-06, + "loss": 1.0012, + "step": 4867 + }, + { + "epoch": 0.511969710912748, + "grad_norm": 2.866535350728344, + "learning_rate": 2.4465756205317466e-06, + "loss": 0.992, + "step": 4868 + }, + { + "epoch": 0.5120748813545953, + "grad_norm": 2.9450377971992023, + "learning_rate": 2.4457409946881986e-06, + "loss": 1.0557, + "step": 4869 + }, + { + "epoch": 0.5121800517964427, + "grad_norm": 2.786445259165392, + "learning_rate": 2.444906374894951e-06, + "loss": 1.0568, + "step": 4870 + }, + { + "epoch": 0.51228522223829, + "grad_norm": 1.9248634237188782, + "learning_rate": 2.4440717612450695e-06, + "loss": 0.9872, + "step": 4871 + }, + { + "epoch": 0.5123903926801372, + "grad_norm": 3.2932080298333166, + "learning_rate": 2.4432371538316226e-06, + "loss": 0.9767, + "step": 4872 + }, + { + 
"epoch": 0.5124955631219845, + "grad_norm": 2.5355754300367486, + "learning_rate": 2.4424025527476735e-06, + "loss": 1.0017, + "step": 4873 + }, + { + "epoch": 0.5126007335638318, + "grad_norm": 2.598457779940784, + "learning_rate": 2.441567958086288e-06, + "loss": 0.9825, + "step": 4874 + }, + { + "epoch": 0.5127059040056792, + "grad_norm": 2.252477222253449, + "learning_rate": 2.4407333699405285e-06, + "loss": 1.0299, + "step": 4875 + }, + { + "epoch": 0.5128110744475265, + "grad_norm": 1.61574628720382, + "learning_rate": 2.4398987884034574e-06, + "loss": 0.9719, + "step": 4876 + }, + { + "epoch": 0.5129162448893738, + "grad_norm": 2.9461045046867795, + "learning_rate": 2.43906421356814e-06, + "loss": 1.0177, + "step": 4877 + }, + { + "epoch": 0.5130214153312211, + "grad_norm": 2.6086241827887715, + "learning_rate": 2.4382296455276355e-06, + "loss": 0.9679, + "step": 4878 + }, + { + "epoch": 0.5131265857730685, + "grad_norm": 2.469431110622542, + "learning_rate": 2.4373950843750053e-06, + "loss": 0.9654, + "step": 4879 + }, + { + "epoch": 0.5132317562149158, + "grad_norm": 2.4965960753098253, + "learning_rate": 2.4365605302033094e-06, + "loss": 0.9802, + "step": 4880 + }, + { + "epoch": 0.5133369266567631, + "grad_norm": 2.0319790709927505, + "learning_rate": 2.435725983105608e-06, + "loss": 0.9885, + "step": 4881 + }, + { + "epoch": 0.5134420970986104, + "grad_norm": 2.4000275308260544, + "learning_rate": 2.4348914431749578e-06, + "loss": 0.9208, + "step": 4882 + }, + { + "epoch": 0.5135472675404578, + "grad_norm": 2.317952586045972, + "learning_rate": 2.434056910504418e-06, + "loss": 0.9975, + "step": 4883 + }, + { + "epoch": 0.5136524379823051, + "grad_norm": 2.2760790757419778, + "learning_rate": 2.4332223851870453e-06, + "loss": 1.0071, + "step": 4884 + }, + { + "epoch": 0.5137576084241524, + "grad_norm": 2.4027564608452723, + "learning_rate": 2.4323878673158937e-06, + "loss": 1.0331, + "step": 4885 + }, + { + "epoch": 0.5138627788659997, + "grad_norm": 1.8639452403440218, + "learning_rate": 2.431553356984022e-06, + "loss": 0.9873, + "step": 4886 + }, + { + "epoch": 0.5139679493078471, + "grad_norm": 2.5864359775236245, + "learning_rate": 2.430718854284482e-06, + "loss": 0.9993, + "step": 4887 + }, + { + "epoch": 0.5140731197496944, + "grad_norm": 1.8024207909165022, + "learning_rate": 2.429884359310328e-06, + "loss": 1.0033, + "step": 4888 + }, + { + "epoch": 0.5141782901915417, + "grad_norm": 2.4114408144418538, + "learning_rate": 2.4290498721546123e-06, + "loss": 1.0138, + "step": 4889 + }, + { + "epoch": 0.514283460633389, + "grad_norm": 2.8587403555199553, + "learning_rate": 2.4282153929103874e-06, + "loss": 0.9866, + "step": 4890 + }, + { + "epoch": 0.5143886310752364, + "grad_norm": 2.2763471355900706, + "learning_rate": 2.427380921670704e-06, + "loss": 0.9761, + "step": 4891 + }, + { + "epoch": 0.5144938015170836, + "grad_norm": 2.7630996690467073, + "learning_rate": 2.426546458528612e-06, + "loss": 1.0034, + "step": 4892 + }, + { + "epoch": 0.5145989719589309, + "grad_norm": 2.5503825990043034, + "learning_rate": 2.42571200357716e-06, + "loss": 0.9842, + "step": 4893 + }, + { + "epoch": 0.5147041424007782, + "grad_norm": 1.707781885008851, + "learning_rate": 2.4248775569093968e-06, + "loss": 0.9673, + "step": 4894 + }, + { + "epoch": 0.5148093128426255, + "grad_norm": 2.372412499450598, + "learning_rate": 2.4240431186183695e-06, + "loss": 1.0197, + "step": 4895 + }, + { + "epoch": 0.5149144832844729, + "grad_norm": 2.6726883190728916, + "learning_rate": 
2.423208688797125e-06, + "loss": 0.9981, + "step": 4896 + }, + { + "epoch": 0.5150196537263202, + "grad_norm": 2.672102772646679, + "learning_rate": 2.422374267538708e-06, + "loss": 0.9785, + "step": 4897 + }, + { + "epoch": 0.5151248241681675, + "grad_norm": 3.3567918073562586, + "learning_rate": 2.4215398549361632e-06, + "loss": 0.9869, + "step": 4898 + }, + { + "epoch": 0.5152299946100148, + "grad_norm": 3.2146007931473184, + "learning_rate": 2.420705451082533e-06, + "loss": 0.9755, + "step": 4899 + }, + { + "epoch": 0.5153351650518622, + "grad_norm": 2.8601027840979563, + "learning_rate": 2.4198710560708623e-06, + "loss": 1.0102, + "step": 4900 + }, + { + "epoch": 0.5154403354937095, + "grad_norm": 2.49782936985743, + "learning_rate": 2.419036669994191e-06, + "loss": 1.0102, + "step": 4901 + }, + { + "epoch": 0.5155455059355568, + "grad_norm": 2.666513374876871, + "learning_rate": 2.41820229294556e-06, + "loss": 0.9846, + "step": 4902 + }, + { + "epoch": 0.5156506763774041, + "grad_norm": 2.1553652633705274, + "learning_rate": 2.4173679250180083e-06, + "loss": 0.9994, + "step": 4903 + }, + { + "epoch": 0.5157558468192515, + "grad_norm": 1.9176350923655092, + "learning_rate": 2.4165335663045753e-06, + "loss": 0.9929, + "step": 4904 + }, + { + "epoch": 0.5158610172610988, + "grad_norm": 2.8881593227917377, + "learning_rate": 2.4156992168982985e-06, + "loss": 1.0123, + "step": 4905 + }, + { + "epoch": 0.5159661877029461, + "grad_norm": 1.7026322357589312, + "learning_rate": 2.4148648768922133e-06, + "loss": 0.9522, + "step": 4906 + }, + { + "epoch": 0.5160713581447934, + "grad_norm": 1.9182156402806023, + "learning_rate": 2.4140305463793557e-06, + "loss": 0.9721, + "step": 4907 + }, + { + "epoch": 0.5161765285866408, + "grad_norm": 2.630435818921885, + "learning_rate": 2.4131962254527592e-06, + "loss": 0.9998, + "step": 4908 + }, + { + "epoch": 0.5162816990284881, + "grad_norm": 2.639541492793874, + "learning_rate": 2.41236191420546e-06, + "loss": 1.0166, + "step": 4909 + }, + { + "epoch": 0.5163868694703354, + "grad_norm": 2.193440397829881, + "learning_rate": 2.4115276127304866e-06, + "loss": 1.0148, + "step": 4910 + }, + { + "epoch": 0.5164920399121827, + "grad_norm": 1.9956942722452817, + "learning_rate": 2.4106933211208723e-06, + "loss": 0.9871, + "step": 4911 + }, + { + "epoch": 0.51659721035403, + "grad_norm": 2.841990398589798, + "learning_rate": 2.4098590394696452e-06, + "loss": 1.01, + "step": 4912 + }, + { + "epoch": 0.5167023807958773, + "grad_norm": 2.4733117430723324, + "learning_rate": 2.4090247678698362e-06, + "loss": 1.0348, + "step": 4913 + }, + { + "epoch": 0.5168075512377246, + "grad_norm": 1.852569961733493, + "learning_rate": 2.408190506414473e-06, + "loss": 0.9695, + "step": 4914 + }, + { + "epoch": 0.5169127216795719, + "grad_norm": 2.4532174848303683, + "learning_rate": 2.4073562551965814e-06, + "loss": 0.9901, + "step": 4915 + }, + { + "epoch": 0.5170178921214192, + "grad_norm": 2.5251790955216697, + "learning_rate": 2.4065220143091863e-06, + "loss": 1.0092, + "step": 4916 + }, + { + "epoch": 0.5171230625632666, + "grad_norm": 2.76475785288874, + "learning_rate": 2.405687783845313e-06, + "loss": 0.9948, + "step": 4917 + }, + { + "epoch": 0.5172282330051139, + "grad_norm": 2.7685770717204776, + "learning_rate": 2.4048535638979844e-06, + "loss": 0.9939, + "step": 4918 + }, + { + "epoch": 0.5173334034469612, + "grad_norm": 2.2946067444852036, + "learning_rate": 2.4040193545602232e-06, + "loss": 0.9687, + "step": 4919 + }, + { + "epoch": 0.5174385738888085, + 
"grad_norm": 2.645304674423841, + "learning_rate": 2.403185155925049e-06, + "loss": 1.0173, + "step": 4920 + }, + { + "epoch": 0.5175437443306559, + "grad_norm": 2.629339889738927, + "learning_rate": 2.4023509680854822e-06, + "loss": 0.9829, + "step": 4921 + }, + { + "epoch": 0.5176489147725032, + "grad_norm": 2.1954075878052035, + "learning_rate": 2.40151679113454e-06, + "loss": 0.9908, + "step": 4922 + }, + { + "epoch": 0.5177540852143505, + "grad_norm": 2.095867879804772, + "learning_rate": 2.400682625165242e-06, + "loss": 0.9969, + "step": 4923 + }, + { + "epoch": 0.5178592556561978, + "grad_norm": 1.9455070815665647, + "learning_rate": 2.399848470270602e-06, + "loss": 0.9971, + "step": 4924 + }, + { + "epoch": 0.5179644260980452, + "grad_norm": 2.507117651664635, + "learning_rate": 2.399014326543637e-06, + "loss": 1.0045, + "step": 4925 + }, + { + "epoch": 0.5180695965398925, + "grad_norm": 2.4607100953155503, + "learning_rate": 2.398180194077357e-06, + "loss": 1.0235, + "step": 4926 + }, + { + "epoch": 0.5181747669817398, + "grad_norm": 2.777674225709361, + "learning_rate": 2.3973460729647775e-06, + "loss": 1.0138, + "step": 4927 + }, + { + "epoch": 0.5182799374235871, + "grad_norm": 2.313723654298759, + "learning_rate": 2.396511963298908e-06, + "loss": 0.9688, + "step": 4928 + }, + { + "epoch": 0.5183851078654345, + "grad_norm": 2.3190874101733803, + "learning_rate": 2.395677865172759e-06, + "loss": 1.0122, + "step": 4929 + }, + { + "epoch": 0.5184902783072818, + "grad_norm": 2.7640107446846773, + "learning_rate": 2.394843778679338e-06, + "loss": 0.9832, + "step": 4930 + }, + { + "epoch": 0.5185954487491291, + "grad_norm": 2.4081953739998534, + "learning_rate": 2.3940097039116523e-06, + "loss": 1.014, + "step": 4931 + }, + { + "epoch": 0.5187006191909764, + "grad_norm": 1.923457778269713, + "learning_rate": 2.3931756409627084e-06, + "loss": 0.9874, + "step": 4932 + }, + { + "epoch": 0.5188057896328236, + "grad_norm": 1.8553693404082456, + "learning_rate": 2.3923415899255105e-06, + "loss": 0.9887, + "step": 4933 + }, + { + "epoch": 0.518910960074671, + "grad_norm": 2.7194260650322684, + "learning_rate": 2.3915075508930615e-06, + "loss": 0.9387, + "step": 4934 + }, + { + "epoch": 0.5190161305165183, + "grad_norm": 2.652242421666457, + "learning_rate": 2.3906735239583623e-06, + "loss": 1.0244, + "step": 4935 + }, + { + "epoch": 0.5191213009583656, + "grad_norm": 2.549188854335063, + "learning_rate": 2.389839509214416e-06, + "loss": 1.0246, + "step": 4936 + }, + { + "epoch": 0.5192264714002129, + "grad_norm": 2.2665938281306155, + "learning_rate": 2.3890055067542195e-06, + "loss": 1.0106, + "step": 4937 + }, + { + "epoch": 0.5193316418420603, + "grad_norm": 2.8241998887282795, + "learning_rate": 2.3881715166707716e-06, + "loss": 1.0155, + "step": 4938 + }, + { + "epoch": 0.5194368122839076, + "grad_norm": 2.152806090447936, + "learning_rate": 2.387337539057068e-06, + "loss": 1.0189, + "step": 4939 + }, + { + "epoch": 0.5195419827257549, + "grad_norm": 2.172323988446627, + "learning_rate": 2.3865035740061037e-06, + "loss": 1.0695, + "step": 4940 + }, + { + "epoch": 0.5196471531676022, + "grad_norm": 2.728428245649283, + "learning_rate": 2.385669621610873e-06, + "loss": 1.0112, + "step": 4941 + }, + { + "epoch": 0.5197523236094496, + "grad_norm": 2.124385885202202, + "learning_rate": 2.384835681964368e-06, + "loss": 1.0248, + "step": 4942 + }, + { + "epoch": 0.5198574940512969, + "grad_norm": 2.313862649372938, + "learning_rate": 2.3840017551595785e-06, + "loss": 0.9969, + "step": 4943 + 
}, + { + "epoch": 0.5199626644931442, + "grad_norm": 2.4524723823538954, + "learning_rate": 2.3831678412894947e-06, + "loss": 0.9844, + "step": 4944 + }, + { + "epoch": 0.5200678349349915, + "grad_norm": 2.5132178605505486, + "learning_rate": 2.3823339404471037e-06, + "loss": 1.04, + "step": 4945 + }, + { + "epoch": 0.5201730053768389, + "grad_norm": 2.4402520311081197, + "learning_rate": 2.3815000527253933e-06, + "loss": 0.9824, + "step": 4946 + }, + { + "epoch": 0.5202781758186862, + "grad_norm": 2.1382688241450247, + "learning_rate": 2.380666178217347e-06, + "loss": 1.0087, + "step": 4947 + }, + { + "epoch": 0.5203833462605335, + "grad_norm": 2.538021983793176, + "learning_rate": 2.3798323170159487e-06, + "loss": 0.9869, + "step": 4948 + }, + { + "epoch": 0.5204885167023808, + "grad_norm": 1.851342803348307, + "learning_rate": 2.3789984692141798e-06, + "loss": 0.9851, + "step": 4949 + }, + { + "epoch": 0.5205936871442282, + "grad_norm": 2.176853442535673, + "learning_rate": 2.3781646349050227e-06, + "loss": 0.9821, + "step": 4950 + }, + { + "epoch": 0.5206988575860755, + "grad_norm": 2.3979243648827984, + "learning_rate": 2.377330814181455e-06, + "loss": 0.9397, + "step": 4951 + }, + { + "epoch": 0.5208040280279228, + "grad_norm": 2.0633810413062212, + "learning_rate": 2.3764970071364548e-06, + "loss": 1.0225, + "step": 4952 + }, + { + "epoch": 0.52090919846977, + "grad_norm": 1.7807728183386664, + "learning_rate": 2.3756632138629977e-06, + "loss": 0.9851, + "step": 4953 + }, + { + "epoch": 0.5210143689116173, + "grad_norm": 2.2949369278058382, + "learning_rate": 2.3748294344540575e-06, + "loss": 0.9848, + "step": 4954 + }, + { + "epoch": 0.5211195393534647, + "grad_norm": 2.8758134936809965, + "learning_rate": 2.373995669002608e-06, + "loss": 1.0296, + "step": 4955 + }, + { + "epoch": 0.521224709795312, + "grad_norm": 2.7047106011767563, + "learning_rate": 2.37316191760162e-06, + "loss": 1.0089, + "step": 4956 + }, + { + "epoch": 0.5213298802371593, + "grad_norm": 2.4083545951327463, + "learning_rate": 2.3723281803440642e-06, + "loss": 0.9828, + "step": 4957 + }, + { + "epoch": 0.5214350506790066, + "grad_norm": 2.3483276709030703, + "learning_rate": 2.371494457322907e-06, + "loss": 0.9683, + "step": 4958 + }, + { + "epoch": 0.521540221120854, + "grad_norm": 2.6165413672922004, + "learning_rate": 2.3706607486311166e-06, + "loss": 0.9807, + "step": 4959 + }, + { + "epoch": 0.5216453915627013, + "grad_norm": 2.7199298338176434, + "learning_rate": 2.3698270543616583e-06, + "loss": 0.9758, + "step": 4960 + }, + { + "epoch": 0.5217505620045486, + "grad_norm": 1.6020078122627697, + "learning_rate": 2.368993374607494e-06, + "loss": 0.993, + "step": 4961 + }, + { + "epoch": 0.5218557324463959, + "grad_norm": 3.0119525509546476, + "learning_rate": 2.3681597094615863e-06, + "loss": 1.04, + "step": 4962 + }, + { + "epoch": 0.5219609028882433, + "grad_norm": 2.8336389633739376, + "learning_rate": 2.367326059016894e-06, + "loss": 0.9782, + "step": 4963 + }, + { + "epoch": 0.5220660733300906, + "grad_norm": 3.302155303408267, + "learning_rate": 2.3664924233663776e-06, + "loss": 0.9895, + "step": 4964 + }, + { + "epoch": 0.5221712437719379, + "grad_norm": 2.0928625598039674, + "learning_rate": 2.3656588026029938e-06, + "loss": 1.042, + "step": 4965 + }, + { + "epoch": 0.5222764142137852, + "grad_norm": 2.5158469098657816, + "learning_rate": 2.3648251968196964e-06, + "loss": 0.9834, + "step": 4966 + }, + { + "epoch": 0.5223815846556326, + "grad_norm": 2.8574951074085377, + "learning_rate": 
2.36399160610944e-06, + "loss": 0.9998, + "step": 4967 + }, + { + "epoch": 0.5224867550974799, + "grad_norm": 1.7337887982119702, + "learning_rate": 2.3631580305651756e-06, + "loss": 1.0026, + "step": 4968 + }, + { + "epoch": 0.5225919255393272, + "grad_norm": 3.308910933603135, + "learning_rate": 2.362324470279854e-06, + "loss": 0.9672, + "step": 4969 + }, + { + "epoch": 0.5226970959811745, + "grad_norm": 2.6333972565437556, + "learning_rate": 2.3614909253464235e-06, + "loss": 0.9774, + "step": 4970 + }, + { + "epoch": 0.5228022664230219, + "grad_norm": 2.09659946546361, + "learning_rate": 2.360657395857831e-06, + "loss": 0.9807, + "step": 4971 + }, + { + "epoch": 0.5229074368648692, + "grad_norm": 2.6391317053576806, + "learning_rate": 2.3598238819070206e-06, + "loss": 1.0443, + "step": 4972 + }, + { + "epoch": 0.5230126073067164, + "grad_norm": 2.242550715788799, + "learning_rate": 2.358990383586937e-06, + "loss": 0.966, + "step": 4973 + }, + { + "epoch": 0.5231177777485637, + "grad_norm": 3.108913453931425, + "learning_rate": 2.3581569009905204e-06, + "loss": 0.9724, + "step": 4974 + }, + { + "epoch": 0.523222948190411, + "grad_norm": 1.9901033912663888, + "learning_rate": 2.357323434210712e-06, + "loss": 0.9695, + "step": 4975 + }, + { + "epoch": 0.5233281186322584, + "grad_norm": 2.168744011583421, + "learning_rate": 2.3564899833404496e-06, + "loss": 0.9924, + "step": 4976 + }, + { + "epoch": 0.5234332890741057, + "grad_norm": 2.549473243082255, + "learning_rate": 2.355656548472667e-06, + "loss": 1.0382, + "step": 4977 + }, + { + "epoch": 0.523538459515953, + "grad_norm": 2.5199612478036957, + "learning_rate": 2.3548231297003017e-06, + "loss": 1.0082, + "step": 4978 + }, + { + "epoch": 0.5236436299578003, + "grad_norm": 2.3189840650193148, + "learning_rate": 2.3539897271162853e-06, + "loss": 0.9743, + "step": 4979 + }, + { + "epoch": 0.5237488003996477, + "grad_norm": 1.8363174051887219, + "learning_rate": 2.3531563408135482e-06, + "loss": 1.014, + "step": 4980 + }, + { + "epoch": 0.523853970841495, + "grad_norm": 2.136662771433713, + "learning_rate": 2.3523229708850194e-06, + "loss": 1.0037, + "step": 4981 + }, + { + "epoch": 0.5239591412833423, + "grad_norm": 2.5855613260565393, + "learning_rate": 2.351489617423627e-06, + "loss": 0.9987, + "step": 4982 + }, + { + "epoch": 0.5240643117251896, + "grad_norm": 2.3432603718739737, + "learning_rate": 2.3506562805222966e-06, + "loss": 0.9765, + "step": 4983 + }, + { + "epoch": 0.524169482167037, + "grad_norm": 2.6466370290433874, + "learning_rate": 2.3498229602739506e-06, + "loss": 0.99, + "step": 4984 + }, + { + "epoch": 0.5242746526088843, + "grad_norm": 3.0276801019190107, + "learning_rate": 2.348989656771511e-06, + "loss": 0.9581, + "step": 4985 + }, + { + "epoch": 0.5243798230507316, + "grad_norm": 3.1030135072125486, + "learning_rate": 2.348156370107897e-06, + "loss": 0.9684, + "step": 4986 + }, + { + "epoch": 0.5244849934925789, + "grad_norm": 1.6705531610760058, + "learning_rate": 2.3473231003760283e-06, + "loss": 1.0041, + "step": 4987 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 3.195727549016848, + "learning_rate": 2.3464898476688198e-06, + "loss": 1.0116, + "step": 4988 + }, + { + "epoch": 0.5246953343762736, + "grad_norm": 2.4173618786228532, + "learning_rate": 2.345656612079186e-06, + "loss": 0.9629, + "step": 4989 + }, + { + "epoch": 0.5248005048181209, + "grad_norm": 2.6428711784981602, + "learning_rate": 2.3448233937000385e-06, + "loss": 1.0233, + "step": 4990 + }, + { + "epoch": 0.5249056752599682, + 
"grad_norm": 1.912207664947132, + "learning_rate": 2.343990192624288e-06, + "loss": 0.9782, + "step": 4991 + }, + { + "epoch": 0.5250108457018156, + "grad_norm": 2.2091019995514523, + "learning_rate": 2.3431570089448434e-06, + "loss": 0.9875, + "step": 4992 + }, + { + "epoch": 0.5251160161436629, + "grad_norm": 2.7977308048194285, + "learning_rate": 2.3423238427546105e-06, + "loss": 1.0392, + "step": 4993 + }, + { + "epoch": 0.5252211865855101, + "grad_norm": 2.639342545737909, + "learning_rate": 2.341490694146494e-06, + "loss": 1.0452, + "step": 4994 + }, + { + "epoch": 0.5253263570273574, + "grad_norm": 2.2851218048922695, + "learning_rate": 2.3406575632133953e-06, + "loss": 0.9841, + "step": 4995 + }, + { + "epoch": 0.5254315274692047, + "grad_norm": 1.9739459077968367, + "learning_rate": 2.339824450048218e-06, + "loss": 0.9819, + "step": 4996 + }, + { + "epoch": 0.5255366979110521, + "grad_norm": 2.0317174932305333, + "learning_rate": 2.3389913547438586e-06, + "loss": 1.0313, + "step": 4997 + }, + { + "epoch": 0.5256418683528994, + "grad_norm": 2.7337990453530154, + "learning_rate": 2.338158277393213e-06, + "loss": 1.0307, + "step": 4998 + }, + { + "epoch": 0.5257470387947467, + "grad_norm": 2.665374217415374, + "learning_rate": 2.337325218089177e-06, + "loss": 0.9793, + "step": 4999 + }, + { + "epoch": 0.525852209236594, + "grad_norm": 2.167783407256349, + "learning_rate": 2.3364921769246423e-06, + "loss": 0.9815, + "step": 5000 + }, + { + "epoch": 0.5259573796784414, + "grad_norm": 1.9578480148487534, + "learning_rate": 2.335659153992501e-06, + "loss": 0.9851, + "step": 5001 + }, + { + "epoch": 0.5260625501202887, + "grad_norm": 1.8750731661000946, + "learning_rate": 2.33482614938564e-06, + "loss": 1.0107, + "step": 5002 + }, + { + "epoch": 0.526167720562136, + "grad_norm": 2.7268491605591705, + "learning_rate": 2.3339931631969473e-06, + "loss": 1.0255, + "step": 5003 + }, + { + "epoch": 0.5262728910039833, + "grad_norm": 2.3624254794486292, + "learning_rate": 2.333160195519306e-06, + "loss": 1.0034, + "step": 5004 + }, + { + "epoch": 0.5263780614458307, + "grad_norm": 2.0278498795912956, + "learning_rate": 2.3323272464455987e-06, + "loss": 0.9947, + "step": 5005 + }, + { + "epoch": 0.526483231887678, + "grad_norm": 2.2289155760280592, + "learning_rate": 2.331494316068706e-06, + "loss": 0.9749, + "step": 5006 + }, + { + "epoch": 0.5265884023295253, + "grad_norm": 2.4790681587132335, + "learning_rate": 2.3306614044815066e-06, + "loss": 0.9922, + "step": 5007 + }, + { + "epoch": 0.5266935727713726, + "grad_norm": 2.253254009909184, + "learning_rate": 2.329828511776876e-06, + "loss": 0.9891, + "step": 5008 + }, + { + "epoch": 0.52679874321322, + "grad_norm": 2.4459741279261102, + "learning_rate": 2.328995638047687e-06, + "loss": 1.0001, + "step": 5009 + }, + { + "epoch": 0.5269039136550673, + "grad_norm": 2.9185918204182313, + "learning_rate": 2.328162783386814e-06, + "loss": 0.9973, + "step": 5010 + }, + { + "epoch": 0.5270090840969146, + "grad_norm": 2.0661644888279582, + "learning_rate": 2.327329947887125e-06, + "loss": 1.0438, + "step": 5011 + }, + { + "epoch": 0.5271142545387619, + "grad_norm": 1.9485234437575496, + "learning_rate": 2.3264971316414893e-06, + "loss": 1.0256, + "step": 5012 + }, + { + "epoch": 0.5272194249806093, + "grad_norm": 2.178434629119489, + "learning_rate": 2.325664334742771e-06, + "loss": 1.0014, + "step": 5013 + }, + { + "epoch": 0.5273245954224565, + "grad_norm": 2.0382608544824095, + "learning_rate": 2.3248315572838316e-06, + "loss": 0.9811, + "step": 
5014 + }, + { + "epoch": 0.5274297658643038, + "grad_norm": 2.2529667292304634, + "learning_rate": 2.323998799357536e-06, + "loss": 0.9599, + "step": 5015 + }, + { + "epoch": 0.5275349363061511, + "grad_norm": 2.1875904622022673, + "learning_rate": 2.3231660610567415e-06, + "loss": 0.9548, + "step": 5016 + }, + { + "epoch": 0.5276401067479984, + "grad_norm": 2.294988151424705, + "learning_rate": 2.322333342474305e-06, + "loss": 1.0398, + "step": 5017 + }, + { + "epoch": 0.5277452771898458, + "grad_norm": 2.593613259374422, + "learning_rate": 2.32150064370308e-06, + "loss": 0.9981, + "step": 5018 + }, + { + "epoch": 0.5278504476316931, + "grad_norm": 2.4843528327017097, + "learning_rate": 2.3206679648359206e-06, + "loss": 0.9869, + "step": 5019 + }, + { + "epoch": 0.5279556180735404, + "grad_norm": 2.368580138964457, + "learning_rate": 2.3198353059656763e-06, + "loss": 0.942, + "step": 5020 + }, + { + "epoch": 0.5280607885153877, + "grad_norm": 3.1046397593748103, + "learning_rate": 2.319002667185195e-06, + "loss": 0.9921, + "step": 5021 + }, + { + "epoch": 0.5281659589572351, + "grad_norm": 2.386039450298401, + "learning_rate": 2.318170048587322e-06, + "loss": 0.9925, + "step": 5022 + }, + { + "epoch": 0.5282711293990824, + "grad_norm": 2.1962062400668736, + "learning_rate": 2.3173374502649006e-06, + "loss": 1.0175, + "step": 5023 + }, + { + "epoch": 0.5283762998409297, + "grad_norm": 1.897357930598202, + "learning_rate": 2.316504872310773e-06, + "loss": 1.015, + "step": 5024 + }, + { + "epoch": 0.528481470282777, + "grad_norm": 2.699356451977988, + "learning_rate": 2.315672314817778e-06, + "loss": 1.0146, + "step": 5025 + }, + { + "epoch": 0.5285866407246244, + "grad_norm": 2.291400597221481, + "learning_rate": 2.3148397778787514e-06, + "loss": 1.003, + "step": 5026 + }, + { + "epoch": 0.5286918111664717, + "grad_norm": 1.81066716415222, + "learning_rate": 2.3140072615865273e-06, + "loss": 0.9671, + "step": 5027 + }, + { + "epoch": 0.528796981608319, + "grad_norm": 2.428779670922376, + "learning_rate": 2.3131747660339396e-06, + "loss": 1.0034, + "step": 5028 + }, + { + "epoch": 0.5289021520501663, + "grad_norm": 2.2304748543670074, + "learning_rate": 2.3123422913138165e-06, + "loss": 0.9739, + "step": 5029 + }, + { + "epoch": 0.5290073224920137, + "grad_norm": 2.107958029699162, + "learning_rate": 2.3115098375189854e-06, + "loss": 1.0084, + "step": 5030 + }, + { + "epoch": 0.529112492933861, + "grad_norm": 3.179847886514185, + "learning_rate": 2.310677404742272e-06, + "loss": 1.0061, + "step": 5031 + }, + { + "epoch": 0.5292176633757083, + "grad_norm": 1.720669464426383, + "learning_rate": 2.309844993076498e-06, + "loss": 0.9939, + "step": 5032 + }, + { + "epoch": 0.5293228338175556, + "grad_norm": 2.00459599458911, + "learning_rate": 2.3090126026144847e-06, + "loss": 0.9521, + "step": 5033 + }, + { + "epoch": 0.5294280042594028, + "grad_norm": 2.3013300137074255, + "learning_rate": 2.3081802334490502e-06, + "loss": 0.9883, + "step": 5034 + }, + { + "epoch": 0.5295331747012502, + "grad_norm": 2.3045482143367697, + "learning_rate": 2.30734788567301e-06, + "loss": 0.9919, + "step": 5035 + }, + { + "epoch": 0.5296383451430975, + "grad_norm": 1.67169847842081, + "learning_rate": 2.3065155593791756e-06, + "loss": 0.9767, + "step": 5036 + }, + { + "epoch": 0.5297435155849448, + "grad_norm": 2.894233184600396, + "learning_rate": 2.3056832546603607e-06, + "loss": 1.0191, + "step": 5037 + }, + { + "epoch": 0.5298486860267921, + "grad_norm": 2.011314434528289, + "learning_rate": 
2.3048509716093723e-06, + "loss": 0.9874, + "step": 5038 + }, + { + "epoch": 0.5299538564686395, + "grad_norm": 2.2683865903528293, + "learning_rate": 2.3040187103190165e-06, + "loss": 0.9704, + "step": 5039 + }, + { + "epoch": 0.5300590269104868, + "grad_norm": 2.092218000336117, + "learning_rate": 2.303186470882097e-06, + "loss": 0.9837, + "step": 5040 + }, + { + "epoch": 0.5301641973523341, + "grad_norm": 2.5135197465867765, + "learning_rate": 2.302354253391414e-06, + "loss": 0.9526, + "step": 5041 + }, + { + "epoch": 0.5302693677941814, + "grad_norm": 2.572619453041509, + "learning_rate": 2.301522057939768e-06, + "loss": 0.9711, + "step": 5042 + }, + { + "epoch": 0.5303745382360288, + "grad_norm": 2.9257982055535323, + "learning_rate": 2.3006898846199544e-06, + "loss": 1.0119, + "step": 5043 + }, + { + "epoch": 0.5304797086778761, + "grad_norm": 2.7098826216208995, + "learning_rate": 2.299857733524767e-06, + "loss": 1.0066, + "step": 5044 + }, + { + "epoch": 0.5305848791197234, + "grad_norm": 2.727684751730271, + "learning_rate": 2.299025604746997e-06, + "loss": 0.9946, + "step": 5045 + }, + { + "epoch": 0.5306900495615707, + "grad_norm": 2.3317306284613957, + "learning_rate": 2.2981934983794324e-06, + "loss": 0.9265, + "step": 5046 + }, + { + "epoch": 0.5307952200034181, + "grad_norm": 2.5826246634345753, + "learning_rate": 2.2973614145148616e-06, + "loss": 1.0235, + "step": 5047 + }, + { + "epoch": 0.5309003904452654, + "grad_norm": 2.433635645176636, + "learning_rate": 2.2965293532460673e-06, + "loss": 0.9693, + "step": 5048 + }, + { + "epoch": 0.5310055608871127, + "grad_norm": 2.259538998753554, + "learning_rate": 2.2956973146658303e-06, + "loss": 0.9975, + "step": 5049 + }, + { + "epoch": 0.53111073132896, + "grad_norm": 2.3054114447327847, + "learning_rate": 2.294865298866929e-06, + "loss": 0.991, + "step": 5050 + }, + { + "epoch": 0.5312159017708074, + "grad_norm": 3.420364728806349, + "learning_rate": 2.294033305942141e-06, + "loss": 1.0031, + "step": 5051 + }, + { + "epoch": 0.5313210722126547, + "grad_norm": 2.8657151853369234, + "learning_rate": 2.2932013359842398e-06, + "loss": 0.9782, + "step": 5052 + }, + { + "epoch": 0.531426242654502, + "grad_norm": 3.1777989573404217, + "learning_rate": 2.292369389085996e-06, + "loss": 0.9521, + "step": 5053 + }, + { + "epoch": 0.5315314130963493, + "grad_norm": 2.635587308253445, + "learning_rate": 2.291537465340178e-06, + "loss": 0.9735, + "step": 5054 + }, + { + "epoch": 0.5316365835381965, + "grad_norm": 2.7400667145766846, + "learning_rate": 2.2907055648395517e-06, + "loss": 1.0065, + "step": 5055 + }, + { + "epoch": 0.5317417539800439, + "grad_norm": 2.1418010167978383, + "learning_rate": 2.2898736876768816e-06, + "loss": 1.0043, + "step": 5056 + }, + { + "epoch": 0.5318469244218912, + "grad_norm": 2.4204236378757296, + "learning_rate": 2.289041833944927e-06, + "loss": 0.9886, + "step": 5057 + }, + { + "epoch": 0.5319520948637385, + "grad_norm": 2.2258054408956847, + "learning_rate": 2.2882100037364472e-06, + "loss": 0.9754, + "step": 5058 + }, + { + "epoch": 0.5320572653055858, + "grad_norm": 1.698663415933468, + "learning_rate": 2.2873781971441963e-06, + "loss": 1.0052, + "step": 5059 + }, + { + "epoch": 0.5321624357474332, + "grad_norm": 2.39347765845993, + "learning_rate": 2.2865464142609286e-06, + "loss": 1.0169, + "step": 5060 + }, + { + "epoch": 0.5322676061892805, + "grad_norm": 2.1009995798487497, + "learning_rate": 2.2857146551793943e-06, + "loss": 1.0106, + "step": 5061 + }, + { + "epoch": 0.5323727766311278, + 
"grad_norm": 2.695049651098317, + "learning_rate": 2.2848829199923405e-06, + "loss": 0.9646, + "step": 5062 + }, + { + "epoch": 0.5324779470729751, + "grad_norm": 2.092568062548765, + "learning_rate": 2.2840512087925127e-06, + "loss": 0.988, + "step": 5063 + }, + { + "epoch": 0.5325831175148225, + "grad_norm": 2.183783767757615, + "learning_rate": 2.283219521672651e-06, + "loss": 0.9897, + "step": 5064 + }, + { + "epoch": 0.5326882879566698, + "grad_norm": 1.8926321090318932, + "learning_rate": 2.282387858725498e-06, + "loss": 0.999, + "step": 5065 + }, + { + "epoch": 0.5327934583985171, + "grad_norm": 2.3234643524419205, + "learning_rate": 2.281556220043789e-06, + "loss": 0.9972, + "step": 5066 + }, + { + "epoch": 0.5328986288403644, + "grad_norm": 2.311574355596325, + "learning_rate": 2.280724605720258e-06, + "loss": 1.0388, + "step": 5067 + }, + { + "epoch": 0.5330037992822118, + "grad_norm": 2.6794835776991985, + "learning_rate": 2.2798930158476375e-06, + "loss": 1.0157, + "step": 5068 + }, + { + "epoch": 0.5331089697240591, + "grad_norm": 2.314690603027226, + "learning_rate": 2.279061450518655e-06, + "loss": 1.0114, + "step": 5069 + }, + { + "epoch": 0.5332141401659064, + "grad_norm": 2.199288704139533, + "learning_rate": 2.278229909826037e-06, + "loss": 0.9784, + "step": 5070 + }, + { + "epoch": 0.5333193106077537, + "grad_norm": 1.817939098985619, + "learning_rate": 2.2773983938625074e-06, + "loss": 0.9549, + "step": 5071 + }, + { + "epoch": 0.533424481049601, + "grad_norm": 2.0687055899360303, + "learning_rate": 2.276566902720786e-06, + "loss": 0.9939, + "step": 5072 + }, + { + "epoch": 0.5335296514914484, + "grad_norm": 2.9464457058239257, + "learning_rate": 2.2757354364935893e-06, + "loss": 0.9716, + "step": 5073 + }, + { + "epoch": 0.5336348219332957, + "grad_norm": 1.5594113007408739, + "learning_rate": 2.274903995273635e-06, + "loss": 0.9705, + "step": 5074 + }, + { + "epoch": 0.5337399923751429, + "grad_norm": 2.6090779806826836, + "learning_rate": 2.2740725791536337e-06, + "loss": 0.9881, + "step": 5075 + }, + { + "epoch": 0.5338451628169902, + "grad_norm": 1.8377197278317314, + "learning_rate": 2.2732411882262946e-06, + "loss": 0.9562, + "step": 5076 + }, + { + "epoch": 0.5339503332588376, + "grad_norm": 2.7435287336866505, + "learning_rate": 2.272409822584325e-06, + "loss": 1.0032, + "step": 5077 + }, + { + "epoch": 0.5340555037006849, + "grad_norm": 2.212795732909546, + "learning_rate": 2.2715784823204275e-06, + "loss": 0.9326, + "step": 5078 + }, + { + "epoch": 0.5341606741425322, + "grad_norm": 2.3267493730330373, + "learning_rate": 2.270747167527304e-06, + "loss": 1.0236, + "step": 5079 + }, + { + "epoch": 0.5342658445843795, + "grad_norm": 2.622033615860494, + "learning_rate": 2.2699158782976527e-06, + "loss": 0.9834, + "step": 5080 + }, + { + "epoch": 0.5343710150262269, + "grad_norm": 2.210083057529942, + "learning_rate": 2.269084614724168e-06, + "loss": 1.0015, + "step": 5081 + }, + { + "epoch": 0.5344761854680742, + "grad_norm": 2.8063796227677877, + "learning_rate": 2.268253376899542e-06, + "loss": 0.9861, + "step": 5082 + }, + { + "epoch": 0.5345813559099215, + "grad_norm": 2.54873266219812, + "learning_rate": 2.267422164916465e-06, + "loss": 0.9637, + "step": 5083 + }, + { + "epoch": 0.5346865263517688, + "grad_norm": 2.9683253595659123, + "learning_rate": 2.2665909788676236e-06, + "loss": 1.0272, + "step": 5084 + }, + { + "epoch": 0.5347916967936162, + "grad_norm": 2.795406678292719, + "learning_rate": 2.2657598188457015e-06, + "loss": 0.9601, + "step": 5085 + 
}, + { + "epoch": 0.5348968672354635, + "grad_norm": 2.423988503624797, + "learning_rate": 2.264928684943379e-06, + "loss": 1.016, + "step": 5086 + }, + { + "epoch": 0.5350020376773108, + "grad_norm": 3.0389490068520244, + "learning_rate": 2.264097577253333e-06, + "loss": 0.98, + "step": 5087 + }, + { + "epoch": 0.5351072081191581, + "grad_norm": 2.591628315768945, + "learning_rate": 2.263266495868241e-06, + "loss": 0.9451, + "step": 5088 + }, + { + "epoch": 0.5352123785610055, + "grad_norm": 2.85809426107325, + "learning_rate": 2.262435440880774e-06, + "loss": 1.0456, + "step": 5089 + }, + { + "epoch": 0.5353175490028528, + "grad_norm": 2.4229876711875367, + "learning_rate": 2.2616044123836005e-06, + "loss": 0.962, + "step": 5090 + }, + { + "epoch": 0.5354227194447001, + "grad_norm": 2.272935313170715, + "learning_rate": 2.2607734104693866e-06, + "loss": 1.0093, + "step": 5091 + }, + { + "epoch": 0.5355278898865474, + "grad_norm": 2.9194015986993644, + "learning_rate": 2.2599424352307958e-06, + "loss": 0.9677, + "step": 5092 + }, + { + "epoch": 0.5356330603283948, + "grad_norm": 2.5574503210340755, + "learning_rate": 2.2591114867604887e-06, + "loss": 0.9673, + "step": 5093 + }, + { + "epoch": 0.5357382307702421, + "grad_norm": 1.81531777330057, + "learning_rate": 2.258280565151122e-06, + "loss": 1.005, + "step": 5094 + }, + { + "epoch": 0.5358434012120893, + "grad_norm": 2.8458384104973278, + "learning_rate": 2.25744967049535e-06, + "loss": 0.9489, + "step": 5095 + }, + { + "epoch": 0.5359485716539366, + "grad_norm": 2.1465556016022735, + "learning_rate": 2.2566188028858228e-06, + "loss": 0.995, + "step": 5096 + }, + { + "epoch": 0.5360537420957839, + "grad_norm": 2.3813284857921637, + "learning_rate": 2.2557879624151912e-06, + "loss": 1.0476, + "step": 5097 + }, + { + "epoch": 0.5361589125376313, + "grad_norm": 2.2720674366303735, + "learning_rate": 2.2549571491760985e-06, + "loss": 0.9967, + "step": 5098 + }, + { + "epoch": 0.5362640829794786, + "grad_norm": 2.1256470660057305, + "learning_rate": 2.254126363261188e-06, + "loss": 1.0248, + "step": 5099 + }, + { + "epoch": 0.5363692534213259, + "grad_norm": 1.430140563761567, + "learning_rate": 2.2532956047630973e-06, + "loss": 0.9834, + "step": 5100 + }, + { + "epoch": 0.5364744238631732, + "grad_norm": 2.2091020729494324, + "learning_rate": 2.252464873774462e-06, + "loss": 0.9964, + "step": 5101 + }, + { + "epoch": 0.5365795943050206, + "grad_norm": 3.05315371393753, + "learning_rate": 2.2516341703879176e-06, + "loss": 1.0133, + "step": 5102 + }, + { + "epoch": 0.5366847647468679, + "grad_norm": 2.67327232115545, + "learning_rate": 2.2508034946960924e-06, + "loss": 1.0006, + "step": 5103 + }, + { + "epoch": 0.5367899351887152, + "grad_norm": 2.990185755256598, + "learning_rate": 2.2499728467916133e-06, + "loss": 0.9785, + "step": 5104 + }, + { + "epoch": 0.5368951056305625, + "grad_norm": 2.391793174882368, + "learning_rate": 2.249142226767104e-06, + "loss": 0.9347, + "step": 5105 + }, + { + "epoch": 0.5370002760724099, + "grad_norm": 2.5608418817555014, + "learning_rate": 2.248311634715185e-06, + "loss": 1.0156, + "step": 5106 + }, + { + "epoch": 0.5371054465142572, + "grad_norm": 2.8618225744834067, + "learning_rate": 2.247481070728474e-06, + "loss": 1.029, + "step": 5107 + }, + { + "epoch": 0.5372106169561045, + "grad_norm": 1.5332374205582922, + "learning_rate": 2.2466505348995854e-06, + "loss": 0.9641, + "step": 5108 + }, + { + "epoch": 0.5373157873979518, + "grad_norm": 2.445318078146189, + "learning_rate": 2.24582002732113e-06, + 
"loss": 1.0037, + "step": 5109 + }, + { + "epoch": 0.5374209578397992, + "grad_norm": 1.948245870338096, + "learning_rate": 2.244989548085716e-06, + "loss": 0.9739, + "step": 5110 + }, + { + "epoch": 0.5375261282816465, + "grad_norm": 1.9108232074625904, + "learning_rate": 2.2441590972859484e-06, + "loss": 1.0196, + "step": 5111 + }, + { + "epoch": 0.5376312987234938, + "grad_norm": 2.239416100740601, + "learning_rate": 2.2433286750144293e-06, + "loss": 0.9678, + "step": 5112 + }, + { + "epoch": 0.5377364691653411, + "grad_norm": 2.205078947975664, + "learning_rate": 2.2424982813637567e-06, + "loss": 0.9857, + "step": 5113 + }, + { + "epoch": 0.5378416396071884, + "grad_norm": 2.253513657680878, + "learning_rate": 2.241667916426526e-06, + "loss": 0.9517, + "step": 5114 + }, + { + "epoch": 0.5379468100490358, + "grad_norm": 3.06969161503822, + "learning_rate": 2.240837580295329e-06, + "loss": 1.0048, + "step": 5115 + }, + { + "epoch": 0.538051980490883, + "grad_norm": 2.435849474370094, + "learning_rate": 2.2400072730627556e-06, + "loss": 0.9674, + "step": 5116 + }, + { + "epoch": 0.5381571509327303, + "grad_norm": 2.5677719895294433, + "learning_rate": 2.239176994821391e-06, + "loss": 0.9729, + "step": 5117 + }, + { + "epoch": 0.5382623213745776, + "grad_norm": 2.258970213664192, + "learning_rate": 2.2383467456638175e-06, + "loss": 0.9593, + "step": 5118 + }, + { + "epoch": 0.538367491816425, + "grad_norm": 2.4559982555422057, + "learning_rate": 2.237516525682614e-06, + "loss": 1.0028, + "step": 5119 + }, + { + "epoch": 0.5384726622582723, + "grad_norm": 3.0661627782794625, + "learning_rate": 2.2366863349703574e-06, + "loss": 0.9849, + "step": 5120 + }, + { + "epoch": 0.5385778327001196, + "grad_norm": 2.208903743452161, + "learning_rate": 2.23585617361962e-06, + "loss": 0.9887, + "step": 5121 + }, + { + "epoch": 0.5386830031419669, + "grad_norm": 2.5131994229325, + "learning_rate": 2.2350260417229715e-06, + "loss": 1.0212, + "step": 5122 + }, + { + "epoch": 0.5387881735838143, + "grad_norm": 2.513688036331137, + "learning_rate": 2.234195939372977e-06, + "loss": 1.011, + "step": 5123 + }, + { + "epoch": 0.5388933440256616, + "grad_norm": 1.9065741666317384, + "learning_rate": 2.2333658666621995e-06, + "loss": 0.9949, + "step": 5124 + }, + { + "epoch": 0.5389985144675089, + "grad_norm": 2.2999327606370494, + "learning_rate": 2.2325358236832e-06, + "loss": 0.9497, + "step": 5125 + }, + { + "epoch": 0.5391036849093562, + "grad_norm": 3.0371065795572543, + "learning_rate": 2.231705810528534e-06, + "loss": 1.0135, + "step": 5126 + }, + { + "epoch": 0.5392088553512036, + "grad_norm": 3.086589128157086, + "learning_rate": 2.230875827290755e-06, + "loss": 1.0293, + "step": 5127 + }, + { + "epoch": 0.5393140257930509, + "grad_norm": 2.6399411648602977, + "learning_rate": 2.2300458740624102e-06, + "loss": 0.9888, + "step": 5128 + }, + { + "epoch": 0.5394191962348982, + "grad_norm": 1.58725667452964, + "learning_rate": 2.2292159509360487e-06, + "loss": 0.9702, + "step": 5129 + }, + { + "epoch": 0.5395243666767455, + "grad_norm": 2.7241128959636027, + "learning_rate": 2.228386058004212e-06, + "loss": 1.0053, + "step": 5130 + }, + { + "epoch": 0.5396295371185929, + "grad_norm": 1.7958176439133673, + "learning_rate": 2.22755619535944e-06, + "loss": 0.9931, + "step": 5131 + }, + { + "epoch": 0.5397347075604402, + "grad_norm": 2.215221909482859, + "learning_rate": 2.2267263630942682e-06, + "loss": 1.0076, + "step": 5132 + }, + { + "epoch": 0.5398398780022875, + "grad_norm": 2.3342570809884267, + 
"learning_rate": 2.2258965613012293e-06, + "loss": 1.0169, + "step": 5133 + }, + { + "epoch": 0.5399450484441348, + "grad_norm": 2.2114565815704097, + "learning_rate": 2.2250667900728543e-06, + "loss": 1.0046, + "step": 5134 + }, + { + "epoch": 0.5400502188859821, + "grad_norm": 2.32994956342613, + "learning_rate": 2.224237049501668e-06, + "loss": 1.0162, + "step": 5135 + }, + { + "epoch": 0.5401553893278294, + "grad_norm": 2.2005296173151296, + "learning_rate": 2.223407339680192e-06, + "loss": 1.0203, + "step": 5136 + }, + { + "epoch": 0.5402605597696767, + "grad_norm": 2.71416231138807, + "learning_rate": 2.222577660700947e-06, + "loss": 0.9696, + "step": 5137 + }, + { + "epoch": 0.540365730211524, + "grad_norm": 2.3021954271008283, + "learning_rate": 2.2217480126564462e-06, + "loss": 1.0134, + "step": 5138 + }, + { + "epoch": 0.5404709006533713, + "grad_norm": 2.069021577288529, + "learning_rate": 2.220918395639205e-06, + "loss": 0.9861, + "step": 5139 + }, + { + "epoch": 0.5405760710952187, + "grad_norm": 2.113347825343487, + "learning_rate": 2.2200888097417308e-06, + "loss": 0.9842, + "step": 5140 + }, + { + "epoch": 0.540681241537066, + "grad_norm": 2.862415076458137, + "learning_rate": 2.219259255056528e-06, + "loss": 0.9512, + "step": 5141 + }, + { + "epoch": 0.5407864119789133, + "grad_norm": 2.494202998333855, + "learning_rate": 2.2184297316760998e-06, + "loss": 0.9957, + "step": 5142 + }, + { + "epoch": 0.5408915824207606, + "grad_norm": 2.5037856881206646, + "learning_rate": 2.2176002396929435e-06, + "loss": 1.0187, + "step": 5143 + }, + { + "epoch": 0.540996752862608, + "grad_norm": 2.300224957499141, + "learning_rate": 2.2167707791995547e-06, + "loss": 0.9993, + "step": 5144 + }, + { + "epoch": 0.5411019233044553, + "grad_norm": 1.7557662844334694, + "learning_rate": 2.2159413502884237e-06, + "loss": 0.9592, + "step": 5145 + }, + { + "epoch": 0.5412070937463026, + "grad_norm": 2.5946831278066744, + "learning_rate": 2.2151119530520394e-06, + "loss": 1.0363, + "step": 5146 + }, + { + "epoch": 0.5413122641881499, + "grad_norm": 2.6143394641387623, + "learning_rate": 2.2142825875828838e-06, + "loss": 0.9531, + "step": 5147 + }, + { + "epoch": 0.5414174346299973, + "grad_norm": 2.381679279813613, + "learning_rate": 2.2134532539734406e-06, + "loss": 1.0057, + "step": 5148 + }, + { + "epoch": 0.5415226050718446, + "grad_norm": 1.8202176335628701, + "learning_rate": 2.2126239523161854e-06, + "loss": 1.0132, + "step": 5149 + }, + { + "epoch": 0.5416277755136919, + "grad_norm": 2.30712857841413, + "learning_rate": 2.2117946827035927e-06, + "loss": 0.9888, + "step": 5150 + }, + { + "epoch": 0.5417329459555392, + "grad_norm": 2.5917572577099084, + "learning_rate": 2.2109654452281297e-06, + "loss": 0.9757, + "step": 5151 + }, + { + "epoch": 0.5418381163973865, + "grad_norm": 1.9879341436149915, + "learning_rate": 2.210136239982266e-06, + "loss": 1.0149, + "step": 5152 + }, + { + "epoch": 0.5419432868392339, + "grad_norm": 2.784187511162275, + "learning_rate": 2.2093070670584636e-06, + "loss": 0.9713, + "step": 5153 + }, + { + "epoch": 0.5420484572810812, + "grad_norm": 2.8215281669994905, + "learning_rate": 2.2084779265491817e-06, + "loss": 0.996, + "step": 5154 + }, + { + "epoch": 0.5421536277229285, + "grad_norm": 2.659872718452111, + "learning_rate": 2.207648818546875e-06, + "loss": 0.9979, + "step": 5155 + }, + { + "epoch": 0.5422587981647757, + "grad_norm": 2.4973238896568617, + "learning_rate": 2.206819743143996e-06, + "loss": 1.0018, + "step": 5156 + }, + { + "epoch": 
0.5423639686066231, + "grad_norm": 3.3899471789949573, + "learning_rate": 2.2059907004329934e-06, + "loss": 1.0076, + "step": 5157 + }, + { + "epoch": 0.5424691390484704, + "grad_norm": 3.008953501860152, + "learning_rate": 2.2051616905063112e-06, + "loss": 0.9814, + "step": 5158 + }, + { + "epoch": 0.5425743094903177, + "grad_norm": 2.4382683163743804, + "learning_rate": 2.2043327134563917e-06, + "loss": 0.9694, + "step": 5159 + }, + { + "epoch": 0.542679479932165, + "grad_norm": 2.4112838713428606, + "learning_rate": 2.203503769375671e-06, + "loss": 0.9944, + "step": 5160 + }, + { + "epoch": 0.5427846503740124, + "grad_norm": 2.3975472141330703, + "learning_rate": 2.2026748583565824e-06, + "loss": 0.9987, + "step": 5161 + }, + { + "epoch": 0.5428898208158597, + "grad_norm": 2.4732650547216, + "learning_rate": 2.201845980491558e-06, + "loss": 1.0106, + "step": 5162 + }, + { + "epoch": 0.542994991257707, + "grad_norm": 1.9440375264246743, + "learning_rate": 2.2010171358730227e-06, + "loss": 0.9711, + "step": 5163 + }, + { + "epoch": 0.5431001616995543, + "grad_norm": 2.950910241043643, + "learning_rate": 2.2001883245933992e-06, + "loss": 1.0359, + "step": 5164 + }, + { + "epoch": 0.5432053321414017, + "grad_norm": 1.7374181043854764, + "learning_rate": 2.199359546745106e-06, + "loss": 0.9894, + "step": 5165 + }, + { + "epoch": 0.543310502583249, + "grad_norm": 1.919204253825503, + "learning_rate": 2.19853080242056e-06, + "loss": 0.9888, + "step": 5166 + }, + { + "epoch": 0.5434156730250963, + "grad_norm": 2.194559285983916, + "learning_rate": 2.1977020917121707e-06, + "loss": 0.9831, + "step": 5167 + }, + { + "epoch": 0.5435208434669436, + "grad_norm": 2.616426607003015, + "learning_rate": 2.1968734147123467e-06, + "loss": 1.0058, + "step": 5168 + }, + { + "epoch": 0.543626013908791, + "grad_norm": 2.9485745152973513, + "learning_rate": 2.196044771513492e-06, + "loss": 1.0286, + "step": 5169 + }, + { + "epoch": 0.5437311843506383, + "grad_norm": 2.3426077436696096, + "learning_rate": 2.195216162208005e-06, + "loss": 0.9494, + "step": 5170 + }, + { + "epoch": 0.5438363547924856, + "grad_norm": 2.350276659584285, + "learning_rate": 2.1943875868882853e-06, + "loss": 1.0144, + "step": 5171 + }, + { + "epoch": 0.5439415252343329, + "grad_norm": 1.9725474379833872, + "learning_rate": 2.1935590456467232e-06, + "loss": 0.9755, + "step": 5172 + }, + { + "epoch": 0.5440466956761802, + "grad_norm": 2.4143702510240055, + "learning_rate": 2.192730538575708e-06, + "loss": 0.9701, + "step": 5173 + }, + { + "epoch": 0.5441518661180276, + "grad_norm": 2.1110930182865064, + "learning_rate": 2.191902065767624e-06, + "loss": 1.0355, + "step": 5174 + }, + { + "epoch": 0.5442570365598749, + "grad_norm": 4.361748721313425, + "learning_rate": 2.191073627314854e-06, + "loss": 0.9864, + "step": 5175 + }, + { + "epoch": 0.5443622070017222, + "grad_norm": 2.501366700470722, + "learning_rate": 2.1902452233097736e-06, + "loss": 0.9758, + "step": 5176 + }, + { + "epoch": 0.5444673774435694, + "grad_norm": 2.235469201389308, + "learning_rate": 2.1894168538447576e-06, + "loss": 1.0104, + "step": 5177 + }, + { + "epoch": 0.5445725478854168, + "grad_norm": 2.6134379475508367, + "learning_rate": 2.1885885190121753e-06, + "loss": 0.9644, + "step": 5178 + }, + { + "epoch": 0.5446777183272641, + "grad_norm": 2.6083073446693876, + "learning_rate": 2.187760218904392e-06, + "loss": 0.9883, + "step": 5179 + }, + { + "epoch": 0.5447828887691114, + "grad_norm": 2.888337664510672, + "learning_rate": 2.1869319536137693e-06, + "loss": 
0.9995, + "step": 5180 + }, + { + "epoch": 0.5448880592109587, + "grad_norm": 2.1326399057874914, + "learning_rate": 2.1861037232326666e-06, + "loss": 0.9857, + "step": 5181 + }, + { + "epoch": 0.544993229652806, + "grad_norm": 2.545298533806266, + "learning_rate": 2.1852755278534373e-06, + "loss": 0.9809, + "step": 5182 + }, + { + "epoch": 0.5450984000946534, + "grad_norm": 2.0108239660027656, + "learning_rate": 2.18444736756843e-06, + "loss": 1.0229, + "step": 5183 + }, + { + "epoch": 0.5452035705365007, + "grad_norm": 2.6337533910173057, + "learning_rate": 2.183619242469994e-06, + "loss": 0.9919, + "step": 5184 + }, + { + "epoch": 0.545308740978348, + "grad_norm": 2.5146353222564235, + "learning_rate": 2.1827911526504702e-06, + "loss": 0.9946, + "step": 5185 + }, + { + "epoch": 0.5454139114201954, + "grad_norm": 2.613024635280223, + "learning_rate": 2.1819630982021967e-06, + "loss": 0.9718, + "step": 5186 + }, + { + "epoch": 0.5455190818620427, + "grad_norm": 2.26349598516355, + "learning_rate": 2.1811350792175084e-06, + "loss": 1.0053, + "step": 5187 + }, + { + "epoch": 0.54562425230389, + "grad_norm": 2.3470184717219422, + "learning_rate": 2.1803070957887348e-06, + "loss": 0.9996, + "step": 5188 + }, + { + "epoch": 0.5457294227457373, + "grad_norm": 2.476693756773956, + "learning_rate": 2.1794791480082046e-06, + "loss": 1.0368, + "step": 5189 + }, + { + "epoch": 0.5458345931875846, + "grad_norm": 2.836739750714608, + "learning_rate": 2.1786512359682394e-06, + "loss": 0.9856, + "step": 5190 + }, + { + "epoch": 0.545939763629432, + "grad_norm": 2.120438515765004, + "learning_rate": 2.1778233597611576e-06, + "loss": 0.9753, + "step": 5191 + }, + { + "epoch": 0.5460449340712793, + "grad_norm": 2.085336690843402, + "learning_rate": 2.1769955194792737e-06, + "loss": 1.0106, + "step": 5192 + }, + { + "epoch": 0.5461501045131266, + "grad_norm": 2.388620073312146, + "learning_rate": 2.176167715214898e-06, + "loss": 0.9767, + "step": 5193 + }, + { + "epoch": 0.546255274954974, + "grad_norm": 2.71199718772841, + "learning_rate": 2.1753399470603387e-06, + "loss": 1.0116, + "step": 5194 + }, + { + "epoch": 0.5463604453968213, + "grad_norm": 2.6031851920934397, + "learning_rate": 2.174512215107897e-06, + "loss": 0.997, + "step": 5195 + }, + { + "epoch": 0.5464656158386686, + "grad_norm": 2.5270157415798815, + "learning_rate": 2.173684519449872e-06, + "loss": 1.0268, + "step": 5196 + }, + { + "epoch": 0.5465707862805158, + "grad_norm": 3.0251465048345607, + "learning_rate": 2.1728568601785564e-06, + "loss": 1.0076, + "step": 5197 + }, + { + "epoch": 0.5466759567223631, + "grad_norm": 3.213521672648237, + "learning_rate": 2.172029237386244e-06, + "loss": 1.0297, + "step": 5198 + }, + { + "epoch": 0.5467811271642105, + "grad_norm": 2.4599343033374774, + "learning_rate": 2.1712016511652187e-06, + "loss": 1.0106, + "step": 5199 + }, + { + "epoch": 0.5468862976060578, + "grad_norm": 2.744190371006768, + "learning_rate": 2.170374101607764e-06, + "loss": 1.0298, + "step": 5200 + }, + { + "epoch": 0.5469914680479051, + "grad_norm": 2.49369766763488, + "learning_rate": 2.169546588806158e-06, + "loss": 1.0502, + "step": 5201 + }, + { + "epoch": 0.5470966384897524, + "grad_norm": 2.381587412730638, + "learning_rate": 2.168719112852673e-06, + "loss": 0.9862, + "step": 5202 + }, + { + "epoch": 0.5472018089315998, + "grad_norm": 3.9803983911874736, + "learning_rate": 2.167891673839581e-06, + "loss": 1.0081, + "step": 5203 + }, + { + "epoch": 0.5473069793734471, + "grad_norm": 2.8513338869013287, + 
"learning_rate": 2.1670642718591477e-06, + "loss": 0.9999, + "step": 5204 + }, + { + "epoch": 0.5474121498152944, + "grad_norm": 2.087249392253148, + "learning_rate": 2.166236907003634e-06, + "loss": 0.9908, + "step": 5205 + }, + { + "epoch": 0.5475173202571417, + "grad_norm": 2.1537694107930956, + "learning_rate": 2.1654095793652975e-06, + "loss": 0.9787, + "step": 5206 + }, + { + "epoch": 0.547622490698989, + "grad_norm": 2.398002826051045, + "learning_rate": 2.1645822890363928e-06, + "loss": 1.0128, + "step": 5207 + }, + { + "epoch": 0.5477276611408364, + "grad_norm": 1.8014888313043216, + "learning_rate": 2.1637550361091685e-06, + "loss": 0.9712, + "step": 5208 + }, + { + "epoch": 0.5478328315826837, + "grad_norm": 2.6606854949977783, + "learning_rate": 2.16292782067587e-06, + "loss": 1.0116, + "step": 5209 + }, + { + "epoch": 0.547938002024531, + "grad_norm": 2.688100293619295, + "learning_rate": 2.162100642828737e-06, + "loss": 1.0044, + "step": 5210 + }, + { + "epoch": 0.5480431724663783, + "grad_norm": 2.4430646701220544, + "learning_rate": 2.161273502660007e-06, + "loss": 1.0112, + "step": 5211 + }, + { + "epoch": 0.5481483429082257, + "grad_norm": 2.0241794197127927, + "learning_rate": 2.1604464002619135e-06, + "loss": 1.0267, + "step": 5212 + }, + { + "epoch": 0.548253513350073, + "grad_norm": 3.178008506690804, + "learning_rate": 2.1596193357266844e-06, + "loss": 0.9995, + "step": 5213 + }, + { + "epoch": 0.5483586837919203, + "grad_norm": 2.161378205518831, + "learning_rate": 2.1587923091465434e-06, + "loss": 0.976, + "step": 5214 + }, + { + "epoch": 0.5484638542337676, + "grad_norm": 2.2458856326219885, + "learning_rate": 2.1579653206137104e-06, + "loss": 1.0095, + "step": 5215 + }, + { + "epoch": 0.548569024675615, + "grad_norm": 2.276838712981099, + "learning_rate": 2.1571383702204006e-06, + "loss": 0.9981, + "step": 5216 + }, + { + "epoch": 0.5486741951174622, + "grad_norm": 2.5745480726564693, + "learning_rate": 2.1563114580588267e-06, + "loss": 1.0043, + "step": 5217 + }, + { + "epoch": 0.5487793655593095, + "grad_norm": 1.8966939253835509, + "learning_rate": 2.1554845842211954e-06, + "loss": 1.0158, + "step": 5218 + }, + { + "epoch": 0.5488845360011568, + "grad_norm": 2.0953028195766423, + "learning_rate": 2.1546577487997087e-06, + "loss": 0.9859, + "step": 5219 + }, + { + "epoch": 0.5489897064430042, + "grad_norm": 2.110354152880763, + "learning_rate": 2.1538309518865646e-06, + "loss": 0.9917, + "step": 5220 + }, + { + "epoch": 0.5490948768848515, + "grad_norm": 2.3921433816101945, + "learning_rate": 2.1530041935739604e-06, + "loss": 0.9732, + "step": 5221 + }, + { + "epoch": 0.5492000473266988, + "grad_norm": 1.5432785190992437, + "learning_rate": 2.1521774739540833e-06, + "loss": 0.9604, + "step": 5222 + }, + { + "epoch": 0.5493052177685461, + "grad_norm": 2.6627689190811914, + "learning_rate": 2.1513507931191203e-06, + "loss": 1.0003, + "step": 5223 + }, + { + "epoch": 0.5494103882103935, + "grad_norm": 2.4129431035360427, + "learning_rate": 2.1505241511612522e-06, + "loss": 0.9286, + "step": 5224 + }, + { + "epoch": 0.5495155586522408, + "grad_norm": 2.9528788317691554, + "learning_rate": 2.149697548172655e-06, + "loss": 0.9514, + "step": 5225 + }, + { + "epoch": 0.5496207290940881, + "grad_norm": 2.2730624529958146, + "learning_rate": 2.1488709842455033e-06, + "loss": 0.9983, + "step": 5226 + }, + { + "epoch": 0.5497258995359354, + "grad_norm": 3.166867500015208, + "learning_rate": 2.1480444594719647e-06, + "loss": 1.0307, + "step": 5227 + }, + { + "epoch": 
0.5498310699777827, + "grad_norm": 2.4113920354615916, + "learning_rate": 2.1472179739442027e-06, + "loss": 0.9667, + "step": 5228 + }, + { + "epoch": 0.5499362404196301, + "grad_norm": 2.518675037782677, + "learning_rate": 2.1463915277543766e-06, + "loss": 1.0238, + "step": 5229 + }, + { + "epoch": 0.5500414108614774, + "grad_norm": 2.6250890442322454, + "learning_rate": 2.1455651209946428e-06, + "loss": 0.9742, + "step": 5230 + }, + { + "epoch": 0.5501465813033247, + "grad_norm": 2.02618334080473, + "learning_rate": 2.144738753757151e-06, + "loss": 0.9678, + "step": 5231 + }, + { + "epoch": 0.550251751745172, + "grad_norm": 2.667267963473209, + "learning_rate": 2.1439124261340484e-06, + "loss": 0.9861, + "step": 5232 + }, + { + "epoch": 0.5503569221870194, + "grad_norm": 2.1122311481039597, + "learning_rate": 2.1430861382174763e-06, + "loss": 0.9488, + "step": 5233 + }, + { + "epoch": 0.5504620926288667, + "grad_norm": 1.7454212223945111, + "learning_rate": 2.1422598900995715e-06, + "loss": 0.9995, + "step": 5234 + }, + { + "epoch": 0.550567263070714, + "grad_norm": 2.5491840982703975, + "learning_rate": 2.1414336818724685e-06, + "loss": 0.9631, + "step": 5235 + }, + { + "epoch": 0.5506724335125613, + "grad_norm": 2.0258659355875044, + "learning_rate": 2.140607513628296e-06, + "loss": 0.9681, + "step": 5236 + }, + { + "epoch": 0.5507776039544087, + "grad_norm": 3.187827818577956, + "learning_rate": 2.1397813854591778e-06, + "loss": 0.962, + "step": 5237 + }, + { + "epoch": 0.5508827743962559, + "grad_norm": 2.1053841231191734, + "learning_rate": 2.138955297457233e-06, + "loss": 0.9881, + "step": 5238 + }, + { + "epoch": 0.5509879448381032, + "grad_norm": 2.0350361564304102, + "learning_rate": 2.138129249714576e-06, + "loss": 0.978, + "step": 5239 + }, + { + "epoch": 0.5510931152799505, + "grad_norm": 2.624945121824399, + "learning_rate": 2.13730324232332e-06, + "loss": 0.9699, + "step": 5240 + }, + { + "epoch": 0.5511982857217979, + "grad_norm": 3.242345363363925, + "learning_rate": 2.13647727537557e-06, + "loss": 0.9856, + "step": 5241 + }, + { + "epoch": 0.5513034561636452, + "grad_norm": 2.8837592921549278, + "learning_rate": 2.135651348963428e-06, + "loss": 1.0201, + "step": 5242 + }, + { + "epoch": 0.5514086266054925, + "grad_norm": 2.3013474493107795, + "learning_rate": 2.1348254631789895e-06, + "loss": 0.9774, + "step": 5243 + }, + { + "epoch": 0.5515137970473398, + "grad_norm": 2.113009140432173, + "learning_rate": 2.13399961811435e-06, + "loss": 0.976, + "step": 5244 + }, + { + "epoch": 0.5516189674891872, + "grad_norm": 2.6604980643288236, + "learning_rate": 2.133173813861596e-06, + "loss": 1.0091, + "step": 5245 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 3.1134737984113303, + "learning_rate": 2.1323480505128115e-06, + "loss": 0.9967, + "step": 5246 + }, + { + "epoch": 0.5518293083728818, + "grad_norm": 2.167403323911424, + "learning_rate": 2.131522328160075e-06, + "loss": 0.9711, + "step": 5247 + }, + { + "epoch": 0.5519344788147291, + "grad_norm": 2.3379019799741236, + "learning_rate": 2.130696646895461e-06, + "loss": 0.9972, + "step": 5248 + }, + { + "epoch": 0.5520396492565764, + "grad_norm": 2.059963339031156, + "learning_rate": 2.12987100681104e-06, + "loss": 0.9488, + "step": 5249 + }, + { + "epoch": 0.5521448196984238, + "grad_norm": 2.4196742713351194, + "learning_rate": 2.1290454079988775e-06, + "loss": 1.0105, + "step": 5250 + }, + { + "epoch": 0.5522499901402711, + "grad_norm": 2.462036105246982, + "learning_rate": 2.128219850551034e-06, + "loss": 
0.9945, + "step": 5251 + }, + { + "epoch": 0.5523551605821184, + "grad_norm": 1.8416505320697634, + "learning_rate": 2.1273943345595637e-06, + "loss": 1.0043, + "step": 5252 + }, + { + "epoch": 0.5524603310239657, + "grad_norm": 3.251515812829299, + "learning_rate": 2.1265688601165206e-06, + "loss": 1.0242, + "step": 5253 + }, + { + "epoch": 0.5525655014658131, + "grad_norm": 2.6527329475849313, + "learning_rate": 2.125743427313951e-06, + "loss": 1.0045, + "step": 5254 + }, + { + "epoch": 0.5526706719076604, + "grad_norm": 1.9950343766089205, + "learning_rate": 2.124918036243896e-06, + "loss": 0.9664, + "step": 5255 + }, + { + "epoch": 0.5527758423495077, + "grad_norm": 2.2943708516221455, + "learning_rate": 2.124092686998394e-06, + "loss": 1.0111, + "step": 5256 + }, + { + "epoch": 0.552881012791355, + "grad_norm": 1.6382639178089367, + "learning_rate": 2.123267379669477e-06, + "loss": 1.0042, + "step": 5257 + }, + { + "epoch": 0.5529861832332023, + "grad_norm": 2.9901639984780255, + "learning_rate": 2.122442114349174e-06, + "loss": 1.0122, + "step": 5258 + }, + { + "epoch": 0.5530913536750496, + "grad_norm": 2.1998167103396287, + "learning_rate": 2.1216168911295085e-06, + "loss": 1.0353, + "step": 5259 + }, + { + "epoch": 0.5531965241168969, + "grad_norm": 2.9897721880211736, + "learning_rate": 2.120791710102499e-06, + "loss": 1.0083, + "step": 5260 + }, + { + "epoch": 0.5533016945587442, + "grad_norm": 2.054680773983714, + "learning_rate": 2.1199665713601593e-06, + "loss": 1.0036, + "step": 5261 + }, + { + "epoch": 0.5534068650005916, + "grad_norm": 2.617746462194278, + "learning_rate": 2.1191414749944985e-06, + "loss": 0.9665, + "step": 5262 + }, + { + "epoch": 0.5535120354424389, + "grad_norm": 1.9936196114564972, + "learning_rate": 2.1183164210975226e-06, + "loss": 1.0104, + "step": 5263 + }, + { + "epoch": 0.5536172058842862, + "grad_norm": 2.469231832845493, + "learning_rate": 2.1174914097612308e-06, + "loss": 0.994, + "step": 5264 + }, + { + "epoch": 0.5537223763261335, + "grad_norm": 2.93259002955845, + "learning_rate": 2.1166664410776184e-06, + "loss": 1.0127, + "step": 5265 + }, + { + "epoch": 0.5538275467679808, + "grad_norm": 2.0537797985124326, + "learning_rate": 2.1158415151386746e-06, + "loss": 0.9471, + "step": 5266 + }, + { + "epoch": 0.5539327172098282, + "grad_norm": 2.134532328734205, + "learning_rate": 2.115016632036387e-06, + "loss": 0.9859, + "step": 5267 + }, + { + "epoch": 0.5540378876516755, + "grad_norm": 2.2964327708883134, + "learning_rate": 2.1141917918627357e-06, + "loss": 1.0047, + "step": 5268 + }, + { + "epoch": 0.5541430580935228, + "grad_norm": 2.8516902511053535, + "learning_rate": 2.113366994709697e-06, + "loss": 1.0079, + "step": 5269 + }, + { + "epoch": 0.5542482285353701, + "grad_norm": 2.3546571378436743, + "learning_rate": 2.1125422406692416e-06, + "loss": 0.9691, + "step": 5270 + }, + { + "epoch": 0.5543533989772175, + "grad_norm": 2.630944240951188, + "learning_rate": 2.1117175298333347e-06, + "loss": 0.983, + "step": 5271 + }, + { + "epoch": 0.5544585694190648, + "grad_norm": 1.8823762990892248, + "learning_rate": 2.1108928622939413e-06, + "loss": 0.9626, + "step": 5272 + }, + { + "epoch": 0.5545637398609121, + "grad_norm": 2.355887099553958, + "learning_rate": 2.110068238143016e-06, + "loss": 0.9441, + "step": 5273 + }, + { + "epoch": 0.5546689103027594, + "grad_norm": 2.4481500085464205, + "learning_rate": 2.1092436574725113e-06, + "loss": 1.0182, + "step": 5274 + }, + { + "epoch": 0.5547740807446068, + "grad_norm": 2.088588569858711, + 
"learning_rate": 2.1084191203743732e-06, + "loss": 0.9869, + "step": 5275 + }, + { + "epoch": 0.5548792511864541, + "grad_norm": 2.935034768068429, + "learning_rate": 2.1075946269405464e-06, + "loss": 1.0415, + "step": 5276 + }, + { + "epoch": 0.5549844216283014, + "grad_norm": 2.511909936791334, + "learning_rate": 2.106770177262967e-06, + "loss": 1.0, + "step": 5277 + }, + { + "epoch": 0.5550895920701486, + "grad_norm": 2.489430938185479, + "learning_rate": 2.105945771433567e-06, + "loss": 1.013, + "step": 5278 + }, + { + "epoch": 0.555194762511996, + "grad_norm": 2.8817643395461388, + "learning_rate": 2.105121409544275e-06, + "loss": 1.002, + "step": 5279 + }, + { + "epoch": 0.5552999329538433, + "grad_norm": 2.9876800647989525, + "learning_rate": 2.104297091687013e-06, + "loss": 0.9837, + "step": 5280 + }, + { + "epoch": 0.5554051033956906, + "grad_norm": 2.521676022126096, + "learning_rate": 2.1034728179536996e-06, + "loss": 0.9923, + "step": 5281 + }, + { + "epoch": 0.5555102738375379, + "grad_norm": 2.3179342164652317, + "learning_rate": 2.102648588436247e-06, + "loss": 1.0151, + "step": 5282 + }, + { + "epoch": 0.5556154442793853, + "grad_norm": 2.328687099694681, + "learning_rate": 2.101824403226564e-06, + "loss": 0.9904, + "step": 5283 + }, + { + "epoch": 0.5557206147212326, + "grad_norm": 2.4299365708893346, + "learning_rate": 2.1010002624165528e-06, + "loss": 1.0372, + "step": 5284 + }, + { + "epoch": 0.5558257851630799, + "grad_norm": 1.9606388049139083, + "learning_rate": 2.100176166098111e-06, + "loss": 0.9814, + "step": 5285 + }, + { + "epoch": 0.5559309556049272, + "grad_norm": 1.6767864003425093, + "learning_rate": 2.0993521143631335e-06, + "loss": 0.9458, + "step": 5286 + }, + { + "epoch": 0.5560361260467745, + "grad_norm": 2.1286537540888175, + "learning_rate": 2.098528107303508e-06, + "loss": 0.962, + "step": 5287 + }, + { + "epoch": 0.5561412964886219, + "grad_norm": 2.5220581659039674, + "learning_rate": 2.0977041450111173e-06, + "loss": 1.0144, + "step": 5288 + }, + { + "epoch": 0.5562464669304692, + "grad_norm": 2.168639158568607, + "learning_rate": 2.0968802275778384e-06, + "loss": 1.0465, + "step": 5289 + }, + { + "epoch": 0.5563516373723165, + "grad_norm": 2.760268469404148, + "learning_rate": 2.0960563550955465e-06, + "loss": 0.9649, + "step": 5290 + }, + { + "epoch": 0.5564568078141638, + "grad_norm": 2.1553647272545158, + "learning_rate": 2.095232527656109e-06, + "loss": 0.999, + "step": 5291 + }, + { + "epoch": 0.5565619782560112, + "grad_norm": 1.619929018860887, + "learning_rate": 2.0944087453513887e-06, + "loss": 0.977, + "step": 5292 + }, + { + "epoch": 0.5566671486978585, + "grad_norm": 3.121330671866382, + "learning_rate": 2.0935850082732444e-06, + "loss": 1.0139, + "step": 5293 + }, + { + "epoch": 0.5567723191397058, + "grad_norm": 2.3387671861808363, + "learning_rate": 2.0927613165135285e-06, + "loss": 1.0007, + "step": 5294 + }, + { + "epoch": 0.5568774895815531, + "grad_norm": 2.200825022390463, + "learning_rate": 2.0919376701640897e-06, + "loss": 0.9796, + "step": 5295 + }, + { + "epoch": 0.5569826600234005, + "grad_norm": 2.3884861298168962, + "learning_rate": 2.0911140693167703e-06, + "loss": 0.9875, + "step": 5296 + }, + { + "epoch": 0.5570878304652478, + "grad_norm": 2.5987592283460597, + "learning_rate": 2.0902905140634087e-06, + "loss": 1.0331, + "step": 5297 + }, + { + "epoch": 0.5571930009070951, + "grad_norm": 3.301654294448108, + "learning_rate": 2.0894670044958364e-06, + "loss": 1.0113, + "step": 5298 + }, + { + "epoch": 
0.5572981713489423, + "grad_norm": 2.711878855343128, + "learning_rate": 2.0886435407058836e-06, + "loss": 1.0027, + "step": 5299 + }, + { + "epoch": 0.5574033417907897, + "grad_norm": 2.8480009398658406, + "learning_rate": 2.087820122785371e-06, + "loss": 0.974, + "step": 5300 + }, + { + "epoch": 0.557508512232637, + "grad_norm": 2.3312290115794108, + "learning_rate": 2.0869967508261175e-06, + "loss": 0.9928, + "step": 5301 + }, + { + "epoch": 0.5576136826744843, + "grad_norm": 1.936637576330423, + "learning_rate": 2.086173424919934e-06, + "loss": 1.0031, + "step": 5302 + }, + { + "epoch": 0.5577188531163316, + "grad_norm": 1.902158601451205, + "learning_rate": 2.085350145158628e-06, + "loss": 1.0094, + "step": 5303 + }, + { + "epoch": 0.557824023558179, + "grad_norm": 2.1117431500701827, + "learning_rate": 2.084526911634002e-06, + "loss": 0.9817, + "step": 5304 + }, + { + "epoch": 0.5579291940000263, + "grad_norm": 2.541061650720203, + "learning_rate": 2.0837037244378534e-06, + "loss": 0.9666, + "step": 5305 + }, + { + "epoch": 0.5580343644418736, + "grad_norm": 2.1751713964862716, + "learning_rate": 2.082880583661973e-06, + "loss": 0.9924, + "step": 5306 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 2.0010450338415353, + "learning_rate": 2.082057489398148e-06, + "loss": 1.012, + "step": 5307 + }, + { + "epoch": 0.5582447053255682, + "grad_norm": 2.476716525076265, + "learning_rate": 2.0812344417381595e-06, + "loss": 1.0308, + "step": 5308 + }, + { + "epoch": 0.5583498757674156, + "grad_norm": 2.563557102937592, + "learning_rate": 2.0804114407737837e-06, + "loss": 0.9702, + "step": 5309 + }, + { + "epoch": 0.5584550462092629, + "grad_norm": 2.7438116844536573, + "learning_rate": 2.0795884865967922e-06, + "loss": 1.0164, + "step": 5310 + }, + { + "epoch": 0.5585602166511102, + "grad_norm": 2.8471554577442273, + "learning_rate": 2.07876557929895e-06, + "loss": 1.007, + "step": 5311 + }, + { + "epoch": 0.5586653870929575, + "grad_norm": 2.018132015140999, + "learning_rate": 2.077942718972017e-06, + "loss": 1.0139, + "step": 5312 + }, + { + "epoch": 0.5587705575348049, + "grad_norm": 2.125285679282089, + "learning_rate": 2.0771199057077507e-06, + "loss": 1.028, + "step": 5313 + }, + { + "epoch": 0.5588757279766522, + "grad_norm": 2.7625535043659775, + "learning_rate": 2.0762971395978996e-06, + "loss": 1.0062, + "step": 5314 + }, + { + "epoch": 0.5589808984184995, + "grad_norm": 2.1017174794213997, + "learning_rate": 2.0754744207342097e-06, + "loss": 1.0082, + "step": 5315 + }, + { + "epoch": 0.5590860688603468, + "grad_norm": 1.782677163365059, + "learning_rate": 2.074651749208419e-06, + "loss": 0.9765, + "step": 5316 + }, + { + "epoch": 0.5591912393021942, + "grad_norm": 3.167958590956396, + "learning_rate": 2.0738291251122624e-06, + "loss": 0.9923, + "step": 5317 + }, + { + "epoch": 0.5592964097440415, + "grad_norm": 1.907619393947842, + "learning_rate": 2.073006548537469e-06, + "loss": 1.0287, + "step": 5318 + }, + { + "epoch": 0.5594015801858887, + "grad_norm": 2.7345986219575256, + "learning_rate": 2.0721840195757626e-06, + "loss": 0.9527, + "step": 5319 + }, + { + "epoch": 0.559506750627736, + "grad_norm": 2.4886965308181384, + "learning_rate": 2.0713615383188615e-06, + "loss": 1.0136, + "step": 5320 + }, + { + "epoch": 0.5596119210695834, + "grad_norm": 2.6055536744435512, + "learning_rate": 2.0705391048584775e-06, + "loss": 0.9965, + "step": 5321 + }, + { + "epoch": 0.5597170915114307, + "grad_norm": 2.087005530742932, + "learning_rate": 2.0697167192863205e-06, + "loss": 
1.0241, + "step": 5322 + }, + { + "epoch": 0.559822261953278, + "grad_norm": 2.5369112749685554, + "learning_rate": 2.0688943816940927e-06, + "loss": 1.0075, + "step": 5323 + }, + { + "epoch": 0.5599274323951253, + "grad_norm": 2.7221541004536953, + "learning_rate": 2.0680720921734894e-06, + "loss": 0.9315, + "step": 5324 + }, + { + "epoch": 0.5600326028369726, + "grad_norm": 2.022091119318397, + "learning_rate": 2.067249850816203e-06, + "loss": 0.9864, + "step": 5325 + }, + { + "epoch": 0.56013777327882, + "grad_norm": 2.3688556004256567, + "learning_rate": 2.0664276577139193e-06, + "loss": 1.0195, + "step": 5326 + }, + { + "epoch": 0.5602429437206673, + "grad_norm": 2.4773968134541096, + "learning_rate": 2.065605512958321e-06, + "loss": 1.0232, + "step": 5327 + }, + { + "epoch": 0.5603481141625146, + "grad_norm": 2.462817643536204, + "learning_rate": 2.0647834166410825e-06, + "loss": 0.9874, + "step": 5328 + }, + { + "epoch": 0.560453284604362, + "grad_norm": 2.305518728419192, + "learning_rate": 2.0639613688538733e-06, + "loss": 0.9921, + "step": 5329 + }, + { + "epoch": 0.5605584550462093, + "grad_norm": 2.6367842405171005, + "learning_rate": 2.063139369688359e-06, + "loss": 0.9613, + "step": 5330 + }, + { + "epoch": 0.5606636254880566, + "grad_norm": 1.8818850010645942, + "learning_rate": 2.062317419236199e-06, + "loss": 0.9777, + "step": 5331 + }, + { + "epoch": 0.5607687959299039, + "grad_norm": 2.1790160668617444, + "learning_rate": 2.0614955175890464e-06, + "loss": 1.0092, + "step": 5332 + }, + { + "epoch": 0.5608739663717512, + "grad_norm": 2.0639573437892857, + "learning_rate": 2.06067366483855e-06, + "loss": 1.0138, + "step": 5333 + }, + { + "epoch": 0.5609791368135986, + "grad_norm": 1.9579365333347298, + "learning_rate": 2.0598518610763534e-06, + "loss": 0.9952, + "step": 5334 + }, + { + "epoch": 0.5610843072554459, + "grad_norm": 2.2186014741559656, + "learning_rate": 2.0590301063940917e-06, + "loss": 0.9515, + "step": 5335 + }, + { + "epoch": 0.5611894776972932, + "grad_norm": 1.9011562436125449, + "learning_rate": 2.0582084008834003e-06, + "loss": 0.9342, + "step": 5336 + }, + { + "epoch": 0.5612946481391405, + "grad_norm": 1.8095902323216666, + "learning_rate": 2.057386744635904e-06, + "loss": 1.0039, + "step": 5337 + }, + { + "epoch": 0.5613998185809879, + "grad_norm": 1.8780136670652687, + "learning_rate": 2.056565137743224e-06, + "loss": 0.9827, + "step": 5338 + }, + { + "epoch": 0.5615049890228351, + "grad_norm": 3.2992844080330874, + "learning_rate": 2.055743580296976e-06, + "loss": 0.9805, + "step": 5339 + }, + { + "epoch": 0.5616101594646824, + "grad_norm": 3.092310843970352, + "learning_rate": 2.0549220723887687e-06, + "loss": 1.0094, + "step": 5340 + }, + { + "epoch": 0.5617153299065297, + "grad_norm": 2.4046006250279435, + "learning_rate": 2.0541006141102086e-06, + "loss": 0.967, + "step": 5341 + }, + { + "epoch": 0.561820500348377, + "grad_norm": 2.1278856112151336, + "learning_rate": 2.0532792055528946e-06, + "loss": 1.012, + "step": 5342 + }, + { + "epoch": 0.5619256707902244, + "grad_norm": 3.1700362603693475, + "learning_rate": 2.052457846808419e-06, + "loss": 0.9602, + "step": 5343 + }, + { + "epoch": 0.5620308412320717, + "grad_norm": 1.9326932220836064, + "learning_rate": 2.0516365379683694e-06, + "loss": 0.9952, + "step": 5344 + }, + { + "epoch": 0.562136011673919, + "grad_norm": 2.7114145309970086, + "learning_rate": 2.0508152791243296e-06, + "loss": 0.948, + "step": 5345 + }, + { + "epoch": 0.5622411821157663, + "grad_norm": 2.3203144056712706, + 
"learning_rate": 2.0499940703678755e-06, + "loss": 0.9971, + "step": 5346 + }, + { + "epoch": 0.5623463525576137, + "grad_norm": 2.3494148393308154, + "learning_rate": 2.049172911790578e-06, + "loss": 0.9742, + "step": 5347 + }, + { + "epoch": 0.562451522999461, + "grad_norm": 2.0875975574978782, + "learning_rate": 2.0483518034840034e-06, + "loss": 0.9948, + "step": 5348 + }, + { + "epoch": 0.5625566934413083, + "grad_norm": 1.970777139152171, + "learning_rate": 2.0475307455397103e-06, + "loss": 0.946, + "step": 5349 + }, + { + "epoch": 0.5626618638831556, + "grad_norm": 2.6489850276180555, + "learning_rate": 2.0467097380492547e-06, + "loss": 1.0242, + "step": 5350 + }, + { + "epoch": 0.562767034325003, + "grad_norm": 2.199452020540128, + "learning_rate": 2.0458887811041842e-06, + "loss": 1.0094, + "step": 5351 + }, + { + "epoch": 0.5628722047668503, + "grad_norm": 2.3720136627286084, + "learning_rate": 2.045067874796043e-06, + "loss": 0.9981, + "step": 5352 + }, + { + "epoch": 0.5629773752086976, + "grad_norm": 2.5209154043777073, + "learning_rate": 2.044247019216367e-06, + "loss": 1.0028, + "step": 5353 + }, + { + "epoch": 0.5630825456505449, + "grad_norm": 2.5610925238059186, + "learning_rate": 2.0434262144566895e-06, + "loss": 1.0665, + "step": 5354 + }, + { + "epoch": 0.5631877160923923, + "grad_norm": 2.897831403216789, + "learning_rate": 2.0426054606085356e-06, + "loss": 1.0275, + "step": 5355 + }, + { + "epoch": 0.5632928865342396, + "grad_norm": 2.376676302583607, + "learning_rate": 2.0417847577634263e-06, + "loss": 0.9664, + "step": 5356 + }, + { + "epoch": 0.5633980569760869, + "grad_norm": 1.902345338626602, + "learning_rate": 2.040964106012876e-06, + "loss": 0.964, + "step": 5357 + }, + { + "epoch": 0.5635032274179342, + "grad_norm": 2.8141159096441513, + "learning_rate": 2.0401435054483925e-06, + "loss": 0.959, + "step": 5358 + }, + { + "epoch": 0.5636083978597816, + "grad_norm": 2.213475597622369, + "learning_rate": 2.0393229561614817e-06, + "loss": 0.9913, + "step": 5359 + }, + { + "epoch": 0.5637135683016288, + "grad_norm": 2.8077710198008616, + "learning_rate": 2.03850245824364e-06, + "loss": 0.9795, + "step": 5360 + }, + { + "epoch": 0.5638187387434761, + "grad_norm": 1.7009560943465891, + "learning_rate": 2.037682011786359e-06, + "loss": 0.9519, + "step": 5361 + }, + { + "epoch": 0.5639239091853234, + "grad_norm": 1.9158342012022915, + "learning_rate": 2.036861616881125e-06, + "loss": 1.0132, + "step": 5362 + }, + { + "epoch": 0.5640290796271707, + "grad_norm": 2.4899098257579384, + "learning_rate": 2.036041273619418e-06, + "loss": 0.9862, + "step": 5363 + }, + { + "epoch": 0.5641342500690181, + "grad_norm": 2.8756220189009487, + "learning_rate": 2.035220982092714e-06, + "loss": 0.9549, + "step": 5364 + }, + { + "epoch": 0.5642394205108654, + "grad_norm": 2.466341733072218, + "learning_rate": 2.0344007423924807e-06, + "loss": 0.9962, + "step": 5365 + }, + { + "epoch": 0.5643445909527127, + "grad_norm": 2.1782294813736254, + "learning_rate": 2.0335805546101817e-06, + "loss": 0.9483, + "step": 5366 + }, + { + "epoch": 0.56444976139456, + "grad_norm": 2.117012458223317, + "learning_rate": 2.0327604188372735e-06, + "loss": 1.0088, + "step": 5367 + }, + { + "epoch": 0.5645549318364074, + "grad_norm": 1.9302647942163518, + "learning_rate": 2.0319403351652086e-06, + "loss": 0.9985, + "step": 5368 + }, + { + "epoch": 0.5646601022782547, + "grad_norm": 2.2737930556012915, + "learning_rate": 2.0311203036854326e-06, + "loss": 0.9287, + "step": 5369 + }, + { + "epoch": 
0.564765272720102, + "grad_norm": 1.6722842954733659, + "learning_rate": 2.0303003244893853e-06, + "loss": 1.0125, + "step": 5370 + }, + { + "epoch": 0.5648704431619493, + "grad_norm": 2.9762602394882434, + "learning_rate": 2.0294803976685006e-06, + "loss": 0.9461, + "step": 5371 + }, + { + "epoch": 0.5649756136037967, + "grad_norm": 2.979884850333689, + "learning_rate": 2.028660523314205e-06, + "loss": 1.0165, + "step": 5372 + }, + { + "epoch": 0.565080784045644, + "grad_norm": 2.4718005740112305, + "learning_rate": 2.0278407015179243e-06, + "loss": 0.9953, + "step": 5373 + }, + { + "epoch": 0.5651859544874913, + "grad_norm": 2.038230117240658, + "learning_rate": 2.027020932371073e-06, + "loss": 0.9773, + "step": 5374 + }, + { + "epoch": 0.5652911249293386, + "grad_norm": 2.3536455408799988, + "learning_rate": 2.0262012159650624e-06, + "loss": 0.9947, + "step": 5375 + }, + { + "epoch": 0.565396295371186, + "grad_norm": 2.4699699581202252, + "learning_rate": 2.0253815523912955e-06, + "loss": 1.0202, + "step": 5376 + }, + { + "epoch": 0.5655014658130333, + "grad_norm": 2.7568128217452608, + "learning_rate": 2.024561941741173e-06, + "loss": 0.9816, + "step": 5377 + }, + { + "epoch": 0.5656066362548806, + "grad_norm": 2.5880748676931025, + "learning_rate": 2.0237423841060877e-06, + "loss": 0.9817, + "step": 5378 + }, + { + "epoch": 0.5657118066967279, + "grad_norm": 2.3807827153101138, + "learning_rate": 2.0229228795774264e-06, + "loss": 0.9829, + "step": 5379 + }, + { + "epoch": 0.5658169771385752, + "grad_norm": 2.419880425354416, + "learning_rate": 2.02210342824657e-06, + "loss": 0.9412, + "step": 5380 + }, + { + "epoch": 0.5659221475804225, + "grad_norm": 3.206944366362426, + "learning_rate": 2.021284030204893e-06, + "loss": 1.0397, + "step": 5381 + }, + { + "epoch": 0.5660273180222698, + "grad_norm": 2.48547978811849, + "learning_rate": 2.020464685543766e-06, + "loss": 0.971, + "step": 5382 + }, + { + "epoch": 0.5661324884641171, + "grad_norm": 2.6366008661640525, + "learning_rate": 2.0196453943545517e-06, + "loss": 1.0017, + "step": 5383 + }, + { + "epoch": 0.5662376589059644, + "grad_norm": 2.6842165173464867, + "learning_rate": 2.0188261567286076e-06, + "loss": 1.0313, + "step": 5384 + }, + { + "epoch": 0.5663428293478118, + "grad_norm": 2.6982167121358684, + "learning_rate": 2.018006972757285e-06, + "loss": 0.9802, + "step": 5385 + }, + { + "epoch": 0.5664479997896591, + "grad_norm": 2.4433736862921362, + "learning_rate": 2.0171878425319283e-06, + "loss": 0.962, + "step": 5386 + }, + { + "epoch": 0.5665531702315064, + "grad_norm": 2.475980829378924, + "learning_rate": 2.0163687661438786e-06, + "loss": 0.9586, + "step": 5387 + }, + { + "epoch": 0.5666583406733537, + "grad_norm": 2.6704965382072063, + "learning_rate": 2.0155497436844684e-06, + "loss": 1.0241, + "step": 5388 + }, + { + "epoch": 0.5667635111152011, + "grad_norm": 2.6472388985915325, + "learning_rate": 2.0147307752450253e-06, + "loss": 0.9816, + "step": 5389 + }, + { + "epoch": 0.5668686815570484, + "grad_norm": 2.2155377465887067, + "learning_rate": 2.0139118609168697e-06, + "loss": 0.996, + "step": 5390 + }, + { + "epoch": 0.5669738519988957, + "grad_norm": 2.6569851803737827, + "learning_rate": 2.0130930007913184e-06, + "loss": 1.003, + "step": 5391 + }, + { + "epoch": 0.567079022440743, + "grad_norm": 2.2619946690776236, + "learning_rate": 2.01227419495968e-06, + "loss": 1.0121, + "step": 5392 + }, + { + "epoch": 0.5671841928825904, + "grad_norm": 3.0146876977488497, + "learning_rate": 2.011455443513257e-06, + "loss": 
1.0017, + "step": 5393 + }, + { + "epoch": 0.5672893633244377, + "grad_norm": 2.7358969902861303, + "learning_rate": 2.010636746543348e-06, + "loss": 1.0223, + "step": 5394 + }, + { + "epoch": 0.567394533766285, + "grad_norm": 2.5446999109646975, + "learning_rate": 2.009818104141242e-06, + "loss": 0.9872, + "step": 5395 + }, + { + "epoch": 0.5674997042081323, + "grad_norm": 2.5508462818540094, + "learning_rate": 2.0089995163982263e-06, + "loss": 1.0033, + "step": 5396 + }, + { + "epoch": 0.5676048746499797, + "grad_norm": 1.9993969226847457, + "learning_rate": 2.0081809834055787e-06, + "loss": 0.9702, + "step": 5397 + }, + { + "epoch": 0.567710045091827, + "grad_norm": 2.836401177949353, + "learning_rate": 2.007362505254572e-06, + "loss": 0.9877, + "step": 5398 + }, + { + "epoch": 0.5678152155336743, + "grad_norm": 2.653491277012509, + "learning_rate": 2.0065440820364718e-06, + "loss": 0.9803, + "step": 5399 + }, + { + "epoch": 0.5679203859755215, + "grad_norm": 3.4470366242642854, + "learning_rate": 2.005725713842541e-06, + "loss": 1.0328, + "step": 5400 + }, + { + "epoch": 0.5680255564173688, + "grad_norm": 2.9653592066566667, + "learning_rate": 2.0049074007640324e-06, + "loss": 1.0015, + "step": 5401 + }, + { + "epoch": 0.5681307268592162, + "grad_norm": 1.4100311048508116, + "learning_rate": 2.004089142892195e-06, + "loss": 0.9846, + "step": 5402 + }, + { + "epoch": 0.5682358973010635, + "grad_norm": 2.9910996927005007, + "learning_rate": 2.0032709403182705e-06, + "loss": 1.0175, + "step": 5403 + }, + { + "epoch": 0.5683410677429108, + "grad_norm": 1.6159196986325306, + "learning_rate": 2.002452793133494e-06, + "loss": 0.958, + "step": 5404 + }, + { + "epoch": 0.5684462381847581, + "grad_norm": 2.343270925363438, + "learning_rate": 2.001634701429097e-06, + "loss": 1.0338, + "step": 5405 + }, + { + "epoch": 0.5685514086266055, + "grad_norm": 2.6339615242917187, + "learning_rate": 2.000816665296302e-06, + "loss": 0.9807, + "step": 5406 + }, + { + "epoch": 0.5686565790684528, + "grad_norm": 2.928072201278944, + "learning_rate": 1.999998684826327e-06, + "loss": 1.0176, + "step": 5407 + }, + { + "epoch": 0.5687617495103001, + "grad_norm": 2.4139743488107976, + "learning_rate": 1.9991807601103823e-06, + "loss": 0.984, + "step": 5408 + }, + { + "epoch": 0.5688669199521474, + "grad_norm": 2.171791427192439, + "learning_rate": 1.9983628912396726e-06, + "loss": 1.0245, + "step": 5409 + }, + { + "epoch": 0.5689720903939948, + "grad_norm": 2.804934183732406, + "learning_rate": 1.997545078305399e-06, + "loss": 1.0397, + "step": 5410 + }, + { + "epoch": 0.5690772608358421, + "grad_norm": 1.974490805904351, + "learning_rate": 1.9967273213987515e-06, + "loss": 0.9931, + "step": 5411 + }, + { + "epoch": 0.5691824312776894, + "grad_norm": 2.2394117401253917, + "learning_rate": 1.995909620610918e-06, + "loss": 1.0064, + "step": 5412 + }, + { + "epoch": 0.5692876017195367, + "grad_norm": 2.7031568108174393, + "learning_rate": 1.9950919760330757e-06, + "loss": 0.9608, + "step": 5413 + }, + { + "epoch": 0.5693927721613841, + "grad_norm": 1.7767471050410926, + "learning_rate": 1.9942743877564018e-06, + "loss": 0.9177, + "step": 5414 + }, + { + "epoch": 0.5694979426032314, + "grad_norm": 1.9741959110358989, + "learning_rate": 1.9934568558720623e-06, + "loss": 1.0155, + "step": 5415 + }, + { + "epoch": 0.5696031130450787, + "grad_norm": 2.120718009384387, + "learning_rate": 1.9926393804712183e-06, + "loss": 0.9933, + "step": 5416 + }, + { + "epoch": 0.569708283486926, + "grad_norm": 2.3893886698304554, + 
"learning_rate": 1.9918219616450246e-06, + "loss": 0.9487, + "step": 5417 + }, + { + "epoch": 0.5698134539287734, + "grad_norm": 2.1203012449126217, + "learning_rate": 1.9910045994846294e-06, + "loss": 1.0161, + "step": 5418 + }, + { + "epoch": 0.5699186243706207, + "grad_norm": 2.402604527740921, + "learning_rate": 1.990187294081176e-06, + "loss": 0.9735, + "step": 5419 + }, + { + "epoch": 0.570023794812468, + "grad_norm": 2.3672965950739213, + "learning_rate": 1.9893700455257996e-06, + "loss": 0.9659, + "step": 5420 + }, + { + "epoch": 0.5701289652543152, + "grad_norm": 2.210201260927024, + "learning_rate": 1.98855285390963e-06, + "loss": 0.9688, + "step": 5421 + }, + { + "epoch": 0.5702341356961625, + "grad_norm": 2.653764999915913, + "learning_rate": 1.987735719323789e-06, + "loss": 0.9776, + "step": 5422 + }, + { + "epoch": 0.5703393061380099, + "grad_norm": 2.5642420040701692, + "learning_rate": 1.986918641859396e-06, + "loss": 0.995, + "step": 5423 + }, + { + "epoch": 0.5704444765798572, + "grad_norm": 2.2961949153030443, + "learning_rate": 1.9861016216075596e-06, + "loss": 0.9892, + "step": 5424 + }, + { + "epoch": 0.5705496470217045, + "grad_norm": 2.767041920692721, + "learning_rate": 1.985284658659385e-06, + "loss": 1.0365, + "step": 5425 + }, + { + "epoch": 0.5706548174635518, + "grad_norm": 1.648569046551329, + "learning_rate": 1.9844677531059698e-06, + "loss": 0.9728, + "step": 5426 + }, + { + "epoch": 0.5707599879053992, + "grad_norm": 2.6969141927182005, + "learning_rate": 1.9836509050384035e-06, + "loss": 0.9942, + "step": 5427 + }, + { + "epoch": 0.5708651583472465, + "grad_norm": 2.2808653469120084, + "learning_rate": 1.982834114547773e-06, + "loss": 0.9663, + "step": 5428 + }, + { + "epoch": 0.5709703287890938, + "grad_norm": 2.167902550863519, + "learning_rate": 1.982017381725157e-06, + "loss": 0.992, + "step": 5429 + }, + { + "epoch": 0.5710754992309411, + "grad_norm": 2.3499387994244323, + "learning_rate": 1.981200706661626e-06, + "loss": 0.9837, + "step": 5430 + }, + { + "epoch": 0.5711806696727885, + "grad_norm": 2.182644078032054, + "learning_rate": 1.9803840894482468e-06, + "loss": 0.9578, + "step": 5431 + }, + { + "epoch": 0.5712858401146358, + "grad_norm": 3.3676977139942434, + "learning_rate": 1.9795675301760776e-06, + "loss": 0.9728, + "step": 5432 + }, + { + "epoch": 0.5713910105564831, + "grad_norm": 2.353096165552454, + "learning_rate": 1.978751028936172e-06, + "loss": 0.9739, + "step": 5433 + }, + { + "epoch": 0.5714961809983304, + "grad_norm": 2.5627941445914373, + "learning_rate": 1.9779345858195757e-06, + "loss": 0.9801, + "step": 5434 + }, + { + "epoch": 0.5716013514401778, + "grad_norm": 2.8224949763649367, + "learning_rate": 1.977118200917329e-06, + "loss": 1.0196, + "step": 5435 + }, + { + "epoch": 0.5717065218820251, + "grad_norm": 2.41926342232906, + "learning_rate": 1.9763018743204637e-06, + "loss": 0.9623, + "step": 5436 + }, + { + "epoch": 0.5718116923238724, + "grad_norm": 2.5803768185373457, + "learning_rate": 1.9754856061200085e-06, + "loss": 1.0174, + "step": 5437 + }, + { + "epoch": 0.5719168627657197, + "grad_norm": 2.6612045523496857, + "learning_rate": 1.974669396406983e-06, + "loss": 0.9732, + "step": 5438 + }, + { + "epoch": 0.5720220332075671, + "grad_norm": 1.9777273507957978, + "learning_rate": 1.9738532452724007e-06, + "loss": 0.9457, + "step": 5439 + }, + { + "epoch": 0.5721272036494144, + "grad_norm": 3.0778735684461056, + "learning_rate": 1.9730371528072687e-06, + "loss": 1.0039, + "step": 5440 + }, + { + "epoch": 
0.5722323740912616, + "grad_norm": 2.7734640366863537, + "learning_rate": 1.972221119102587e-06, + "loss": 1.0047, + "step": 5441 + }, + { + "epoch": 0.5723375445331089, + "grad_norm": 2.360250328220414, + "learning_rate": 1.9714051442493516e-06, + "loss": 0.9788, + "step": 5442 + }, + { + "epoch": 0.5724427149749562, + "grad_norm": 2.027404275154293, + "learning_rate": 1.970589228338548e-06, + "loss": 0.9784, + "step": 5443 + }, + { + "epoch": 0.5725478854168036, + "grad_norm": 2.3164867712708825, + "learning_rate": 1.969773371461159e-06, + "loss": 0.9658, + "step": 5444 + }, + { + "epoch": 0.5726530558586509, + "grad_norm": 1.9144239908740546, + "learning_rate": 1.9689575737081567e-06, + "loss": 0.9609, + "step": 5445 + }, + { + "epoch": 0.5727582263004982, + "grad_norm": 2.938174043030208, + "learning_rate": 1.9681418351705116e-06, + "loss": 1.0395, + "step": 5446 + }, + { + "epoch": 0.5728633967423455, + "grad_norm": 2.1982822359165013, + "learning_rate": 1.967326155939183e-06, + "loss": 0.9507, + "step": 5447 + }, + { + "epoch": 0.5729685671841929, + "grad_norm": 2.795386799438768, + "learning_rate": 1.9665105361051255e-06, + "loss": 0.9534, + "step": 5448 + }, + { + "epoch": 0.5730737376260402, + "grad_norm": 2.748854629184378, + "learning_rate": 1.965694975759288e-06, + "loss": 0.99, + "step": 5449 + }, + { + "epoch": 0.5731789080678875, + "grad_norm": 2.829975197200465, + "learning_rate": 1.9648794749926103e-06, + "loss": 1.0186, + "step": 5450 + }, + { + "epoch": 0.5732840785097348, + "grad_norm": 2.7529320683500447, + "learning_rate": 1.9640640338960294e-06, + "loss": 0.9807, + "step": 5451 + }, + { + "epoch": 0.5733892489515822, + "grad_norm": 2.558464051548766, + "learning_rate": 1.9632486525604715e-06, + "loss": 0.9763, + "step": 5452 + }, + { + "epoch": 0.5734944193934295, + "grad_norm": 2.2378183775994893, + "learning_rate": 1.962433331076859e-06, + "loss": 1.0049, + "step": 5453 + }, + { + "epoch": 0.5735995898352768, + "grad_norm": 1.9794409306196663, + "learning_rate": 1.961618069536105e-06, + "loss": 0.9516, + "step": 5454 + }, + { + "epoch": 0.5737047602771241, + "grad_norm": 2.5990704751367244, + "learning_rate": 1.960802868029119e-06, + "loss": 0.9981, + "step": 5455 + }, + { + "epoch": 0.5738099307189715, + "grad_norm": 2.111642340238224, + "learning_rate": 1.9599877266468024e-06, + "loss": 0.9729, + "step": 5456 + }, + { + "epoch": 0.5739151011608188, + "grad_norm": 2.337460712610555, + "learning_rate": 1.959172645480049e-06, + "loss": 0.9886, + "step": 5457 + }, + { + "epoch": 0.5740202716026661, + "grad_norm": 3.42592059314716, + "learning_rate": 1.958357624619747e-06, + "loss": 0.9792, + "step": 5458 + }, + { + "epoch": 0.5741254420445134, + "grad_norm": 3.04481036795497, + "learning_rate": 1.957542664156776e-06, + "loss": 1.0186, + "step": 5459 + }, + { + "epoch": 0.5742306124863608, + "grad_norm": 3.566327107470445, + "learning_rate": 1.9567277641820136e-06, + "loss": 1.0102, + "step": 5460 + }, + { + "epoch": 0.574335782928208, + "grad_norm": 2.262412810337984, + "learning_rate": 1.9559129247863253e-06, + "loss": 0.9987, + "step": 5461 + }, + { + "epoch": 0.5744409533700553, + "grad_norm": 2.230615471687081, + "learning_rate": 1.9550981460605734e-06, + "loss": 0.9789, + "step": 5462 + }, + { + "epoch": 0.5745461238119026, + "grad_norm": 2.3226218833907217, + "learning_rate": 1.9542834280956102e-06, + "loss": 0.9984, + "step": 5463 + }, + { + "epoch": 0.57465129425375, + "grad_norm": 3.038633583612286, + "learning_rate": 1.9534687709822834e-06, + "loss": 
1.0057, + "step": 5464 + }, + { + "epoch": 0.5747564646955973, + "grad_norm": 2.271266550812082, + "learning_rate": 1.952654174811435e-06, + "loss": 0.9788, + "step": 5465 + }, + { + "epoch": 0.5748616351374446, + "grad_norm": 1.7249233590279198, + "learning_rate": 1.951839639673898e-06, + "loss": 1.0057, + "step": 5466 + }, + { + "epoch": 0.5749668055792919, + "grad_norm": 2.7803541788635417, + "learning_rate": 1.9510251656605e-06, + "loss": 1.0254, + "step": 5467 + }, + { + "epoch": 0.5750719760211392, + "grad_norm": 1.8011230248841785, + "learning_rate": 1.9502107528620593e-06, + "loss": 0.9893, + "step": 5468 + }, + { + "epoch": 0.5751771464629866, + "grad_norm": 2.82475143858172, + "learning_rate": 1.949396401369392e-06, + "loss": 0.9615, + "step": 5469 + }, + { + "epoch": 0.5752823169048339, + "grad_norm": 2.6365865937523014, + "learning_rate": 1.9485821112733023e-06, + "loss": 0.9794, + "step": 5470 + }, + { + "epoch": 0.5753874873466812, + "grad_norm": 2.8901960806978737, + "learning_rate": 1.947767882664591e-06, + "loss": 1.0382, + "step": 5471 + }, + { + "epoch": 0.5754926577885285, + "grad_norm": 1.6028594756674903, + "learning_rate": 1.946953715634051e-06, + "loss": 1.0084, + "step": 5472 + }, + { + "epoch": 0.5755978282303759, + "grad_norm": 2.680056010624629, + "learning_rate": 1.9461396102724666e-06, + "loss": 0.9539, + "step": 5473 + }, + { + "epoch": 0.5757029986722232, + "grad_norm": 1.9522822778958524, + "learning_rate": 1.9453255666706193e-06, + "loss": 0.9717, + "step": 5474 + }, + { + "epoch": 0.5758081691140705, + "grad_norm": 1.8400125921360837, + "learning_rate": 1.94451158491928e-06, + "loss": 1.0238, + "step": 5475 + }, + { + "epoch": 0.5759133395559178, + "grad_norm": 2.9729085425856856, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.9872, + "step": 5476 + }, + { + "epoch": 0.5760185099977652, + "grad_norm": 3.0463506648492644, + "learning_rate": 1.94288380733118e-06, + "loss": 1.0292, + "step": 5477 + }, + { + "epoch": 0.5761236804396125, + "grad_norm": 3.312807191743345, + "learning_rate": 1.9420700116759295e-06, + "loss": 0.9883, + "step": 5478 + }, + { + "epoch": 0.5762288508814598, + "grad_norm": 2.193873965057158, + "learning_rate": 1.9412562782342067e-06, + "loss": 0.9968, + "step": 5479 + }, + { + "epoch": 0.5763340213233071, + "grad_norm": 2.4838716286811677, + "learning_rate": 1.9404426070967495e-06, + "loss": 0.9702, + "step": 5480 + }, + { + "epoch": 0.5764391917651545, + "grad_norm": 3.7749376463759794, + "learning_rate": 1.9396289983542884e-06, + "loss": 1.0172, + "step": 5481 + }, + { + "epoch": 0.5765443622070017, + "grad_norm": 2.5023157843427355, + "learning_rate": 1.9388154520975465e-06, + "loss": 1.0102, + "step": 5482 + }, + { + "epoch": 0.576649532648849, + "grad_norm": 2.7098943623571663, + "learning_rate": 1.938001968417242e-06, + "loss": 1.0124, + "step": 5483 + }, + { + "epoch": 0.5767547030906963, + "grad_norm": 2.548253174199779, + "learning_rate": 1.9371885474040838e-06, + "loss": 1.0292, + "step": 5484 + }, + { + "epoch": 0.5768598735325436, + "grad_norm": 2.5033277887085417, + "learning_rate": 1.936375189148774e-06, + "loss": 0.9901, + "step": 5485 + }, + { + "epoch": 0.576965043974391, + "grad_norm": 2.114247029054567, + "learning_rate": 1.9355618937420092e-06, + "loss": 0.9394, + "step": 5486 + }, + { + "epoch": 0.5770702144162383, + "grad_norm": 1.8131362680388996, + "learning_rate": 1.934748661274477e-06, + "loss": 0.957, + "step": 5487 + }, + { + "epoch": 0.5771753848580856, + "grad_norm": 2.203134930907001, + 
"learning_rate": 1.9339354918368613e-06, + "loss": 0.9839, + "step": 5488 + }, + { + "epoch": 0.5772805552999329, + "grad_norm": 1.8390231714993754, + "learning_rate": 1.9331223855198355e-06, + "loss": 0.9737, + "step": 5489 + }, + { + "epoch": 0.5773857257417803, + "grad_norm": 2.67902585867112, + "learning_rate": 1.9323093424140673e-06, + "loss": 0.9992, + "step": 5490 + }, + { + "epoch": 0.5774908961836276, + "grad_norm": 1.7899460341786475, + "learning_rate": 1.931496362610217e-06, + "loss": 1.0174, + "step": 5491 + }, + { + "epoch": 0.5775960666254749, + "grad_norm": 2.056265293390317, + "learning_rate": 1.930683446198939e-06, + "loss": 1.0018, + "step": 5492 + }, + { + "epoch": 0.5777012370673222, + "grad_norm": 2.1358816380940886, + "learning_rate": 1.9298705932708793e-06, + "loss": 0.9516, + "step": 5493 + }, + { + "epoch": 0.5778064075091696, + "grad_norm": 1.7026983219962548, + "learning_rate": 1.9290578039166775e-06, + "loss": 1.0184, + "step": 5494 + }, + { + "epoch": 0.5779115779510169, + "grad_norm": 2.710604777509262, + "learning_rate": 1.9282450782269657e-06, + "loss": 0.9627, + "step": 5495 + }, + { + "epoch": 0.5780167483928642, + "grad_norm": 2.4275052932179864, + "learning_rate": 1.9274324162923685e-06, + "loss": 0.981, + "step": 5496 + }, + { + "epoch": 0.5781219188347115, + "grad_norm": 2.639347852054867, + "learning_rate": 1.926619818203506e-06, + "loss": 1.0225, + "step": 5497 + }, + { + "epoch": 0.5782270892765589, + "grad_norm": 3.013422310390402, + "learning_rate": 1.925807284050987e-06, + "loss": 1.0114, + "step": 5498 + }, + { + "epoch": 0.5783322597184062, + "grad_norm": 2.5584374470603453, + "learning_rate": 1.9249948139254173e-06, + "loss": 0.9257, + "step": 5499 + }, + { + "epoch": 0.5784374301602535, + "grad_norm": 2.6663867612637335, + "learning_rate": 1.924182407917391e-06, + "loss": 0.9599, + "step": 5500 + }, + { + "epoch": 0.5785426006021008, + "grad_norm": 3.6833651707957387, + "learning_rate": 1.9233700661175e-06, + "loss": 0.9995, + "step": 5501 + }, + { + "epoch": 0.578647771043948, + "grad_norm": 1.9915527610671544, + "learning_rate": 1.922557788616327e-06, + "loss": 1.0072, + "step": 5502 + }, + { + "epoch": 0.5787529414857954, + "grad_norm": 2.3509129428046993, + "learning_rate": 1.921745575504446e-06, + "loss": 0.9915, + "step": 5503 + }, + { + "epoch": 0.5788581119276427, + "grad_norm": 2.4528194678305097, + "learning_rate": 1.920933426872425e-06, + "loss": 0.9599, + "step": 5504 + }, + { + "epoch": 0.57896328236949, + "grad_norm": 2.6177871712631915, + "learning_rate": 1.9201213428108246e-06, + "loss": 0.9898, + "step": 5505 + }, + { + "epoch": 0.5790684528113373, + "grad_norm": 2.5053984489921155, + "learning_rate": 1.9193093234102e-06, + "loss": 1.0196, + "step": 5506 + }, + { + "epoch": 0.5791736232531847, + "grad_norm": 2.498558815345497, + "learning_rate": 1.9184973687610965e-06, + "loss": 0.9645, + "step": 5507 + }, + { + "epoch": 0.579278793695032, + "grad_norm": 3.05754856518924, + "learning_rate": 1.917685478954054e-06, + "loss": 1.0264, + "step": 5508 + }, + { + "epoch": 0.5793839641368793, + "grad_norm": 2.2653910526244214, + "learning_rate": 1.9168736540796043e-06, + "loss": 0.9704, + "step": 5509 + }, + { + "epoch": 0.5794891345787266, + "grad_norm": 2.48443214283071, + "learning_rate": 1.9160618942282705e-06, + "loss": 1.0255, + "step": 5510 + }, + { + "epoch": 0.579594305020574, + "grad_norm": 3.004247233784531, + "learning_rate": 1.915250199490573e-06, + "loss": 1.0294, + "step": 5511 + }, + { + "epoch": 0.5796994754624213, + 
"grad_norm": 2.4418258201583103, + "learning_rate": 1.914438569957021e-06, + "loss": 1.0082, + "step": 5512 + }, + { + "epoch": 0.5798046459042686, + "grad_norm": 2.4279113250200823, + "learning_rate": 1.9136270057181173e-06, + "loss": 0.951, + "step": 5513 + }, + { + "epoch": 0.5799098163461159, + "grad_norm": 2.2619878885329263, + "learning_rate": 1.9128155068643563e-06, + "loss": 1.0286, + "step": 5514 + }, + { + "epoch": 0.5800149867879633, + "grad_norm": 3.0238638965690576, + "learning_rate": 1.912004073486229e-06, + "loss": 0.998, + "step": 5515 + }, + { + "epoch": 0.5801201572298106, + "grad_norm": 2.392433746833724, + "learning_rate": 1.9111927056742146e-06, + "loss": 0.971, + "step": 5516 + }, + { + "epoch": 0.5802253276716579, + "grad_norm": 2.2813223909499403, + "learning_rate": 1.9103814035187877e-06, + "loss": 1.0173, + "step": 5517 + }, + { + "epoch": 0.5803304981135052, + "grad_norm": 2.7032063827189092, + "learning_rate": 1.909570167110415e-06, + "loss": 1.0182, + "step": 5518 + }, + { + "epoch": 0.5804356685553526, + "grad_norm": 2.30490410235801, + "learning_rate": 1.9087589965395547e-06, + "loss": 0.9975, + "step": 5519 + }, + { + "epoch": 0.5805408389971999, + "grad_norm": 2.769410328624913, + "learning_rate": 1.9079478918966595e-06, + "loss": 0.9609, + "step": 5520 + }, + { + "epoch": 0.5806460094390472, + "grad_norm": 2.3183308581320694, + "learning_rate": 1.9071368532721734e-06, + "loss": 1.0247, + "step": 5521 + }, + { + "epoch": 0.5807511798808944, + "grad_norm": 3.0996340248511767, + "learning_rate": 1.9063258807565338e-06, + "loss": 0.9765, + "step": 5522 + }, + { + "epoch": 0.5808563503227417, + "grad_norm": 2.6197973686870553, + "learning_rate": 1.9055149744401693e-06, + "loss": 0.9358, + "step": 5523 + }, + { + "epoch": 0.5809615207645891, + "grad_norm": 2.241372830499171, + "learning_rate": 1.9047041344135045e-06, + "loss": 0.9821, + "step": 5524 + }, + { + "epoch": 0.5810666912064364, + "grad_norm": 2.9668398883596905, + "learning_rate": 1.903893360766953e-06, + "loss": 1.0332, + "step": 5525 + }, + { + "epoch": 0.5811718616482837, + "grad_norm": 2.2000463719499064, + "learning_rate": 1.9030826535909225e-06, + "loss": 0.9604, + "step": 5526 + }, + { + "epoch": 0.581277032090131, + "grad_norm": 2.212791966762905, + "learning_rate": 1.9022720129758132e-06, + "loss": 0.9615, + "step": 5527 + }, + { + "epoch": 0.5813822025319784, + "grad_norm": 2.2976157836326805, + "learning_rate": 1.9014614390120174e-06, + "loss": 0.9992, + "step": 5528 + }, + { + "epoch": 0.5814873729738257, + "grad_norm": 3.038579357250074, + "learning_rate": 1.9006509317899207e-06, + "loss": 1.0264, + "step": 5529 + }, + { + "epoch": 0.581592543415673, + "grad_norm": 2.8863407093044735, + "learning_rate": 1.8998404913999016e-06, + "loss": 0.9813, + "step": 5530 + }, + { + "epoch": 0.5816977138575203, + "grad_norm": 2.8846661345060953, + "learning_rate": 1.8990301179323298e-06, + "loss": 0.9626, + "step": 5531 + }, + { + "epoch": 0.5818028842993677, + "grad_norm": 2.4582156857408557, + "learning_rate": 1.8982198114775683e-06, + "loss": 0.9984, + "step": 5532 + }, + { + "epoch": 0.581908054741215, + "grad_norm": 2.178455937450243, + "learning_rate": 1.8974095721259718e-06, + "loss": 0.9677, + "step": 5533 + }, + { + "epoch": 0.5820132251830623, + "grad_norm": 2.4539058224075823, + "learning_rate": 1.89659939996789e-06, + "loss": 0.947, + "step": 5534 + }, + { + "epoch": 0.5821183956249096, + "grad_norm": 2.9776623960749364, + "learning_rate": 1.8957892950936623e-06, + "loss": 1.0148, + 
"step": 5535 + }, + { + "epoch": 0.582223566066757, + "grad_norm": 3.114821126421381, + "learning_rate": 1.8949792575936222e-06, + "loss": 0.9682, + "step": 5536 + }, + { + "epoch": 0.5823287365086043, + "grad_norm": 1.7570932709447848, + "learning_rate": 1.8941692875580934e-06, + "loss": 0.9917, + "step": 5537 + }, + { + "epoch": 0.5824339069504516, + "grad_norm": 2.612838069227518, + "learning_rate": 1.8933593850773963e-06, + "loss": 1.0041, + "step": 5538 + }, + { + "epoch": 0.5825390773922989, + "grad_norm": 2.0944013087449957, + "learning_rate": 1.8925495502418407e-06, + "loss": 0.9842, + "step": 5539 + }, + { + "epoch": 0.5826442478341463, + "grad_norm": 1.8402090587110502, + "learning_rate": 1.8917397831417285e-06, + "loss": 0.9741, + "step": 5540 + }, + { + "epoch": 0.5827494182759936, + "grad_norm": 2.3971865882038634, + "learning_rate": 1.8909300838673562e-06, + "loss": 0.9897, + "step": 5541 + }, + { + "epoch": 0.5828545887178409, + "grad_norm": 2.7386768450510988, + "learning_rate": 1.89012045250901e-06, + "loss": 1.0536, + "step": 5542 + }, + { + "epoch": 0.5829597591596881, + "grad_norm": 2.4719585825244157, + "learning_rate": 1.889310889156972e-06, + "loss": 0.9932, + "step": 5543 + }, + { + "epoch": 0.5830649296015354, + "grad_norm": 1.9873200200685415, + "learning_rate": 1.8885013939015133e-06, + "loss": 1.0105, + "step": 5544 + }, + { + "epoch": 0.5831701000433828, + "grad_norm": 2.0027592555824563, + "learning_rate": 1.8876919668329002e-06, + "loss": 0.9932, + "step": 5545 + }, + { + "epoch": 0.5832752704852301, + "grad_norm": 2.9904414990454433, + "learning_rate": 1.8868826080413876e-06, + "loss": 1.0046, + "step": 5546 + }, + { + "epoch": 0.5833804409270774, + "grad_norm": 2.7824243964175452, + "learning_rate": 1.8860733176172286e-06, + "loss": 1.0286, + "step": 5547 + }, + { + "epoch": 0.5834856113689247, + "grad_norm": 2.1679565097639917, + "learning_rate": 1.8852640956506643e-06, + "loss": 0.9693, + "step": 5548 + }, + { + "epoch": 0.5835907818107721, + "grad_norm": 2.0987314485767423, + "learning_rate": 1.884454942231928e-06, + "loss": 0.9306, + "step": 5549 + }, + { + "epoch": 0.5836959522526194, + "grad_norm": 2.74265547993407, + "learning_rate": 1.8836458574512478e-06, + "loss": 0.9962, + "step": 5550 + }, + { + "epoch": 0.5838011226944667, + "grad_norm": 1.8785431998355895, + "learning_rate": 1.882836841398841e-06, + "loss": 0.9127, + "step": 5551 + }, + { + "epoch": 0.583906293136314, + "grad_norm": 2.4599246039003364, + "learning_rate": 1.882027894164922e-06, + "loss": 0.9913, + "step": 5552 + }, + { + "epoch": 0.5840114635781614, + "grad_norm": 2.296968019204936, + "learning_rate": 1.8812190158396931e-06, + "loss": 0.9596, + "step": 5553 + }, + { + "epoch": 0.5841166340200087, + "grad_norm": 2.65439428097566, + "learning_rate": 1.880410206513351e-06, + "loss": 1.025, + "step": 5554 + }, + { + "epoch": 0.584221804461856, + "grad_norm": 2.1109947099788995, + "learning_rate": 1.8796014662760842e-06, + "loss": 1.0029, + "step": 5555 + }, + { + "epoch": 0.5843269749037033, + "grad_norm": 2.7664051470715405, + "learning_rate": 1.8787927952180724e-06, + "loss": 0.971, + "step": 5556 + }, + { + "epoch": 0.5844321453455507, + "grad_norm": 2.202625098363754, + "learning_rate": 1.8779841934294904e-06, + "loss": 0.9731, + "step": 5557 + }, + { + "epoch": 0.584537315787398, + "grad_norm": 2.2262523402556313, + "learning_rate": 1.8771756610005028e-06, + "loss": 0.9645, + "step": 5558 + }, + { + "epoch": 0.5846424862292453, + "grad_norm": 2.203570080279773, + 
"learning_rate": 1.8763671980212673e-06, + "loss": 0.9762, + "step": 5559 + }, + { + "epoch": 0.5847476566710926, + "grad_norm": 3.406979001707062, + "learning_rate": 1.8755588045819325e-06, + "loss": 0.988, + "step": 5560 + }, + { + "epoch": 0.58485282711294, + "grad_norm": 2.3316323807570685, + "learning_rate": 1.8747504807726433e-06, + "loss": 0.9723, + "step": 5561 + }, + { + "epoch": 0.5849579975547873, + "grad_norm": 2.5275255811504995, + "learning_rate": 1.8739422266835321e-06, + "loss": 1.0129, + "step": 5562 + }, + { + "epoch": 0.5850631679966345, + "grad_norm": 2.700141617045467, + "learning_rate": 1.8731340424047265e-06, + "loss": 0.9889, + "step": 5563 + }, + { + "epoch": 0.5851683384384818, + "grad_norm": 2.367236088955419, + "learning_rate": 1.8723259280263451e-06, + "loss": 0.9935, + "step": 5564 + }, + { + "epoch": 0.5852735088803291, + "grad_norm": 2.0726305566915997, + "learning_rate": 1.871517883638497e-06, + "loss": 0.9489, + "step": 5565 + }, + { + "epoch": 0.5853786793221765, + "grad_norm": 2.140334180895899, + "learning_rate": 1.8707099093312883e-06, + "loss": 0.9789, + "step": 5566 + }, + { + "epoch": 0.5854838497640238, + "grad_norm": 2.3801510907541035, + "learning_rate": 1.869902005194813e-06, + "loss": 0.9661, + "step": 5567 + }, + { + "epoch": 0.5855890202058711, + "grad_norm": 1.94348395585535, + "learning_rate": 1.869094171319159e-06, + "loss": 0.9675, + "step": 5568 + }, + { + "epoch": 0.5856941906477184, + "grad_norm": 1.8197908570293184, + "learning_rate": 1.8682864077944058e-06, + "loss": 0.9888, + "step": 5569 + }, + { + "epoch": 0.5857993610895658, + "grad_norm": 2.3503761909453478, + "learning_rate": 1.8674787147106255e-06, + "loss": 0.9775, + "step": 5570 + }, + { + "epoch": 0.5859045315314131, + "grad_norm": 2.424973046847033, + "learning_rate": 1.8666710921578823e-06, + "loss": 1.0109, + "step": 5571 + }, + { + "epoch": 0.5860097019732604, + "grad_norm": 3.1180462687524346, + "learning_rate": 1.865863540226232e-06, + "loss": 1.001, + "step": 5572 + }, + { + "epoch": 0.5861148724151077, + "grad_norm": 2.5532427885272946, + "learning_rate": 1.8650560590057235e-06, + "loss": 1.0227, + "step": 5573 + }, + { + "epoch": 0.5862200428569551, + "grad_norm": 2.2339931967688704, + "learning_rate": 1.8642486485863954e-06, + "loss": 1.0216, + "step": 5574 + }, + { + "epoch": 0.5863252132988024, + "grad_norm": 1.9277968092888864, + "learning_rate": 1.8634413090582831e-06, + "loss": 0.98, + "step": 5575 + }, + { + "epoch": 0.5864303837406497, + "grad_norm": 2.7536242032196583, + "learning_rate": 1.8626340405114097e-06, + "loss": 0.9137, + "step": 5576 + }, + { + "epoch": 0.586535554182497, + "grad_norm": 2.4161018794939157, + "learning_rate": 1.8618268430357916e-06, + "loss": 1.0122, + "step": 5577 + }, + { + "epoch": 0.5866407246243444, + "grad_norm": 2.243061062715193, + "learning_rate": 1.8610197167214383e-06, + "loss": 0.9863, + "step": 5578 + }, + { + "epoch": 0.5867458950661917, + "grad_norm": 2.371022287772955, + "learning_rate": 1.8602126616583499e-06, + "loss": 0.9608, + "step": 5579 + }, + { + "epoch": 0.586851065508039, + "grad_norm": 1.9392011011181172, + "learning_rate": 1.8594056779365202e-06, + "loss": 0.9605, + "step": 5580 + }, + { + "epoch": 0.5869562359498863, + "grad_norm": 2.688618985702904, + "learning_rate": 1.8585987656459337e-06, + "loss": 0.9798, + "step": 5581 + }, + { + "epoch": 0.5870614063917337, + "grad_norm": 2.0172735352124884, + "learning_rate": 1.8577919248765675e-06, + "loss": 0.9679, + "step": 5582 + }, + { + "epoch": 
0.5871665768335809, + "grad_norm": 2.4283093844913197, + "learning_rate": 1.8569851557183894e-06, + "loss": 1.0153, + "step": 5583 + }, + { + "epoch": 0.5872717472754282, + "grad_norm": 2.4357436603887823, + "learning_rate": 1.8561784582613631e-06, + "loss": 0.9547, + "step": 5584 + }, + { + "epoch": 0.5873769177172755, + "grad_norm": 1.6496752603527884, + "learning_rate": 1.8553718325954395e-06, + "loss": 1.0061, + "step": 5585 + }, + { + "epoch": 0.5874820881591228, + "grad_norm": 2.3550989815363454, + "learning_rate": 1.8545652788105644e-06, + "loss": 1.0181, + "step": 5586 + }, + { + "epoch": 0.5875872586009702, + "grad_norm": 2.4247760719245592, + "learning_rate": 1.8537587969966746e-06, + "loss": 1.0417, + "step": 5587 + }, + { + "epoch": 0.5876924290428175, + "grad_norm": 2.7483529202080748, + "learning_rate": 1.852952387243698e-06, + "loss": 0.9984, + "step": 5588 + }, + { + "epoch": 0.5877975994846648, + "grad_norm": 2.3988717114809655, + "learning_rate": 1.8521460496415577e-06, + "loss": 1.021, + "step": 5589 + }, + { + "epoch": 0.5879027699265121, + "grad_norm": 3.4921295132942594, + "learning_rate": 1.8513397842801655e-06, + "loss": 0.9935, + "step": 5590 + }, + { + "epoch": 0.5880079403683595, + "grad_norm": 2.9336016166705066, + "learning_rate": 1.8505335912494265e-06, + "loss": 0.9968, + "step": 5591 + }, + { + "epoch": 0.5881131108102068, + "grad_norm": 2.8340234509298563, + "learning_rate": 1.8497274706392369e-06, + "loss": 1.0128, + "step": 5592 + }, + { + "epoch": 0.5882182812520541, + "grad_norm": 3.1961181107327, + "learning_rate": 1.848921422539486e-06, + "loss": 1.022, + "step": 5593 + }, + { + "epoch": 0.5883234516939014, + "grad_norm": 1.9909100410145897, + "learning_rate": 1.8481154470400545e-06, + "loss": 0.9937, + "step": 5594 + }, + { + "epoch": 0.5884286221357488, + "grad_norm": 3.066382990655515, + "learning_rate": 1.8473095442308145e-06, + "loss": 0.9469, + "step": 5595 + }, + { + "epoch": 0.5885337925775961, + "grad_norm": 2.6179963095077716, + "learning_rate": 1.8465037142016306e-06, + "loss": 0.9608, + "step": 5596 + }, + { + "epoch": 0.5886389630194434, + "grad_norm": 2.3037057873169524, + "learning_rate": 1.8456979570423583e-06, + "loss": 1.0054, + "step": 5597 + }, + { + "epoch": 0.5887441334612907, + "grad_norm": 2.2460590369251268, + "learning_rate": 1.8448922728428474e-06, + "loss": 0.9827, + "step": 5598 + }, + { + "epoch": 0.588849303903138, + "grad_norm": 2.7120869322911623, + "learning_rate": 1.844086661692937e-06, + "loss": 0.9871, + "step": 5599 + }, + { + "epoch": 0.5889544743449854, + "grad_norm": 3.41715465139414, + "learning_rate": 1.84328112368246e-06, + "loss": 0.9804, + "step": 5600 + }, + { + "epoch": 0.5890596447868327, + "grad_norm": 3.3908096687842604, + "learning_rate": 1.8424756589012385e-06, + "loss": 1.0025, + "step": 5601 + }, + { + "epoch": 0.58916481522868, + "grad_norm": 2.0383431427479506, + "learning_rate": 1.841670267439088e-06, + "loss": 0.9688, + "step": 5602 + }, + { + "epoch": 0.5892699856705274, + "grad_norm": 2.6775576431658092, + "learning_rate": 1.8408649493858176e-06, + "loss": 0.966, + "step": 5603 + }, + { + "epoch": 0.5893751561123746, + "grad_norm": 2.571976853064117, + "learning_rate": 1.8400597048312257e-06, + "loss": 1.0121, + "step": 5604 + }, + { + "epoch": 0.5894803265542219, + "grad_norm": 2.4059753404598236, + "learning_rate": 1.8392545338651036e-06, + "loss": 0.9888, + "step": 5605 + }, + { + "epoch": 0.5895854969960692, + "grad_norm": 2.069543872655015, + "learning_rate": 1.8384494365772333e-06, + 
"loss": 0.9747, + "step": 5606 + }, + { + "epoch": 0.5896906674379165, + "grad_norm": 2.30354396158409, + "learning_rate": 1.8376444130573901e-06, + "loss": 0.9904, + "step": 5607 + }, + { + "epoch": 0.5897958378797639, + "grad_norm": 2.2282342710722456, + "learning_rate": 1.8368394633953402e-06, + "loss": 0.9951, + "step": 5608 + }, + { + "epoch": 0.5899010083216112, + "grad_norm": 2.523786425607499, + "learning_rate": 1.8360345876808422e-06, + "loss": 1.0192, + "step": 5609 + }, + { + "epoch": 0.5900061787634585, + "grad_norm": 2.435738401461322, + "learning_rate": 1.835229786003645e-06, + "loss": 0.9807, + "step": 5610 + }, + { + "epoch": 0.5901113492053058, + "grad_norm": 2.4101222248980094, + "learning_rate": 1.83442505845349e-06, + "loss": 0.9915, + "step": 5611 + }, + { + "epoch": 0.5902165196471532, + "grad_norm": 1.8954669177419814, + "learning_rate": 1.8336204051201124e-06, + "loss": 0.9815, + "step": 5612 + }, + { + "epoch": 0.5903216900890005, + "grad_norm": 2.367357060810497, + "learning_rate": 1.832815826093236e-06, + "loss": 1.0251, + "step": 5613 + }, + { + "epoch": 0.5904268605308478, + "grad_norm": 2.4881726831979085, + "learning_rate": 1.8320113214625783e-06, + "loss": 0.9472, + "step": 5614 + }, + { + "epoch": 0.5905320309726951, + "grad_norm": 2.7845683502036715, + "learning_rate": 1.8312068913178466e-06, + "loss": 0.9873, + "step": 5615 + }, + { + "epoch": 0.5906372014145425, + "grad_norm": 1.968736042696628, + "learning_rate": 1.8304025357487427e-06, + "loss": 0.9252, + "step": 5616 + }, + { + "epoch": 0.5907423718563898, + "grad_norm": 2.8097759990413094, + "learning_rate": 1.829598254844957e-06, + "loss": 0.9349, + "step": 5617 + }, + { + "epoch": 0.5908475422982371, + "grad_norm": 2.375269564916978, + "learning_rate": 1.8287940486961744e-06, + "loss": 1.0258, + "step": 5618 + }, + { + "epoch": 0.5909527127400844, + "grad_norm": 2.3758359793157484, + "learning_rate": 1.8279899173920692e-06, + "loss": 0.9658, + "step": 5619 + }, + { + "epoch": 0.5910578831819318, + "grad_norm": 2.9877292184232345, + "learning_rate": 1.827185861022308e-06, + "loss": 1.0124, + "step": 5620 + }, + { + "epoch": 0.5911630536237791, + "grad_norm": 1.9064000817314557, + "learning_rate": 1.8263818796765506e-06, + "loss": 0.9492, + "step": 5621 + }, + { + "epoch": 0.5912682240656264, + "grad_norm": 2.4552260061481768, + "learning_rate": 1.8255779734444462e-06, + "loss": 0.9983, + "step": 5622 + }, + { + "epoch": 0.5913733945074737, + "grad_norm": 2.652810955909284, + "learning_rate": 1.8247741424156373e-06, + "loss": 1.0046, + "step": 5623 + }, + { + "epoch": 0.5914785649493209, + "grad_norm": 2.4957564210840797, + "learning_rate": 1.8239703866797553e-06, + "loss": 1.0137, + "step": 5624 + }, + { + "epoch": 0.5915837353911683, + "grad_norm": 2.776375551754447, + "learning_rate": 1.8231667063264282e-06, + "loss": 1.016, + "step": 5625 + }, + { + "epoch": 0.5916889058330156, + "grad_norm": 2.2349936262589067, + "learning_rate": 1.822363101445271e-06, + "loss": 0.99, + "step": 5626 + }, + { + "epoch": 0.5917940762748629, + "grad_norm": 2.136732395843005, + "learning_rate": 1.8215595721258921e-06, + "loss": 0.9894, + "step": 5627 + }, + { + "epoch": 0.5918992467167102, + "grad_norm": 2.695017472417158, + "learning_rate": 1.8207561184578915e-06, + "loss": 1.0002, + "step": 5628 + }, + { + "epoch": 0.5920044171585576, + "grad_norm": 2.69308063651352, + "learning_rate": 1.8199527405308593e-06, + "loss": 1.0041, + "step": 5629 + }, + { + "epoch": 0.5921095876004049, + "grad_norm": 2.534730264118862, 
+ "learning_rate": 1.81914943843438e-06, + "loss": 1.0193, + "step": 5630 + }, + { + "epoch": 0.5922147580422522, + "grad_norm": 2.4582659603999564, + "learning_rate": 1.818346212258027e-06, + "loss": 1.0088, + "step": 5631 + }, + { + "epoch": 0.5923199284840995, + "grad_norm": 2.495714807866576, + "learning_rate": 1.817543062091367e-06, + "loss": 0.9929, + "step": 5632 + }, + { + "epoch": 0.5924250989259469, + "grad_norm": 2.58654760819409, + "learning_rate": 1.8167399880239572e-06, + "loss": 0.995, + "step": 5633 + }, + { + "epoch": 0.5925302693677942, + "grad_norm": 2.876672103469995, + "learning_rate": 1.815936990145345e-06, + "loss": 0.954, + "step": 5634 + }, + { + "epoch": 0.5926354398096415, + "grad_norm": 2.384145352263863, + "learning_rate": 1.8151340685450745e-06, + "loss": 0.9896, + "step": 5635 + }, + { + "epoch": 0.5927406102514888, + "grad_norm": 3.345693108309314, + "learning_rate": 1.8143312233126748e-06, + "loss": 1.0084, + "step": 5636 + }, + { + "epoch": 0.5928457806933362, + "grad_norm": 3.5913950173149782, + "learning_rate": 1.81352845453767e-06, + "loss": 0.983, + "step": 5637 + }, + { + "epoch": 0.5929509511351835, + "grad_norm": 2.3914332678489827, + "learning_rate": 1.8127257623095743e-06, + "loss": 1.002, + "step": 5638 + }, + { + "epoch": 0.5930561215770308, + "grad_norm": 3.185510171534475, + "learning_rate": 1.811923146717896e-06, + "loss": 0.9976, + "step": 5639 + }, + { + "epoch": 0.5931612920188781, + "grad_norm": 2.0993299580165767, + "learning_rate": 1.811120607852132e-06, + "loss": 0.9655, + "step": 5640 + }, + { + "epoch": 0.5932664624607255, + "grad_norm": 2.8049029671520116, + "learning_rate": 1.8103181458017719e-06, + "loss": 0.9852, + "step": 5641 + }, + { + "epoch": 0.5933716329025728, + "grad_norm": 2.0996772552523675, + "learning_rate": 1.8095157606562957e-06, + "loss": 0.9822, + "step": 5642 + }, + { + "epoch": 0.5934768033444201, + "grad_norm": 2.3389036391566878, + "learning_rate": 1.8087134525051762e-06, + "loss": 1.0309, + "step": 5643 + }, + { + "epoch": 0.5935819737862674, + "grad_norm": 2.3139277809580077, + "learning_rate": 1.8079112214378769e-06, + "loss": 0.9498, + "step": 5644 + }, + { + "epoch": 0.5936871442281146, + "grad_norm": 2.6546675909969113, + "learning_rate": 1.807109067543853e-06, + "loss": 1.0182, + "step": 5645 + }, + { + "epoch": 0.593792314669962, + "grad_norm": 2.591901764829732, + "learning_rate": 1.8063069909125502e-06, + "loss": 1.0092, + "step": 5646 + }, + { + "epoch": 0.5938974851118093, + "grad_norm": 1.5887306890493018, + "learning_rate": 1.805504991633406e-06, + "loss": 0.9922, + "step": 5647 + }, + { + "epoch": 0.5940026555536566, + "grad_norm": 2.160624868604629, + "learning_rate": 1.8047030697958513e-06, + "loss": 0.9504, + "step": 5648 + }, + { + "epoch": 0.5941078259955039, + "grad_norm": 2.0657716812683518, + "learning_rate": 1.8039012254893054e-06, + "loss": 0.993, + "step": 5649 + }, + { + "epoch": 0.5942129964373513, + "grad_norm": 2.827332461738818, + "learning_rate": 1.8030994588031804e-06, + "loss": 1.009, + "step": 5650 + }, + { + "epoch": 0.5943181668791986, + "grad_norm": 2.5335501756770125, + "learning_rate": 1.80229776982688e-06, + "loss": 0.9963, + "step": 5651 + }, + { + "epoch": 0.5944233373210459, + "grad_norm": 2.394767186447968, + "learning_rate": 1.801496158649797e-06, + "loss": 1.0434, + "step": 5652 + }, + { + "epoch": 0.5945285077628932, + "grad_norm": 3.032987653965299, + "learning_rate": 1.800694625361319e-06, + "loss": 1.0317, + "step": 5653 + }, + { + "epoch": 0.5946336782047406, 
+ "grad_norm": 2.3486440028502384, + "learning_rate": 1.799893170050823e-06, + "loss": 0.9893, + "step": 5654 + }, + { + "epoch": 0.5947388486465879, + "grad_norm": 2.014009694496243, + "learning_rate": 1.7990917928076768e-06, + "loss": 0.9973, + "step": 5655 + }, + { + "epoch": 0.5948440190884352, + "grad_norm": 1.9133594346364118, + "learning_rate": 1.7982904937212409e-06, + "loss": 0.9813, + "step": 5656 + }, + { + "epoch": 0.5949491895302825, + "grad_norm": 3.1233038474070423, + "learning_rate": 1.7974892728808653e-06, + "loss": 0.9512, + "step": 5657 + }, + { + "epoch": 0.5950543599721299, + "grad_norm": 2.19478691298938, + "learning_rate": 1.7966881303758938e-06, + "loss": 0.9901, + "step": 5658 + }, + { + "epoch": 0.5951595304139772, + "grad_norm": 2.1551749316228093, + "learning_rate": 1.795887066295659e-06, + "loss": 1.0007, + "step": 5659 + }, + { + "epoch": 0.5952647008558245, + "grad_norm": 2.5085227649724957, + "learning_rate": 1.7950860807294863e-06, + "loss": 1.0337, + "step": 5660 + }, + { + "epoch": 0.5953698712976718, + "grad_norm": 2.385692972652722, + "learning_rate": 1.7942851737666906e-06, + "loss": 0.9654, + "step": 5661 + }, + { + "epoch": 0.5954750417395192, + "grad_norm": 2.221445423122614, + "learning_rate": 1.7934843454965808e-06, + "loss": 0.9642, + "step": 5662 + }, + { + "epoch": 0.5955802121813665, + "grad_norm": 2.745661167238812, + "learning_rate": 1.7926835960084555e-06, + "loss": 1.0256, + "step": 5663 + }, + { + "epoch": 0.5956853826232138, + "grad_norm": 2.227270973619197, + "learning_rate": 1.7918829253916032e-06, + "loss": 1.0145, + "step": 5664 + }, + { + "epoch": 0.595790553065061, + "grad_norm": 1.9107730031675045, + "learning_rate": 1.7910823337353062e-06, + "loss": 0.9939, + "step": 5665 + }, + { + "epoch": 0.5958957235069083, + "grad_norm": 2.273694926717976, + "learning_rate": 1.790281821128835e-06, + "loss": 1.0495, + "step": 5666 + }, + { + "epoch": 0.5960008939487557, + "grad_norm": 2.3672345502650383, + "learning_rate": 1.7894813876614547e-06, + "loss": 0.9918, + "step": 5667 + }, + { + "epoch": 0.596106064390603, + "grad_norm": 2.338013846800265, + "learning_rate": 1.7886810334224192e-06, + "loss": 1.0049, + "step": 5668 + }, + { + "epoch": 0.5962112348324503, + "grad_norm": 2.2878641074827413, + "learning_rate": 1.7878807585009744e-06, + "loss": 0.9282, + "step": 5669 + }, + { + "epoch": 0.5963164052742976, + "grad_norm": 2.447388144552986, + "learning_rate": 1.7870805629863563e-06, + "loss": 0.9712, + "step": 5670 + }, + { + "epoch": 0.596421575716145, + "grad_norm": 2.4810306355833025, + "learning_rate": 1.7862804469677942e-06, + "loss": 0.9651, + "step": 5671 + }, + { + "epoch": 0.5965267461579923, + "grad_norm": 2.049575677232311, + "learning_rate": 1.7854804105345064e-06, + "loss": 0.971, + "step": 5672 + }, + { + "epoch": 0.5966319165998396, + "grad_norm": 2.715949255809658, + "learning_rate": 1.7846804537757034e-06, + "loss": 0.9999, + "step": 5673 + }, + { + "epoch": 0.5967370870416869, + "grad_norm": 2.384190947460149, + "learning_rate": 1.7838805767805866e-06, + "loss": 0.9835, + "step": 5674 + }, + { + "epoch": 0.5968422574835343, + "grad_norm": 1.9117408819820847, + "learning_rate": 1.7830807796383475e-06, + "loss": 0.969, + "step": 5675 + }, + { + "epoch": 0.5969474279253816, + "grad_norm": 2.26677544024507, + "learning_rate": 1.782281062438172e-06, + "loss": 0.9658, + "step": 5676 + }, + { + "epoch": 0.5970525983672289, + "grad_norm": 2.0051996758671806, + "learning_rate": 1.7814814252692333e-06, + "loss": 0.9929, + "step": 
5677 + }, + { + "epoch": 0.5971577688090762, + "grad_norm": 2.6826187119200426, + "learning_rate": 1.7806818682206972e-06, + "loss": 1.0851, + "step": 5678 + }, + { + "epoch": 0.5972629392509236, + "grad_norm": 2.619403217077916, + "learning_rate": 1.779882391381721e-06, + "loss": 0.9448, + "step": 5679 + }, + { + "epoch": 0.5973681096927709, + "grad_norm": 2.5065928671207316, + "learning_rate": 1.7790829948414512e-06, + "loss": 1.0461, + "step": 5680 + }, + { + "epoch": 0.5974732801346182, + "grad_norm": 2.450270322938859, + "learning_rate": 1.778283678689029e-06, + "loss": 0.9973, + "step": 5681 + }, + { + "epoch": 0.5975784505764655, + "grad_norm": 2.519020526501517, + "learning_rate": 1.7774844430135823e-06, + "loss": 0.9834, + "step": 5682 + }, + { + "epoch": 0.5976836210183128, + "grad_norm": 1.6420026627989386, + "learning_rate": 1.7766852879042335e-06, + "loss": 0.9535, + "step": 5683 + }, + { + "epoch": 0.5977887914601602, + "grad_norm": 2.0627308895943623, + "learning_rate": 1.7758862134500926e-06, + "loss": 0.9999, + "step": 5684 + }, + { + "epoch": 0.5978939619020074, + "grad_norm": 2.6591210105862904, + "learning_rate": 1.7750872197402652e-06, + "loss": 0.9714, + "step": 5685 + }, + { + "epoch": 0.5979991323438547, + "grad_norm": 2.4500227653148294, + "learning_rate": 1.7742883068638447e-06, + "loss": 0.9585, + "step": 5686 + }, + { + "epoch": 0.598104302785702, + "grad_norm": 2.740294615631952, + "learning_rate": 1.773489474909915e-06, + "loss": 1.014, + "step": 5687 + }, + { + "epoch": 0.5982094732275494, + "grad_norm": 3.1109859813510115, + "learning_rate": 1.7726907239675523e-06, + "loss": 0.9821, + "step": 5688 + }, + { + "epoch": 0.5983146436693967, + "grad_norm": 2.1772995997746176, + "learning_rate": 1.771892054125823e-06, + "loss": 1.0209, + "step": 5689 + }, + { + "epoch": 0.598419814111244, + "grad_norm": 2.8946158732426333, + "learning_rate": 1.7710934654737868e-06, + "loss": 1.0099, + "step": 5690 + }, + { + "epoch": 0.5985249845530913, + "grad_norm": 2.1027519425308787, + "learning_rate": 1.7702949581004917e-06, + "loss": 1.0164, + "step": 5691 + }, + { + "epoch": 0.5986301549949387, + "grad_norm": 2.931727859528439, + "learning_rate": 1.769496532094977e-06, + "loss": 0.9897, + "step": 5692 + }, + { + "epoch": 0.598735325436786, + "grad_norm": 2.3348996193471274, + "learning_rate": 1.7686981875462733e-06, + "loss": 1.0096, + "step": 5693 + }, + { + "epoch": 0.5988404958786333, + "grad_norm": 2.4906181074969207, + "learning_rate": 1.7678999245434036e-06, + "loss": 0.985, + "step": 5694 + }, + { + "epoch": 0.5989456663204806, + "grad_norm": 1.712480678425093, + "learning_rate": 1.7671017431753789e-06, + "loss": 0.9953, + "step": 5695 + }, + { + "epoch": 0.599050836762328, + "grad_norm": 1.9969216227932245, + "learning_rate": 1.7663036435312037e-06, + "loss": 1.0244, + "step": 5696 + }, + { + "epoch": 0.5991560072041753, + "grad_norm": 2.663976765479451, + "learning_rate": 1.7655056256998712e-06, + "loss": 1.0014, + "step": 5697 + }, + { + "epoch": 0.5992611776460226, + "grad_norm": 2.1061164190156885, + "learning_rate": 1.7647076897703664e-06, + "loss": 0.9526, + "step": 5698 + }, + { + "epoch": 0.5993663480878699, + "grad_norm": 2.164328415014977, + "learning_rate": 1.7639098358316673e-06, + "loss": 0.9751, + "step": 5699 + }, + { + "epoch": 0.5994715185297173, + "grad_norm": 2.926709142359428, + "learning_rate": 1.7631120639727396e-06, + "loss": 0.993, + "step": 5700 + }, + { + "epoch": 0.5995766889715646, + "grad_norm": 2.5586116221192676, + "learning_rate": 
1.762314374282541e-06, + "loss": 0.9988, + "step": 5701 + }, + { + "epoch": 0.5996818594134119, + "grad_norm": 1.9550872865008981, + "learning_rate": 1.7615167668500205e-06, + "loss": 0.9676, + "step": 5702 + }, + { + "epoch": 0.5997870298552592, + "grad_norm": 2.7161983180737646, + "learning_rate": 1.7607192417641164e-06, + "loss": 0.9666, + "step": 5703 + }, + { + "epoch": 0.5998922002971065, + "grad_norm": 2.6847054400836576, + "learning_rate": 1.7599217991137604e-06, + "loss": 1.0108, + "step": 5704 + }, + { + "epoch": 0.5999973707389539, + "grad_norm": 2.3979400888890416, + "learning_rate": 1.759124438987873e-06, + "loss": 0.9661, + "step": 5705 + }, + { + "epoch": 0.6001025411808011, + "grad_norm": 2.2431642149168467, + "learning_rate": 1.758327161475366e-06, + "loss": 0.982, + "step": 5706 + }, + { + "epoch": 0.6002077116226484, + "grad_norm": 2.177608584163875, + "learning_rate": 1.7575299666651413e-06, + "loss": 0.9424, + "step": 5707 + }, + { + "epoch": 0.6003128820644957, + "grad_norm": 1.7290319576886468, + "learning_rate": 1.7567328546460939e-06, + "loss": 0.944, + "step": 5708 + }, + { + "epoch": 0.6004180525063431, + "grad_norm": 3.080894919454507, + "learning_rate": 1.7559358255071068e-06, + "loss": 1.0085, + "step": 5709 + }, + { + "epoch": 0.6005232229481904, + "grad_norm": 2.060775484827086, + "learning_rate": 1.755138879337055e-06, + "loss": 0.9477, + "step": 5710 + }, + { + "epoch": 0.6006283933900377, + "grad_norm": 2.6684216105836183, + "learning_rate": 1.754342016224805e-06, + "loss": 1.0047, + "step": 5711 + }, + { + "epoch": 0.600733563831885, + "grad_norm": 2.1366313634883154, + "learning_rate": 1.7535452362592116e-06, + "loss": 0.9851, + "step": 5712 + }, + { + "epoch": 0.6008387342737324, + "grad_norm": 2.3649889418446284, + "learning_rate": 1.7527485395291234e-06, + "loss": 1.0063, + "step": 5713 + }, + { + "epoch": 0.6009439047155797, + "grad_norm": 1.9751341643467382, + "learning_rate": 1.7519519261233786e-06, + "loss": 0.9699, + "step": 5714 + }, + { + "epoch": 0.601049075157427, + "grad_norm": 3.1232686070504982, + "learning_rate": 1.7511553961308048e-06, + "loss": 0.9794, + "step": 5715 + }, + { + "epoch": 0.6011542455992743, + "grad_norm": 1.7374660242199027, + "learning_rate": 1.750358949640221e-06, + "loss": 0.9681, + "step": 5716 + }, + { + "epoch": 0.6012594160411217, + "grad_norm": 2.5925063706238216, + "learning_rate": 1.749562586740438e-06, + "loss": 1.0276, + "step": 5717 + }, + { + "epoch": 0.601364586482969, + "grad_norm": 2.2871091706939097, + "learning_rate": 1.7487663075202565e-06, + "loss": 0.9771, + "step": 5718 + }, + { + "epoch": 0.6014697569248163, + "grad_norm": 2.64298782172665, + "learning_rate": 1.7479701120684678e-06, + "loss": 1.0117, + "step": 5719 + }, + { + "epoch": 0.6015749273666636, + "grad_norm": 2.2690844026727097, + "learning_rate": 1.747174000473853e-06, + "loss": 0.9819, + "step": 5720 + }, + { + "epoch": 0.601680097808511, + "grad_norm": 3.5195991147155135, + "learning_rate": 1.7463779728251844e-06, + "loss": 1.0233, + "step": 5721 + }, + { + "epoch": 0.6017852682503583, + "grad_norm": 2.001707103161917, + "learning_rate": 1.745582029211228e-06, + "loss": 1.0275, + "step": 5722 + }, + { + "epoch": 0.6018904386922056, + "grad_norm": 2.008284524458236, + "learning_rate": 1.7447861697207352e-06, + "loss": 1.0409, + "step": 5723 + }, + { + "epoch": 0.6019956091340529, + "grad_norm": 2.814557277899713, + "learning_rate": 1.7439903944424513e-06, + "loss": 0.9048, + "step": 5724 + }, + { + "epoch": 0.6021007795759002, + 
"grad_norm": 2.115344313736399, + "learning_rate": 1.7431947034651111e-06, + "loss": 1.0137, + "step": 5725 + }, + { + "epoch": 0.6022059500177475, + "grad_norm": 1.708140537223153, + "learning_rate": 1.7423990968774397e-06, + "loss": 0.93, + "step": 5726 + }, + { + "epoch": 0.6023111204595948, + "grad_norm": 3.0743193234151365, + "learning_rate": 1.7416035747681554e-06, + "loss": 1.0175, + "step": 5727 + }, + { + "epoch": 0.6024162909014421, + "grad_norm": 2.109346929311083, + "learning_rate": 1.7408081372259633e-06, + "loss": 0.957, + "step": 5728 + }, + { + "epoch": 0.6025214613432894, + "grad_norm": 2.63470957119076, + "learning_rate": 1.740012784339562e-06, + "loss": 0.9973, + "step": 5729 + }, + { + "epoch": 0.6026266317851368, + "grad_norm": 2.789184435937574, + "learning_rate": 1.7392175161976384e-06, + "loss": 0.9654, + "step": 5730 + }, + { + "epoch": 0.6027318022269841, + "grad_norm": 2.891274776951523, + "learning_rate": 1.7384223328888724e-06, + "loss": 0.9714, + "step": 5731 + }, + { + "epoch": 0.6028369726688314, + "grad_norm": 2.3509105765329568, + "learning_rate": 1.7376272345019325e-06, + "loss": 1.0159, + "step": 5732 + }, + { + "epoch": 0.6029421431106787, + "grad_norm": 2.8871291599462374, + "learning_rate": 1.736832221125478e-06, + "loss": 0.9938, + "step": 5733 + }, + { + "epoch": 0.603047313552526, + "grad_norm": 2.5476087471404476, + "learning_rate": 1.7360372928481594e-06, + "loss": 1.0238, + "step": 5734 + }, + { + "epoch": 0.6031524839943734, + "grad_norm": 2.607771150913734, + "learning_rate": 1.7352424497586163e-06, + "loss": 0.9731, + "step": 5735 + }, + { + "epoch": 0.6032576544362207, + "grad_norm": 1.834225254634065, + "learning_rate": 1.7344476919454816e-06, + "loss": 0.9735, + "step": 5736 + }, + { + "epoch": 0.603362824878068, + "grad_norm": 2.1538425332658617, + "learning_rate": 1.7336530194973766e-06, + "loss": 0.9937, + "step": 5737 + }, + { + "epoch": 0.6034679953199154, + "grad_norm": 2.0723158686851604, + "learning_rate": 1.732858432502914e-06, + "loss": 0.9705, + "step": 5738 + }, + { + "epoch": 0.6035731657617627, + "grad_norm": 2.7600166097053385, + "learning_rate": 1.7320639310506935e-06, + "loss": 0.9406, + "step": 5739 + }, + { + "epoch": 0.60367833620361, + "grad_norm": 2.4094461693884144, + "learning_rate": 1.7312695152293112e-06, + "loss": 1.017, + "step": 5740 + }, + { + "epoch": 0.6037835066454573, + "grad_norm": 3.320433240431043, + "learning_rate": 1.73047518512735e-06, + "loss": 1.0132, + "step": 5741 + }, + { + "epoch": 0.6038886770873046, + "grad_norm": 2.3770891741620512, + "learning_rate": 1.729680940833383e-06, + "loss": 1.0349, + "step": 5742 + }, + { + "epoch": 0.603993847529152, + "grad_norm": 2.2580054578011373, + "learning_rate": 1.7288867824359757e-06, + "loss": 0.9242, + "step": 5743 + }, + { + "epoch": 0.6040990179709993, + "grad_norm": 2.280728334297776, + "learning_rate": 1.7280927100236816e-06, + "loss": 0.9499, + "step": 5744 + }, + { + "epoch": 0.6042041884128466, + "grad_norm": 3.1522624706614413, + "learning_rate": 1.7272987236850475e-06, + "loss": 1.0292, + "step": 5745 + }, + { + "epoch": 0.6043093588546938, + "grad_norm": 2.9642417340517553, + "learning_rate": 1.7265048235086078e-06, + "loss": 1.0452, + "step": 5746 + }, + { + "epoch": 0.6044145292965412, + "grad_norm": 2.6256509593397146, + "learning_rate": 1.7257110095828894e-06, + "loss": 0.9713, + "step": 5747 + }, + { + "epoch": 0.6045196997383885, + "grad_norm": 2.363877425800695, + "learning_rate": 1.7249172819964084e-06, + "loss": 0.9749, + "step": 5748 
+ }, + { + "epoch": 0.6046248701802358, + "grad_norm": 2.8124098873748755, + "learning_rate": 1.7241236408376702e-06, + "loss": 1.031, + "step": 5749 + }, + { + "epoch": 0.6047300406220831, + "grad_norm": 1.9881986505849607, + "learning_rate": 1.7233300861951743e-06, + "loss": 1.024, + "step": 5750 + }, + { + "epoch": 0.6048352110639305, + "grad_norm": 2.6635556816470602, + "learning_rate": 1.7225366181574072e-06, + "loss": 1.0287, + "step": 5751 + }, + { + "epoch": 0.6049403815057778, + "grad_norm": 2.277134066909746, + "learning_rate": 1.7217432368128468e-06, + "loss": 0.9973, + "step": 5752 + }, + { + "epoch": 0.6050455519476251, + "grad_norm": 2.3189335868813776, + "learning_rate": 1.7209499422499607e-06, + "loss": 1.0081, + "step": 5753 + }, + { + "epoch": 0.6051507223894724, + "grad_norm": 2.2043466967505334, + "learning_rate": 1.7201567345572084e-06, + "loss": 1.0241, + "step": 5754 + }, + { + "epoch": 0.6052558928313198, + "grad_norm": 2.1236697874477737, + "learning_rate": 1.7193636138230382e-06, + "loss": 0.9812, + "step": 5755 + }, + { + "epoch": 0.6053610632731671, + "grad_norm": 2.774793111808096, + "learning_rate": 1.7185705801358892e-06, + "loss": 0.9876, + "step": 5756 + }, + { + "epoch": 0.6054662337150144, + "grad_norm": 2.7424681309962864, + "learning_rate": 1.7177776335841912e-06, + "loss": 1.0057, + "step": 5757 + }, + { + "epoch": 0.6055714041568617, + "grad_norm": 1.6016238303844932, + "learning_rate": 1.7169847742563624e-06, + "loss": 0.9981, + "step": 5758 + }, + { + "epoch": 0.605676574598709, + "grad_norm": 2.5812078320001364, + "learning_rate": 1.7161920022408153e-06, + "loss": 0.9899, + "step": 5759 + }, + { + "epoch": 0.6057817450405564, + "grad_norm": 2.099760762558072, + "learning_rate": 1.7153993176259481e-06, + "loss": 0.9424, + "step": 5760 + }, + { + "epoch": 0.6058869154824037, + "grad_norm": 2.7673202803512353, + "learning_rate": 1.714606720500152e-06, + "loss": 0.9918, + "step": 5761 + }, + { + "epoch": 0.605992085924251, + "grad_norm": 2.0971157563339053, + "learning_rate": 1.7138142109518068e-06, + "loss": 1.0119, + "step": 5762 + }, + { + "epoch": 0.6060972563660983, + "grad_norm": 2.4244381472334124, + "learning_rate": 1.7130217890692857e-06, + "loss": 0.9751, + "step": 5763 + }, + { + "epoch": 0.6062024268079457, + "grad_norm": 1.875983236644177, + "learning_rate": 1.7122294549409486e-06, + "loss": 0.9974, + "step": 5764 + }, + { + "epoch": 0.606307597249793, + "grad_norm": 1.9342289332061562, + "learning_rate": 1.7114372086551466e-06, + "loss": 0.9757, + "step": 5765 + }, + { + "epoch": 0.6064127676916403, + "grad_norm": 2.6297959475374175, + "learning_rate": 1.7106450503002214e-06, + "loss": 0.9784, + "step": 5766 + }, + { + "epoch": 0.6065179381334875, + "grad_norm": 3.026075897619725, + "learning_rate": 1.709852979964505e-06, + "loss": 0.9993, + "step": 5767 + }, + { + "epoch": 0.6066231085753349, + "grad_norm": 2.099309327822422, + "learning_rate": 1.7090609977363198e-06, + "loss": 0.9941, + "step": 5768 + }, + { + "epoch": 0.6067282790171822, + "grad_norm": 2.839764446363711, + "learning_rate": 1.7082691037039772e-06, + "loss": 0.9789, + "step": 5769 + }, + { + "epoch": 0.6068334494590295, + "grad_norm": 2.6592502093600325, + "learning_rate": 1.7074772979557802e-06, + "loss": 1.0025, + "step": 5770 + }, + { + "epoch": 0.6069386199008768, + "grad_norm": 2.3396615859703527, + "learning_rate": 1.7066855805800203e-06, + "loss": 0.945, + "step": 5771 + }, + { + "epoch": 0.6070437903427242, + "grad_norm": 2.967177827861926, + "learning_rate": 
1.7058939516649814e-06, + "loss": 0.9851, + "step": 5772 + }, + { + "epoch": 0.6071489607845715, + "grad_norm": 2.8406257980156466, + "learning_rate": 1.7051024112989365e-06, + "loss": 0.9895, + "step": 5773 + }, + { + "epoch": 0.6072541312264188, + "grad_norm": 2.539983854743361, + "learning_rate": 1.7043109595701472e-06, + "loss": 1.0268, + "step": 5774 + }, + { + "epoch": 0.6073593016682661, + "grad_norm": 2.991656526457313, + "learning_rate": 1.7035195965668669e-06, + "loss": 0.9855, + "step": 5775 + }, + { + "epoch": 0.6074644721101135, + "grad_norm": 1.9105665351289631, + "learning_rate": 1.7027283223773378e-06, + "loss": 0.9728, + "step": 5776 + }, + { + "epoch": 0.6075696425519608, + "grad_norm": 2.2526519815756303, + "learning_rate": 1.7019371370897953e-06, + "loss": 0.9993, + "step": 5777 + }, + { + "epoch": 0.6076748129938081, + "grad_norm": 1.9285557068670938, + "learning_rate": 1.7011460407924616e-06, + "loss": 0.9362, + "step": 5778 + }, + { + "epoch": 0.6077799834356554, + "grad_norm": 3.059051701475386, + "learning_rate": 1.70035503357355e-06, + "loss": 1.0401, + "step": 5779 + }, + { + "epoch": 0.6078851538775027, + "grad_norm": 2.7521768591254525, + "learning_rate": 1.6995641155212638e-06, + "loss": 1.0341, + "step": 5780 + }, + { + "epoch": 0.6079903243193501, + "grad_norm": 2.680907512225668, + "learning_rate": 1.6987732867237967e-06, + "loss": 0.9964, + "step": 5781 + }, + { + "epoch": 0.6080954947611974, + "grad_norm": 2.8137952010868705, + "learning_rate": 1.6979825472693325e-06, + "loss": 0.9617, + "step": 5782 + }, + { + "epoch": 0.6082006652030447, + "grad_norm": 3.038100520437388, + "learning_rate": 1.6971918972460446e-06, + "loss": 1.0006, + "step": 5783 + }, + { + "epoch": 0.608305835644892, + "grad_norm": 2.754354820698447, + "learning_rate": 1.6964013367420967e-06, + "loss": 0.9527, + "step": 5784 + }, + { + "epoch": 0.6084110060867394, + "grad_norm": 2.8270349910908354, + "learning_rate": 1.695610865845641e-06, + "loss": 0.9745, + "step": 5785 + }, + { + "epoch": 0.6085161765285867, + "grad_norm": 2.7388567829030115, + "learning_rate": 1.694820484644824e-06, + "loss": 0.9969, + "step": 5786 + }, + { + "epoch": 0.6086213469704339, + "grad_norm": 2.0265884465537396, + "learning_rate": 1.6940301932277775e-06, + "loss": 0.982, + "step": 5787 + }, + { + "epoch": 0.6087265174122812, + "grad_norm": 2.679646606508221, + "learning_rate": 1.6932399916826254e-06, + "loss": 1.0164, + "step": 5788 + }, + { + "epoch": 0.6088316878541286, + "grad_norm": 2.4228792065312215, + "learning_rate": 1.692449880097482e-06, + "loss": 0.9963, + "step": 5789 + }, + { + "epoch": 0.6089368582959759, + "grad_norm": 2.5768372333502376, + "learning_rate": 1.6916598585604488e-06, + "loss": 0.9351, + "step": 5790 + }, + { + "epoch": 0.6090420287378232, + "grad_norm": 2.1737942757800415, + "learning_rate": 1.6908699271596213e-06, + "loss": 0.9949, + "step": 5791 + }, + { + "epoch": 0.6091471991796705, + "grad_norm": 3.049935748235343, + "learning_rate": 1.690080085983083e-06, + "loss": 0.9867, + "step": 5792 + }, + { + "epoch": 0.6092523696215179, + "grad_norm": 2.1746481010421723, + "learning_rate": 1.6892903351189065e-06, + "loss": 0.982, + "step": 5793 + }, + { + "epoch": 0.6093575400633652, + "grad_norm": 2.9706327728285493, + "learning_rate": 1.6885006746551551e-06, + "loss": 0.9631, + "step": 5794 + }, + { + "epoch": 0.6094627105052125, + "grad_norm": 1.935639999637283, + "learning_rate": 1.6877111046798829e-06, + "loss": 0.9725, + "step": 5795 + }, + { + "epoch": 0.6095678809470598, + 
"grad_norm": 1.936700358641335, + "learning_rate": 1.6869216252811327e-06, + "loss": 0.939, + "step": 5796 + }, + { + "epoch": 0.6096730513889072, + "grad_norm": 2.0817904454728176, + "learning_rate": 1.6861322365469373e-06, + "loss": 0.9761, + "step": 5797 + }, + { + "epoch": 0.6097782218307545, + "grad_norm": 1.5332951079387047, + "learning_rate": 1.6853429385653196e-06, + "loss": 1.0034, + "step": 5798 + }, + { + "epoch": 0.6098833922726018, + "grad_norm": 1.995502353302454, + "learning_rate": 1.6845537314242925e-06, + "loss": 0.9639, + "step": 5799 + }, + { + "epoch": 0.6099885627144491, + "grad_norm": 2.0050706202560553, + "learning_rate": 1.6837646152118592e-06, + "loss": 0.9861, + "step": 5800 + }, + { + "epoch": 0.6100937331562964, + "grad_norm": 2.331889223890956, + "learning_rate": 1.6829755900160127e-06, + "loss": 0.9623, + "step": 5801 + }, + { + "epoch": 0.6101989035981438, + "grad_norm": 2.77022089749751, + "learning_rate": 1.6821866559247344e-06, + "loss": 1.0013, + "step": 5802 + }, + { + "epoch": 0.6103040740399911, + "grad_norm": 2.550781452325086, + "learning_rate": 1.681397813025997e-06, + "loss": 1.0148, + "step": 5803 + }, + { + "epoch": 0.6104092444818384, + "grad_norm": 2.978580897273411, + "learning_rate": 1.6806090614077619e-06, + "loss": 0.9679, + "step": 5804 + }, + { + "epoch": 0.6105144149236857, + "grad_norm": 2.394625694101748, + "learning_rate": 1.6798204011579824e-06, + "loss": 0.9528, + "step": 5805 + }, + { + "epoch": 0.6106195853655331, + "grad_norm": 2.0977205974223407, + "learning_rate": 1.6790318323645996e-06, + "loss": 1.0424, + "step": 5806 + }, + { + "epoch": 0.6107247558073803, + "grad_norm": 2.7073708250144715, + "learning_rate": 1.678243355115545e-06, + "loss": 0.9837, + "step": 5807 + }, + { + "epoch": 0.6108299262492276, + "grad_norm": 2.815173180361358, + "learning_rate": 1.6774549694987388e-06, + "loss": 0.9832, + "step": 5808 + }, + { + "epoch": 0.6109350966910749, + "grad_norm": 2.3860074360187573, + "learning_rate": 1.676666675602095e-06, + "loss": 1.0108, + "step": 5809 + }, + { + "epoch": 0.6110402671329223, + "grad_norm": 2.5298738306057746, + "learning_rate": 1.6758784735135119e-06, + "loss": 1.0427, + "step": 5810 + }, + { + "epoch": 0.6111454375747696, + "grad_norm": 2.5000170631689826, + "learning_rate": 1.6750903633208812e-06, + "loss": 0.9571, + "step": 5811 + }, + { + "epoch": 0.6112506080166169, + "grad_norm": 2.50169544611617, + "learning_rate": 1.6743023451120831e-06, + "loss": 0.9463, + "step": 5812 + }, + { + "epoch": 0.6113557784584642, + "grad_norm": 1.7081141405864757, + "learning_rate": 1.673514418974987e-06, + "loss": 0.9924, + "step": 5813 + }, + { + "epoch": 0.6114609489003116, + "grad_norm": 3.2506880206165825, + "learning_rate": 1.672726584997454e-06, + "loss": 0.9991, + "step": 5814 + }, + { + "epoch": 0.6115661193421589, + "grad_norm": 2.953416905081632, + "learning_rate": 1.6719388432673336e-06, + "loss": 1.0007, + "step": 5815 + }, + { + "epoch": 0.6116712897840062, + "grad_norm": 2.818662791521142, + "learning_rate": 1.6711511938724648e-06, + "loss": 0.9789, + "step": 5816 + }, + { + "epoch": 0.6117764602258535, + "grad_norm": 2.3521060058811187, + "learning_rate": 1.670363636900676e-06, + "loss": 0.9882, + "step": 5817 + }, + { + "epoch": 0.6118816306677008, + "grad_norm": 2.293305321670319, + "learning_rate": 1.669576172439787e-06, + "loss": 1.0534, + "step": 5818 + }, + { + "epoch": 0.6119868011095482, + "grad_norm": 1.6726020449012442, + "learning_rate": 1.668788800577606e-06, + "loss": 0.9545, + "step": 
5819 + }, + { + "epoch": 0.6120919715513955, + "grad_norm": 2.76117276419402, + "learning_rate": 1.6680015214019305e-06, + "loss": 0.977, + "step": 5820 + }, + { + "epoch": 0.6121971419932428, + "grad_norm": 2.1802762486981258, + "learning_rate": 1.6672143350005488e-06, + "loss": 1.0135, + "step": 5821 + }, + { + "epoch": 0.6123023124350901, + "grad_norm": 3.9598469389280613, + "learning_rate": 1.6664272414612368e-06, + "loss": 1.0158, + "step": 5822 + }, + { + "epoch": 0.6124074828769375, + "grad_norm": 2.2549595245033145, + "learning_rate": 1.6656402408717636e-06, + "loss": 0.9378, + "step": 5823 + }, + { + "epoch": 0.6125126533187848, + "grad_norm": 2.196920678193086, + "learning_rate": 1.6648533333198858e-06, + "loss": 1.0162, + "step": 5824 + }, + { + "epoch": 0.6126178237606321, + "grad_norm": 2.6642892656099866, + "learning_rate": 1.664066518893349e-06, + "loss": 1.0044, + "step": 5825 + }, + { + "epoch": 0.6127229942024794, + "grad_norm": 2.7734023454217027, + "learning_rate": 1.6632797976798887e-06, + "loss": 0.9841, + "step": 5826 + }, + { + "epoch": 0.6128281646443268, + "grad_norm": 2.3104109605714718, + "learning_rate": 1.6624931697672298e-06, + "loss": 0.973, + "step": 5827 + }, + { + "epoch": 0.612933335086174, + "grad_norm": 2.9928279355445984, + "learning_rate": 1.6617066352430893e-06, + "loss": 0.984, + "step": 5828 + }, + { + "epoch": 0.6130385055280213, + "grad_norm": 2.5371426562781116, + "learning_rate": 1.6609201941951715e-06, + "loss": 0.9845, + "step": 5829 + }, + { + "epoch": 0.6131436759698686, + "grad_norm": 1.8754458670700893, + "learning_rate": 1.6601338467111699e-06, + "loss": 0.9817, + "step": 5830 + }, + { + "epoch": 0.613248846411716, + "grad_norm": 2.322347493475784, + "learning_rate": 1.6593475928787683e-06, + "loss": 0.9833, + "step": 5831 + }, + { + "epoch": 0.6133540168535633, + "grad_norm": 2.604820493235049, + "learning_rate": 1.6585614327856408e-06, + "loss": 0.9869, + "step": 5832 + }, + { + "epoch": 0.6134591872954106, + "grad_norm": 2.2184504978618222, + "learning_rate": 1.6577753665194502e-06, + "loss": 0.9635, + "step": 5833 + }, + { + "epoch": 0.6135643577372579, + "grad_norm": 2.8750863642027644, + "learning_rate": 1.656989394167849e-06, + "loss": 1.0122, + "step": 5834 + }, + { + "epoch": 0.6136695281791053, + "grad_norm": 2.429229413863499, + "learning_rate": 1.6562035158184791e-06, + "loss": 0.9418, + "step": 5835 + }, + { + "epoch": 0.6137746986209526, + "grad_norm": 2.404031095251293, + "learning_rate": 1.655417731558971e-06, + "loss": 0.9868, + "step": 5836 + }, + { + "epoch": 0.6138798690627999, + "grad_norm": 3.657059627061674, + "learning_rate": 1.6546320414769474e-06, + "loss": 1.0213, + "step": 5837 + }, + { + "epoch": 0.6139850395046472, + "grad_norm": 2.3207503976843564, + "learning_rate": 1.6538464456600186e-06, + "loss": 0.9973, + "step": 5838 + }, + { + "epoch": 0.6140902099464945, + "grad_norm": 2.1274755433279586, + "learning_rate": 1.6530609441957844e-06, + "loss": 0.9843, + "step": 5839 + }, + { + "epoch": 0.6141953803883419, + "grad_norm": 1.7062502169444627, + "learning_rate": 1.6522755371718333e-06, + "loss": 0.9573, + "step": 5840 + }, + { + "epoch": 0.6143005508301892, + "grad_norm": 2.7514773804668002, + "learning_rate": 1.6514902246757458e-06, + "loss": 1.0072, + "step": 5841 + }, + { + "epoch": 0.6144057212720365, + "grad_norm": 2.251985168187469, + "learning_rate": 1.6507050067950898e-06, + "loss": 0.9661, + "step": 5842 + }, + { + "epoch": 0.6145108917138838, + "grad_norm": 1.8065977765727852, + "learning_rate": 
1.649919883617423e-06, + "loss": 0.9683, + "step": 5843 + }, + { + "epoch": 0.6146160621557312, + "grad_norm": 2.9561496412074106, + "learning_rate": 1.6491348552302927e-06, + "loss": 1.0008, + "step": 5844 + }, + { + "epoch": 0.6147212325975785, + "grad_norm": 1.9869101556732869, + "learning_rate": 1.6483499217212357e-06, + "loss": 1.0071, + "step": 5845 + }, + { + "epoch": 0.6148264030394258, + "grad_norm": 2.1835075291524184, + "learning_rate": 1.6475650831777784e-06, + "loss": 0.9863, + "step": 5846 + }, + { + "epoch": 0.6149315734812731, + "grad_norm": 2.430561118173312, + "learning_rate": 1.6467803396874365e-06, + "loss": 0.9679, + "step": 5847 + }, + { + "epoch": 0.6150367439231204, + "grad_norm": 2.28403426164745, + "learning_rate": 1.6459956913377145e-06, + "loss": 1.02, + "step": 5848 + }, + { + "epoch": 0.6151419143649677, + "grad_norm": 2.6620061740930017, + "learning_rate": 1.6452111382161074e-06, + "loss": 0.9631, + "step": 5849 + }, + { + "epoch": 0.615247084806815, + "grad_norm": 1.6904530329363399, + "learning_rate": 1.6444266804100972e-06, + "loss": 0.9713, + "step": 5850 + }, + { + "epoch": 0.6153522552486623, + "grad_norm": 2.3212203494074455, + "learning_rate": 1.6436423180071598e-06, + "loss": 0.968, + "step": 5851 + }, + { + "epoch": 0.6154574256905097, + "grad_norm": 2.5364871090662837, + "learning_rate": 1.6428580510947563e-06, + "loss": 0.9652, + "step": 5852 + }, + { + "epoch": 0.615562596132357, + "grad_norm": 3.030037498205161, + "learning_rate": 1.6420738797603386e-06, + "loss": 0.9879, + "step": 5853 + }, + { + "epoch": 0.6156677665742043, + "grad_norm": 2.822749830603122, + "learning_rate": 1.6412898040913471e-06, + "loss": 1.0141, + "step": 5854 + }, + { + "epoch": 0.6157729370160516, + "grad_norm": 2.5948534835106645, + "learning_rate": 1.6405058241752142e-06, + "loss": 0.9961, + "step": 5855 + }, + { + "epoch": 0.615878107457899, + "grad_norm": 1.7892323129704704, + "learning_rate": 1.6397219400993584e-06, + "loss": 0.9357, + "step": 5856 + }, + { + "epoch": 0.6159832778997463, + "grad_norm": 2.4386090952796935, + "learning_rate": 1.6389381519511893e-06, + "loss": 0.977, + "step": 5857 + }, + { + "epoch": 0.6160884483415936, + "grad_norm": 2.621745541745822, + "learning_rate": 1.6381544598181048e-06, + "loss": 1.0185, + "step": 5858 + }, + { + "epoch": 0.6161936187834409, + "grad_norm": 2.232535333948487, + "learning_rate": 1.6373708637874925e-06, + "loss": 1.0113, + "step": 5859 + }, + { + "epoch": 0.6162987892252882, + "grad_norm": 3.2749764704846553, + "learning_rate": 1.6365873639467314e-06, + "loss": 0.9737, + "step": 5860 + }, + { + "epoch": 0.6164039596671356, + "grad_norm": 1.869914950095561, + "learning_rate": 1.635803960383186e-06, + "loss": 1.0177, + "step": 5861 + }, + { + "epoch": 0.6165091301089829, + "grad_norm": 2.285719824488398, + "learning_rate": 1.6350206531842122e-06, + "loss": 1.0154, + "step": 5862 + }, + { + "epoch": 0.6166143005508302, + "grad_norm": 2.4241270283535825, + "learning_rate": 1.634237442437154e-06, + "loss": 1.0028, + "step": 5863 + }, + { + "epoch": 0.6167194709926775, + "grad_norm": 2.2622451802360932, + "learning_rate": 1.6334543282293475e-06, + "loss": 0.9626, + "step": 5864 + }, + { + "epoch": 0.6168246414345249, + "grad_norm": 3.2500786695737562, + "learning_rate": 1.6326713106481148e-06, + "loss": 0.9625, + "step": 5865 + }, + { + "epoch": 0.6169298118763722, + "grad_norm": 3.008447944721862, + "learning_rate": 1.6318883897807688e-06, + "loss": 0.9794, + "step": 5866 + }, + { + "epoch": 0.6170349823182195, + 
"grad_norm": 2.7479663807080907, + "learning_rate": 1.631105565714611e-06, + "loss": 0.988, + "step": 5867 + }, + { + "epoch": 0.6171401527600667, + "grad_norm": 2.784120946238147, + "learning_rate": 1.6303228385369318e-06, + "loss": 1.0219, + "step": 5868 + }, + { + "epoch": 0.617245323201914, + "grad_norm": 2.3151431857814155, + "learning_rate": 1.629540208335013e-06, + "loss": 1.0192, + "step": 5869 + }, + { + "epoch": 0.6173504936437614, + "grad_norm": 2.5308826576755656, + "learning_rate": 1.6287576751961225e-06, + "loss": 0.9902, + "step": 5870 + }, + { + "epoch": 0.6174556640856087, + "grad_norm": 2.3429901480764976, + "learning_rate": 1.6279752392075192e-06, + "loss": 0.9569, + "step": 5871 + }, + { + "epoch": 0.617560834527456, + "grad_norm": 2.181050276365485, + "learning_rate": 1.6271929004564514e-06, + "loss": 1.0071, + "step": 5872 + }, + { + "epoch": 0.6176660049693034, + "grad_norm": 2.601702484291357, + "learning_rate": 1.6264106590301538e-06, + "loss": 0.9857, + "step": 5873 + }, + { + "epoch": 0.6177711754111507, + "grad_norm": 2.459923668745263, + "learning_rate": 1.6256285150158551e-06, + "loss": 0.9918, + "step": 5874 + }, + { + "epoch": 0.617876345852998, + "grad_norm": 2.197411706775619, + "learning_rate": 1.6248464685007692e-06, + "loss": 0.9421, + "step": 5875 + }, + { + "epoch": 0.6179815162948453, + "grad_norm": 2.095985627899745, + "learning_rate": 1.6240645195721017e-06, + "loss": 1.0025, + "step": 5876 + }, + { + "epoch": 0.6180866867366926, + "grad_norm": 2.749362024760833, + "learning_rate": 1.6232826683170425e-06, + "loss": 1.034, + "step": 5877 + }, + { + "epoch": 0.61819185717854, + "grad_norm": 2.274015303414764, + "learning_rate": 1.6225009148227778e-06, + "loss": 1.0224, + "step": 5878 + }, + { + "epoch": 0.6182970276203873, + "grad_norm": 2.895694165275508, + "learning_rate": 1.6217192591764774e-06, + "loss": 1.0139, + "step": 5879 + }, + { + "epoch": 0.6184021980622346, + "grad_norm": 2.7394375689175225, + "learning_rate": 1.6209377014653028e-06, + "loss": 1.001, + "step": 5880 + }, + { + "epoch": 0.618507368504082, + "grad_norm": 2.9670712927952763, + "learning_rate": 1.6201562417764028e-06, + "loss": 1.0286, + "step": 5881 + }, + { + "epoch": 0.6186125389459293, + "grad_norm": 2.313393062040808, + "learning_rate": 1.6193748801969164e-06, + "loss": 1.0052, + "step": 5882 + }, + { + "epoch": 0.6187177093877766, + "grad_norm": 2.578512587304975, + "learning_rate": 1.6185936168139724e-06, + "loss": 1.0067, + "step": 5883 + }, + { + "epoch": 0.6188228798296239, + "grad_norm": 2.489037978706978, + "learning_rate": 1.617812451714687e-06, + "loss": 0.9524, + "step": 5884 + }, + { + "epoch": 0.6189280502714712, + "grad_norm": 2.056449619227656, + "learning_rate": 1.6170313849861662e-06, + "loss": 0.9762, + "step": 5885 + }, + { + "epoch": 0.6190332207133186, + "grad_norm": 2.7761192879641396, + "learning_rate": 1.6162504167155044e-06, + "loss": 0.9772, + "step": 5886 + }, + { + "epoch": 0.6191383911551659, + "grad_norm": 2.2429168320600237, + "learning_rate": 1.6154695469897873e-06, + "loss": 0.9714, + "step": 5887 + }, + { + "epoch": 0.6192435615970132, + "grad_norm": 2.0592296766661806, + "learning_rate": 1.6146887758960865e-06, + "loss": 1.0256, + "step": 5888 + }, + { + "epoch": 0.6193487320388604, + "grad_norm": 2.393849999328136, + "learning_rate": 1.6139081035214654e-06, + "loss": 0.9837, + "step": 5889 + }, + { + "epoch": 0.6194539024807078, + "grad_norm": 2.618576013105238, + "learning_rate": 1.6131275299529736e-06, + "loss": 1.0011, + "step": 5890 
+ }, + { + "epoch": 0.6195590729225551, + "grad_norm": 2.2774723438717595, + "learning_rate": 1.612347055277651e-06, + "loss": 0.9881, + "step": 5891 + }, + { + "epoch": 0.6196642433644024, + "grad_norm": 1.7906851084234103, + "learning_rate": 1.6115666795825284e-06, + "loss": 0.9792, + "step": 5892 + }, + { + "epoch": 0.6197694138062497, + "grad_norm": 2.148647148559524, + "learning_rate": 1.6107864029546221e-06, + "loss": 1.0106, + "step": 5893 + }, + { + "epoch": 0.619874584248097, + "grad_norm": 2.6630752042411907, + "learning_rate": 1.6100062254809395e-06, + "loss": 1.0022, + "step": 5894 + }, + { + "epoch": 0.6199797546899444, + "grad_norm": 2.1012206126791355, + "learning_rate": 1.6092261472484765e-06, + "loss": 0.9314, + "step": 5895 + }, + { + "epoch": 0.6200849251317917, + "grad_norm": 2.581191874335221, + "learning_rate": 1.6084461683442176e-06, + "loss": 0.9863, + "step": 5896 + }, + { + "epoch": 0.620190095573639, + "grad_norm": 2.4711381450267513, + "learning_rate": 1.6076662888551373e-06, + "loss": 0.9523, + "step": 5897 + }, + { + "epoch": 0.6202952660154863, + "grad_norm": 1.9627152201554907, + "learning_rate": 1.6068865088681975e-06, + "loss": 0.9788, + "step": 5898 + }, + { + "epoch": 0.6204004364573337, + "grad_norm": 3.1900135916304433, + "learning_rate": 1.6061068284703502e-06, + "loss": 0.9554, + "step": 5899 + }, + { + "epoch": 0.620505606899181, + "grad_norm": 2.4902737582100762, + "learning_rate": 1.605327247748534e-06, + "loss": 0.9749, + "step": 5900 + }, + { + "epoch": 0.6206107773410283, + "grad_norm": 1.979528958259765, + "learning_rate": 1.604547766789681e-06, + "loss": 0.9627, + "step": 5901 + }, + { + "epoch": 0.6207159477828756, + "grad_norm": 2.5553977998948945, + "learning_rate": 1.6037683856807083e-06, + "loss": 0.9983, + "step": 5902 + }, + { + "epoch": 0.620821118224723, + "grad_norm": 2.5672010413971638, + "learning_rate": 1.6029891045085224e-06, + "loss": 1.0212, + "step": 5903 + }, + { + "epoch": 0.6209262886665703, + "grad_norm": 2.4249488163123267, + "learning_rate": 1.60220992336002e-06, + "loss": 1.0261, + "step": 5904 + }, + { + "epoch": 0.6210314591084176, + "grad_norm": 2.6586835053107656, + "learning_rate": 1.6014308423220848e-06, + "loss": 0.9869, + "step": 5905 + }, + { + "epoch": 0.6211366295502649, + "grad_norm": 1.9453115647301724, + "learning_rate": 1.6006518614815913e-06, + "loss": 1.002, + "step": 5906 + }, + { + "epoch": 0.6212417999921123, + "grad_norm": 2.0323748172785425, + "learning_rate": 1.5998729809254017e-06, + "loss": 0.9806, + "step": 5907 + }, + { + "epoch": 0.6213469704339596, + "grad_norm": 2.1843910280641445, + "learning_rate": 1.599094200740367e-06, + "loss": 0.9464, + "step": 5908 + }, + { + "epoch": 0.6214521408758068, + "grad_norm": 2.7758359003572113, + "learning_rate": 1.5983155210133267e-06, + "loss": 1.0226, + "step": 5909 + }, + { + "epoch": 0.6215573113176541, + "grad_norm": 2.6474203799852445, + "learning_rate": 1.5975369418311113e-06, + "loss": 1.0131, + "step": 5910 + }, + { + "epoch": 0.6216624817595015, + "grad_norm": 2.2015849233799907, + "learning_rate": 1.5967584632805378e-06, + "loss": 0.9715, + "step": 5911 + }, + { + "epoch": 0.6217676522013488, + "grad_norm": 2.5493391965343144, + "learning_rate": 1.595980085448412e-06, + "loss": 0.9627, + "step": 5912 + }, + { + "epoch": 0.6218728226431961, + "grad_norm": 2.116283213919637, + "learning_rate": 1.5952018084215293e-06, + "loss": 0.9863, + "step": 5913 + }, + { + "epoch": 0.6219779930850434, + "grad_norm": 2.4987388936618853, + "learning_rate": 
1.5944236322866725e-06, + "loss": 0.9764, + "step": 5914 + }, + { + "epoch": 0.6220831635268907, + "grad_norm": 2.327898890577115, + "learning_rate": 1.5936455571306164e-06, + "loss": 1.0053, + "step": 5915 + }, + { + "epoch": 0.6221883339687381, + "grad_norm": 2.0393178342923215, + "learning_rate": 1.592867583040122e-06, + "loss": 0.9501, + "step": 5916 + }, + { + "epoch": 0.6222935044105854, + "grad_norm": 2.5407681947430527, + "learning_rate": 1.5920897101019384e-06, + "loss": 1.0122, + "step": 5917 + }, + { + "epoch": 0.6223986748524327, + "grad_norm": 2.4916758411681226, + "learning_rate": 1.5913119384028048e-06, + "loss": 0.9687, + "step": 5918 + }, + { + "epoch": 0.62250384529428, + "grad_norm": 1.9567570424692333, + "learning_rate": 1.5905342680294495e-06, + "loss": 0.9773, + "step": 5919 + }, + { + "epoch": 0.6226090157361274, + "grad_norm": 3.263714903866836, + "learning_rate": 1.589756699068588e-06, + "loss": 0.9846, + "step": 5920 + }, + { + "epoch": 0.6227141861779747, + "grad_norm": 2.9575517862827936, + "learning_rate": 1.5889792316069259e-06, + "loss": 1.0221, + "step": 5921 + }, + { + "epoch": 0.622819356619822, + "grad_norm": 2.606415686984147, + "learning_rate": 1.5882018657311563e-06, + "loss": 0.9988, + "step": 5922 + }, + { + "epoch": 0.6229245270616693, + "grad_norm": 2.446109177024214, + "learning_rate": 1.587424601527961e-06, + "loss": 0.9587, + "step": 5923 + }, + { + "epoch": 0.6230296975035167, + "grad_norm": 2.553224170320699, + "learning_rate": 1.5866474390840126e-06, + "loss": 0.9538, + "step": 5924 + }, + { + "epoch": 0.623134867945364, + "grad_norm": 2.5094380956851507, + "learning_rate": 1.5858703784859697e-06, + "loss": 1.03, + "step": 5925 + }, + { + "epoch": 0.6232400383872113, + "grad_norm": 2.0855074263373163, + "learning_rate": 1.5850934198204812e-06, + "loss": 0.98, + "step": 5926 + }, + { + "epoch": 0.6233452088290586, + "grad_norm": 2.6961341409896455, + "learning_rate": 1.5843165631741841e-06, + "loss": 1.0272, + "step": 5927 + }, + { + "epoch": 0.623450379270906, + "grad_norm": 2.4500422545848313, + "learning_rate": 1.5835398086337017e-06, + "loss": 0.9762, + "step": 5928 + }, + { + "epoch": 0.6235555497127532, + "grad_norm": 1.9103182982629987, + "learning_rate": 1.582763156285651e-06, + "loss": 0.9472, + "step": 5929 + }, + { + "epoch": 0.6236607201546005, + "grad_norm": 2.412571799050157, + "learning_rate": 1.581986606216634e-06, + "loss": 0.9921, + "step": 5930 + }, + { + "epoch": 0.6237658905964478, + "grad_norm": 2.5521277009233536, + "learning_rate": 1.5812101585132416e-06, + "loss": 0.9579, + "step": 5931 + }, + { + "epoch": 0.6238710610382951, + "grad_norm": 2.6571937101267458, + "learning_rate": 1.5804338132620535e-06, + "loss": 1.017, + "step": 5932 + }, + { + "epoch": 0.6239762314801425, + "grad_norm": 2.0622319382949676, + "learning_rate": 1.5796575705496392e-06, + "loss": 0.9466, + "step": 5933 + }, + { + "epoch": 0.6240814019219898, + "grad_norm": 3.2155213422084974, + "learning_rate": 1.578881430462555e-06, + "loss": 1.0423, + "step": 5934 + }, + { + "epoch": 0.6241865723638371, + "grad_norm": 2.8066759471700085, + "learning_rate": 1.5781053930873468e-06, + "loss": 1.0102, + "step": 5935 + }, + { + "epoch": 0.6242917428056844, + "grad_norm": 2.3021253355837206, + "learning_rate": 1.5773294585105492e-06, + "loss": 0.9828, + "step": 5936 + }, + { + "epoch": 0.6243969132475318, + "grad_norm": 2.476492213812445, + "learning_rate": 1.5765536268186832e-06, + "loss": 1.0049, + "step": 5937 + }, + { + "epoch": 0.6245020836893791, + 
"grad_norm": 2.5139739664998375, + "learning_rate": 1.5757778980982627e-06, + "loss": 1.0335, + "step": 5938 + }, + { + "epoch": 0.6246072541312264, + "grad_norm": 2.2258196498122844, + "learning_rate": 1.5750022724357861e-06, + "loss": 0.9761, + "step": 5939 + }, + { + "epoch": 0.6247124245730737, + "grad_norm": 2.726588286992223, + "learning_rate": 1.5742267499177416e-06, + "loss": 1.0145, + "step": 5940 + }, + { + "epoch": 0.6248175950149211, + "grad_norm": 2.6669317872534544, + "learning_rate": 1.5734513306306053e-06, + "loss": 0.9992, + "step": 5941 + }, + { + "epoch": 0.6249227654567684, + "grad_norm": 2.084506573805443, + "learning_rate": 1.5726760146608443e-06, + "loss": 1.0112, + "step": 5942 + }, + { + "epoch": 0.6250279358986157, + "grad_norm": 2.6171495295222855, + "learning_rate": 1.5719008020949114e-06, + "loss": 0.9778, + "step": 5943 + }, + { + "epoch": 0.625133106340463, + "grad_norm": 2.140554094520001, + "learning_rate": 1.5711256930192486e-06, + "loss": 1.0035, + "step": 5944 + }, + { + "epoch": 0.6252382767823104, + "grad_norm": 1.9264447426695412, + "learning_rate": 1.5703506875202868e-06, + "loss": 0.9721, + "step": 5945 + }, + { + "epoch": 0.6253434472241577, + "grad_norm": 2.5055312262881144, + "learning_rate": 1.569575785684444e-06, + "loss": 0.9955, + "step": 5946 + }, + { + "epoch": 0.625448617666005, + "grad_norm": 2.807271348200614, + "learning_rate": 1.5688009875981308e-06, + "loss": 1.0316, + "step": 5947 + }, + { + "epoch": 0.6255537881078523, + "grad_norm": 2.0493312335861074, + "learning_rate": 1.5680262933477405e-06, + "loss": 0.9913, + "step": 5948 + }, + { + "epoch": 0.6256589585496997, + "grad_norm": 2.7559029051850708, + "learning_rate": 1.5672517030196582e-06, + "loss": 1.0043, + "step": 5949 + }, + { + "epoch": 0.6257641289915469, + "grad_norm": 2.1694704480396445, + "learning_rate": 1.5664772167002573e-06, + "loss": 0.9878, + "step": 5950 + }, + { + "epoch": 0.6258692994333942, + "grad_norm": 2.1988058558900323, + "learning_rate": 1.565702834475897e-06, + "loss": 0.9623, + "step": 5951 + }, + { + "epoch": 0.6259744698752415, + "grad_norm": 2.6284645651941245, + "learning_rate": 1.5649285564329296e-06, + "loss": 0.9351, + "step": 5952 + }, + { + "epoch": 0.6260796403170888, + "grad_norm": 2.2987819649896393, + "learning_rate": 1.5641543826576926e-06, + "loss": 0.9896, + "step": 5953 + }, + { + "epoch": 0.6261848107589362, + "grad_norm": 3.0571010198179485, + "learning_rate": 1.5633803132365117e-06, + "loss": 1.0123, + "step": 5954 + }, + { + "epoch": 0.6262899812007835, + "grad_norm": 3.3037893530878386, + "learning_rate": 1.5626063482557009e-06, + "loss": 0.9803, + "step": 5955 + }, + { + "epoch": 0.6263951516426308, + "grad_norm": 2.9959633481925416, + "learning_rate": 1.561832487801565e-06, + "loss": 0.9568, + "step": 5956 + }, + { + "epoch": 0.6265003220844781, + "grad_norm": 2.3258563684955638, + "learning_rate": 1.5610587319603942e-06, + "loss": 0.9807, + "step": 5957 + }, + { + "epoch": 0.6266054925263255, + "grad_norm": 2.434441203619329, + "learning_rate": 1.5602850808184695e-06, + "loss": 0.9661, + "step": 5958 + }, + { + "epoch": 0.6267106629681728, + "grad_norm": 2.33530559179327, + "learning_rate": 1.5595115344620575e-06, + "loss": 0.9838, + "step": 5959 + }, + { + "epoch": 0.6268158334100201, + "grad_norm": 2.2518352944630355, + "learning_rate": 1.558738092977415e-06, + "loss": 1.0295, + "step": 5960 + }, + { + "epoch": 0.6269210038518674, + "grad_norm": 2.4291496261842473, + "learning_rate": 1.5579647564507878e-06, + "loss": 1.0062, 
+ "step": 5961 + }, + { + "epoch": 0.6270261742937148, + "grad_norm": 2.436150506595444, + "learning_rate": 1.5571915249684084e-06, + "loss": 0.9884, + "step": 5962 + }, + { + "epoch": 0.6271313447355621, + "grad_norm": 2.3348752680999665, + "learning_rate": 1.5564183986164982e-06, + "loss": 0.9936, + "step": 5963 + }, + { + "epoch": 0.6272365151774094, + "grad_norm": 2.694260375344963, + "learning_rate": 1.5556453774812652e-06, + "loss": 0.9632, + "step": 5964 + }, + { + "epoch": 0.6273416856192567, + "grad_norm": 3.2038875399300526, + "learning_rate": 1.5548724616489095e-06, + "loss": 0.9407, + "step": 5965 + }, + { + "epoch": 0.6274468560611041, + "grad_norm": 2.7091848613454936, + "learning_rate": 1.5540996512056164e-06, + "loss": 0.9873, + "step": 5966 + }, + { + "epoch": 0.6275520265029514, + "grad_norm": 2.8068933011488033, + "learning_rate": 1.5533269462375603e-06, + "loss": 0.9585, + "step": 5967 + }, + { + "epoch": 0.6276571969447987, + "grad_norm": 2.7111104148324543, + "learning_rate": 1.5525543468309035e-06, + "loss": 0.987, + "step": 5968 + }, + { + "epoch": 0.627762367386646, + "grad_norm": 2.744267573867375, + "learning_rate": 1.5517818530717965e-06, + "loss": 0.9686, + "step": 5969 + }, + { + "epoch": 0.6278675378284932, + "grad_norm": 2.830012535610845, + "learning_rate": 1.5510094650463797e-06, + "loss": 0.9701, + "step": 5970 + }, + { + "epoch": 0.6279727082703406, + "grad_norm": 3.230771720949463, + "learning_rate": 1.5502371828407797e-06, + "loss": 1.073, + "step": 5971 + }, + { + "epoch": 0.6280778787121879, + "grad_norm": 2.3409480978908896, + "learning_rate": 1.5494650065411116e-06, + "loss": 0.9638, + "step": 5972 + }, + { + "epoch": 0.6281830491540352, + "grad_norm": 2.444047229295055, + "learning_rate": 1.548692936233479e-06, + "loss": 0.9757, + "step": 5973 + }, + { + "epoch": 0.6282882195958825, + "grad_norm": 2.00504969984711, + "learning_rate": 1.5479209720039733e-06, + "loss": 0.9657, + "step": 5974 + }, + { + "epoch": 0.6283933900377299, + "grad_norm": 3.516884926223067, + "learning_rate": 1.5471491139386763e-06, + "loss": 0.9693, + "step": 5975 + }, + { + "epoch": 0.6284985604795772, + "grad_norm": 2.794487173825674, + "learning_rate": 1.546377362123655e-06, + "loss": 0.9452, + "step": 5976 + }, + { + "epoch": 0.6286037309214245, + "grad_norm": 2.1935018379858984, + "learning_rate": 1.5456057166449657e-06, + "loss": 0.9745, + "step": 5977 + }, + { + "epoch": 0.6287089013632718, + "grad_norm": 2.297663462287113, + "learning_rate": 1.5448341775886528e-06, + "loss": 0.9751, + "step": 5978 + }, + { + "epoch": 0.6288140718051192, + "grad_norm": 2.4320433680260085, + "learning_rate": 1.544062745040749e-06, + "loss": 0.9293, + "step": 5979 + }, + { + "epoch": 0.6289192422469665, + "grad_norm": 2.872577702758348, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.9477, + "step": 5980 + }, + { + "epoch": 0.6290244126888138, + "grad_norm": 2.6721948871558783, + "learning_rate": 1.542520199814241e-06, + "loss": 0.9874, + "step": 5981 + }, + { + "epoch": 0.6291295831306611, + "grad_norm": 2.5749173147193996, + "learning_rate": 1.541749087307642e-06, + "loss": 0.9875, + "step": 5982 + }, + { + "epoch": 0.6292347535725085, + "grad_norm": 3.239662190499185, + "learning_rate": 1.540978081653463e-06, + "loss": 0.9966, + "step": 5983 + }, + { + "epoch": 0.6293399240143558, + "grad_norm": 2.4691533579634926, + "learning_rate": 1.5402071829376785e-06, + "loss": 0.9739, + "step": 5984 + }, + { + "epoch": 0.6294450944562031, + "grad_norm": 2.59932511245003, + 
"learning_rate": 1.5394363912462492e-06, + "loss": 0.9414, + "step": 5985 + }, + { + "epoch": 0.6295502648980504, + "grad_norm": 3.223852238411686, + "learning_rate": 1.5386657066651242e-06, + "loss": 1.0248, + "step": 5986 + }, + { + "epoch": 0.6296554353398978, + "grad_norm": 2.6080630359778993, + "learning_rate": 1.5378951292802397e-06, + "loss": 0.9842, + "step": 5987 + }, + { + "epoch": 0.6297606057817451, + "grad_norm": 2.133772821565045, + "learning_rate": 1.537124659177523e-06, + "loss": 0.9879, + "step": 5988 + }, + { + "epoch": 0.6298657762235924, + "grad_norm": 2.3424669533242573, + "learning_rate": 1.5363542964428869e-06, + "loss": 0.981, + "step": 5989 + }, + { + "epoch": 0.6299709466654396, + "grad_norm": 2.805150094282492, + "learning_rate": 1.5355840411622324e-06, + "loss": 0.9469, + "step": 5990 + }, + { + "epoch": 0.630076117107287, + "grad_norm": 2.1906730624214235, + "learning_rate": 1.5348138934214493e-06, + "loss": 0.9742, + "step": 5991 + }, + { + "epoch": 0.6301812875491343, + "grad_norm": 2.1142812888162434, + "learning_rate": 1.5340438533064144e-06, + "loss": 0.9788, + "step": 5992 + }, + { + "epoch": 0.6302864579909816, + "grad_norm": 3.1567558738222607, + "learning_rate": 1.533273920902994e-06, + "loss": 0.9616, + "step": 5993 + }, + { + "epoch": 0.6303916284328289, + "grad_norm": 2.9030050091513866, + "learning_rate": 1.5325040962970417e-06, + "loss": 0.9866, + "step": 5994 + }, + { + "epoch": 0.6304967988746762, + "grad_norm": 2.7460052058389817, + "learning_rate": 1.5317343795743978e-06, + "loss": 0.9868, + "step": 5995 + }, + { + "epoch": 0.6306019693165236, + "grad_norm": 2.777830833328807, + "learning_rate": 1.5309647708208928e-06, + "loss": 1.0023, + "step": 5996 + }, + { + "epoch": 0.6307071397583709, + "grad_norm": 2.5120232310444104, + "learning_rate": 1.5301952701223422e-06, + "loss": 0.972, + "step": 5997 + }, + { + "epoch": 0.6308123102002182, + "grad_norm": 2.5363484810380212, + "learning_rate": 1.5294258775645545e-06, + "loss": 0.9653, + "step": 5998 + }, + { + "epoch": 0.6309174806420655, + "grad_norm": 2.3239370528340144, + "learning_rate": 1.5286565932333206e-06, + "loss": 0.9899, + "step": 5999 + }, + { + "epoch": 0.6310226510839129, + "grad_norm": 1.8488945840537439, + "learning_rate": 1.527887417214422e-06, + "loss": 0.9603, + "step": 6000 + }, + { + "epoch": 0.6311278215257602, + "grad_norm": 3.48906837044783, + "learning_rate": 1.5271183495936273e-06, + "loss": 1.0177, + "step": 6001 + }, + { + "epoch": 0.6312329919676075, + "grad_norm": 2.163852345437768, + "learning_rate": 1.526349390456695e-06, + "loss": 0.9964, + "step": 6002 + }, + { + "epoch": 0.6313381624094548, + "grad_norm": 2.5997209044493865, + "learning_rate": 1.5255805398893694e-06, + "loss": 1.0004, + "step": 6003 + }, + { + "epoch": 0.6314433328513022, + "grad_norm": 2.375034238569322, + "learning_rate": 1.524811797977383e-06, + "loss": 0.9733, + "step": 6004 + }, + { + "epoch": 0.6315485032931495, + "grad_norm": 2.496467895172032, + "learning_rate": 1.524043164806457e-06, + "loss": 1.0152, + "step": 6005 + }, + { + "epoch": 0.6316536737349968, + "grad_norm": 1.9885390550524185, + "learning_rate": 1.523274640462299e-06, + "loss": 0.9841, + "step": 6006 + }, + { + "epoch": 0.6317588441768441, + "grad_norm": 3.025548010215934, + "learning_rate": 1.5225062250306061e-06, + "loss": 1.0333, + "step": 6007 + }, + { + "epoch": 0.6318640146186915, + "grad_norm": 2.3886268665683668, + "learning_rate": 1.521737918597063e-06, + "loss": 0.9801, + "step": 6008 + }, + { + "epoch": 
0.6319691850605388, + "grad_norm": 2.787511865407236, + "learning_rate": 1.5209697212473411e-06, + "loss": 0.9432, + "step": 6009 + }, + { + "epoch": 0.6320743555023861, + "grad_norm": 1.9376239960228194, + "learning_rate": 1.5202016330670998e-06, + "loss": 0.9643, + "step": 6010 + }, + { + "epoch": 0.6321795259442333, + "grad_norm": 2.5039360525755274, + "learning_rate": 1.5194336541419889e-06, + "loss": 1.0006, + "step": 6011 + }, + { + "epoch": 0.6322846963860806, + "grad_norm": 2.2219077385221486, + "learning_rate": 1.5186657845576428e-06, + "loss": 0.9957, + "step": 6012 + }, + { + "epoch": 0.632389866827928, + "grad_norm": 2.1820409541460544, + "learning_rate": 1.517898024399685e-06, + "loss": 1.001, + "step": 6013 + }, + { + "epoch": 0.6324950372697753, + "grad_norm": 3.0044306866211405, + "learning_rate": 1.5171303737537268e-06, + "loss": 1.0513, + "step": 6014 + }, + { + "epoch": 0.6326002077116226, + "grad_norm": 2.353451046725791, + "learning_rate": 1.5163628327053661e-06, + "loss": 0.9949, + "step": 6015 + }, + { + "epoch": 0.6327053781534699, + "grad_norm": 1.7590698297866734, + "learning_rate": 1.5155954013401916e-06, + "loss": 1.0115, + "step": 6016 + }, + { + "epoch": 0.6328105485953173, + "grad_norm": 2.903364189025722, + "learning_rate": 1.514828079743777e-06, + "loss": 1.0058, + "step": 6017 + }, + { + "epoch": 0.6329157190371646, + "grad_norm": 2.331986775898218, + "learning_rate": 1.5140608680016843e-06, + "loss": 0.9509, + "step": 6018 + }, + { + "epoch": 0.6330208894790119, + "grad_norm": 2.2153310954039225, + "learning_rate": 1.5132937661994642e-06, + "loss": 0.9977, + "step": 6019 + }, + { + "epoch": 0.6331260599208592, + "grad_norm": 2.5817333311290285, + "learning_rate": 1.5125267744226538e-06, + "loss": 1.0117, + "step": 6020 + }, + { + "epoch": 0.6332312303627066, + "grad_norm": 2.1972799748746903, + "learning_rate": 1.5117598927567791e-06, + "loss": 0.9816, + "step": 6021 + }, + { + "epoch": 0.6333364008045539, + "grad_norm": 2.501611715704273, + "learning_rate": 1.5109931212873535e-06, + "loss": 0.9624, + "step": 6022 + }, + { + "epoch": 0.6334415712464012, + "grad_norm": 3.048683807069792, + "learning_rate": 1.510226460099878e-06, + "loss": 1.0291, + "step": 6023 + }, + { + "epoch": 0.6335467416882485, + "grad_norm": 1.8577271918034934, + "learning_rate": 1.5094599092798396e-06, + "loss": 0.9834, + "step": 6024 + }, + { + "epoch": 0.6336519121300959, + "grad_norm": 2.360244200259059, + "learning_rate": 1.5086934689127173e-06, + "loss": 1.0152, + "step": 6025 + }, + { + "epoch": 0.6337570825719432, + "grad_norm": 3.3673054887669793, + "learning_rate": 1.5079271390839739e-06, + "loss": 1.0165, + "step": 6026 + }, + { + "epoch": 0.6338622530137905, + "grad_norm": 2.0209824979538773, + "learning_rate": 1.507160919879061e-06, + "loss": 1.0005, + "step": 6027 + }, + { + "epoch": 0.6339674234556378, + "grad_norm": 2.4114885968260156, + "learning_rate": 1.5063948113834184e-06, + "loss": 1.0082, + "step": 6028 + }, + { + "epoch": 0.6340725938974852, + "grad_norm": 2.185300297703002, + "learning_rate": 1.5056288136824725e-06, + "loss": 0.9766, + "step": 6029 + }, + { + "epoch": 0.6341777643393325, + "grad_norm": 2.0177323129193434, + "learning_rate": 1.5048629268616387e-06, + "loss": 0.9728, + "step": 6030 + }, + { + "epoch": 0.6342829347811797, + "grad_norm": 2.653274446710612, + "learning_rate": 1.5040971510063194e-06, + "loss": 0.9947, + "step": 6031 + }, + { + "epoch": 0.634388105223027, + "grad_norm": 2.579001884562927, + "learning_rate": 1.5033314862019038e-06, 
+ "loss": 1.0079, + "step": 6032 + }, + { + "epoch": 0.6344932756648743, + "grad_norm": 2.5583814952866994, + "learning_rate": 1.5025659325337691e-06, + "loss": 1.0113, + "step": 6033 + }, + { + "epoch": 0.6345984461067217, + "grad_norm": 2.626887953110027, + "learning_rate": 1.5018004900872826e-06, + "loss": 1.0277, + "step": 6034 + }, + { + "epoch": 0.634703616548569, + "grad_norm": 2.1091067755253032, + "learning_rate": 1.5010351589477955e-06, + "loss": 0.9559, + "step": 6035 + }, + { + "epoch": 0.6348087869904163, + "grad_norm": 1.8200915464301803, + "learning_rate": 1.500269939200648e-06, + "loss": 0.9921, + "step": 6036 + }, + { + "epoch": 0.6349139574322636, + "grad_norm": 3.206419511627608, + "learning_rate": 1.4995048309311689e-06, + "loss": 1.0505, + "step": 6037 + }, + { + "epoch": 0.635019127874111, + "grad_norm": 3.2074930349618525, + "learning_rate": 1.4987398342246723e-06, + "loss": 0.9945, + "step": 6038 + }, + { + "epoch": 0.6351242983159583, + "grad_norm": 2.533006871994517, + "learning_rate": 1.4979749491664631e-06, + "loss": 0.9701, + "step": 6039 + }, + { + "epoch": 0.6352294687578056, + "grad_norm": 2.7185236711474596, + "learning_rate": 1.4972101758418307e-06, + "loss": 1.0024, + "step": 6040 + }, + { + "epoch": 0.6353346391996529, + "grad_norm": 2.946674199556197, + "learning_rate": 1.496445514336054e-06, + "loss": 0.9976, + "step": 6041 + }, + { + "epoch": 0.6354398096415003, + "grad_norm": 1.7428228667592487, + "learning_rate": 1.4956809647343984e-06, + "loss": 1.0117, + "step": 6042 + }, + { + "epoch": 0.6355449800833476, + "grad_norm": 2.3626314845775838, + "learning_rate": 1.494916527122116e-06, + "loss": 1.0219, + "step": 6043 + }, + { + "epoch": 0.6356501505251949, + "grad_norm": 2.6996995243978272, + "learning_rate": 1.4941522015844497e-06, + "loss": 0.9868, + "step": 6044 + }, + { + "epoch": 0.6357553209670422, + "grad_norm": 2.428347148699208, + "learning_rate": 1.4933879882066265e-06, + "loss": 0.9514, + "step": 6045 + }, + { + "epoch": 0.6358604914088896, + "grad_norm": 1.4275506316675006, + "learning_rate": 1.4926238870738624e-06, + "loss": 0.965, + "step": 6046 + }, + { + "epoch": 0.6359656618507369, + "grad_norm": 2.520049309698355, + "learning_rate": 1.4918598982713591e-06, + "loss": 1.0004, + "step": 6047 + }, + { + "epoch": 0.6360708322925842, + "grad_norm": 2.0394100491968654, + "learning_rate": 1.4910960218843099e-06, + "loss": 0.981, + "step": 6048 + }, + { + "epoch": 0.6361760027344315, + "grad_norm": 2.761031291865611, + "learning_rate": 1.4903322579978916e-06, + "loss": 0.9439, + "step": 6049 + }, + { + "epoch": 0.6362811731762789, + "grad_norm": 1.7561053210029363, + "learning_rate": 1.4895686066972703e-06, + "loss": 0.9956, + "step": 6050 + }, + { + "epoch": 0.6363863436181261, + "grad_norm": 1.662203724317949, + "learning_rate": 1.4888050680675983e-06, + "loss": 0.9915, + "step": 6051 + }, + { + "epoch": 0.6364915140599734, + "grad_norm": 2.6919552015061026, + "learning_rate": 1.4880416421940155e-06, + "loss": 1.0345, + "step": 6052 + }, + { + "epoch": 0.6365966845018207, + "grad_norm": 2.0994310916738197, + "learning_rate": 1.4872783291616516e-06, + "loss": 0.9567, + "step": 6053 + }, + { + "epoch": 0.636701854943668, + "grad_norm": 2.8279048616917497, + "learning_rate": 1.4865151290556213e-06, + "loss": 1.0237, + "step": 6054 + }, + { + "epoch": 0.6368070253855154, + "grad_norm": 1.997694763719727, + "learning_rate": 1.4857520419610269e-06, + "loss": 1.0362, + "step": 6055 + }, + { + "epoch": 0.6369121958273627, + "grad_norm": 
2.8035596983277227, + "learning_rate": 1.4849890679629585e-06, + "loss": 1.0114, + "step": 6056 + }, + { + "epoch": 0.63701736626921, + "grad_norm": 3.067338117412502, + "learning_rate": 1.4842262071464941e-06, + "loss": 0.9986, + "step": 6057 + }, + { + "epoch": 0.6371225367110573, + "grad_norm": 2.258041094300344, + "learning_rate": 1.4834634595966987e-06, + "loss": 0.9978, + "step": 6058 + }, + { + "epoch": 0.6372277071529047, + "grad_norm": 1.7649409916767491, + "learning_rate": 1.4827008253986242e-06, + "loss": 0.9676, + "step": 6059 + }, + { + "epoch": 0.637332877594752, + "grad_norm": 2.156642745687781, + "learning_rate": 1.4819383046373103e-06, + "loss": 1.0234, + "step": 6060 + }, + { + "epoch": 0.6374380480365993, + "grad_norm": 1.9425720258392243, + "learning_rate": 1.481175897397783e-06, + "loss": 0.9994, + "step": 6061 + }, + { + "epoch": 0.6375432184784466, + "grad_norm": 1.747020487159967, + "learning_rate": 1.4804136037650587e-06, + "loss": 0.9864, + "step": 6062 + }, + { + "epoch": 0.637648388920294, + "grad_norm": 2.71636510704543, + "learning_rate": 1.4796514238241384e-06, + "loss": 1.0384, + "step": 6063 + }, + { + "epoch": 0.6377535593621413, + "grad_norm": 2.528649446654802, + "learning_rate": 1.47888935766001e-06, + "loss": 0.9637, + "step": 6064 + }, + { + "epoch": 0.6378587298039886, + "grad_norm": 2.2556064050120743, + "learning_rate": 1.4781274053576502e-06, + "loss": 1.008, + "step": 6065 + }, + { + "epoch": 0.6379639002458359, + "grad_norm": 1.750081160005925, + "learning_rate": 1.4773655670020235e-06, + "loss": 0.9957, + "step": 6066 + }, + { + "epoch": 0.6380690706876833, + "grad_norm": 2.250883027340165, + "learning_rate": 1.47660384267808e-06, + "loss": 0.9868, + "step": 6067 + }, + { + "epoch": 0.6381742411295306, + "grad_norm": 2.060415813689357, + "learning_rate": 1.4758422324707583e-06, + "loss": 0.9704, + "step": 6068 + }, + { + "epoch": 0.6382794115713779, + "grad_norm": 1.9461628426655384, + "learning_rate": 1.4750807364649833e-06, + "loss": 0.9545, + "step": 6069 + }, + { + "epoch": 0.6383845820132252, + "grad_norm": 2.1407118008400485, + "learning_rate": 1.474319354745668e-06, + "loss": 0.9842, + "step": 6070 + }, + { + "epoch": 0.6384897524550726, + "grad_norm": 2.469491976655806, + "learning_rate": 1.4735580873977125e-06, + "loss": 0.9548, + "step": 6071 + }, + { + "epoch": 0.6385949228969198, + "grad_norm": 2.115135764982275, + "learning_rate": 1.4727969345060041e-06, + "loss": 1.0031, + "step": 6072 + }, + { + "epoch": 0.6387000933387671, + "grad_norm": 3.4636565981184195, + "learning_rate": 1.472035896155417e-06, + "loss": 1.0143, + "step": 6073 + }, + { + "epoch": 0.6388052637806144, + "grad_norm": 2.4623855590153383, + "learning_rate": 1.4712749724308135e-06, + "loss": 1.0005, + "step": 6074 + }, + { + "epoch": 0.6389104342224617, + "grad_norm": 3.4935329001539603, + "learning_rate": 1.4705141634170402e-06, + "loss": 0.9797, + "step": 6075 + }, + { + "epoch": 0.6390156046643091, + "grad_norm": 2.981348278086113, + "learning_rate": 1.4697534691989362e-06, + "loss": 0.9747, + "step": 6076 + }, + { + "epoch": 0.6391207751061564, + "grad_norm": 2.3924446707785187, + "learning_rate": 1.468992889861324e-06, + "loss": 0.9882, + "step": 6077 + }, + { + "epoch": 0.6392259455480037, + "grad_norm": 2.23110800034559, + "learning_rate": 1.4682324254890135e-06, + "loss": 1.009, + "step": 6078 + }, + { + "epoch": 0.639331115989851, + "grad_norm": 2.468541532847342, + "learning_rate": 1.467472076166802e-06, + "loss": 0.9841, + "step": 6079 + }, + { + 
"epoch": 0.6394362864316984, + "grad_norm": 2.1517955193808342, + "learning_rate": 1.4667118419794756e-06, + "loss": 0.9967, + "step": 6080 + }, + { + "epoch": 0.6395414568735457, + "grad_norm": 2.5661813956881527, + "learning_rate": 1.4659517230118059e-06, + "loss": 0.9628, + "step": 6081 + }, + { + "epoch": 0.639646627315393, + "grad_norm": 2.181840326831563, + "learning_rate": 1.4651917193485516e-06, + "loss": 0.9237, + "step": 6082 + }, + { + "epoch": 0.6397517977572403, + "grad_norm": 2.1097264744955604, + "learning_rate": 1.4644318310744593e-06, + "loss": 1.0078, + "step": 6083 + }, + { + "epoch": 0.6398569681990877, + "grad_norm": 3.493695271673113, + "learning_rate": 1.4636720582742614e-06, + "loss": 0.9912, + "step": 6084 + }, + { + "epoch": 0.639962138640935, + "grad_norm": 1.9541985454696953, + "learning_rate": 1.4629124010326808e-06, + "loss": 0.9942, + "step": 6085 + }, + { + "epoch": 0.6400673090827823, + "grad_norm": 1.814035910685774, + "learning_rate": 1.4621528594344237e-06, + "loss": 1.0028, + "step": 6086 + }, + { + "epoch": 0.6401724795246296, + "grad_norm": 2.696801122775463, + "learning_rate": 1.461393433564185e-06, + "loss": 0.9912, + "step": 6087 + }, + { + "epoch": 0.640277649966477, + "grad_norm": 1.8963092165199675, + "learning_rate": 1.4606341235066452e-06, + "loss": 1.0165, + "step": 6088 + }, + { + "epoch": 0.6403828204083243, + "grad_norm": 3.669943765578645, + "learning_rate": 1.4598749293464763e-06, + "loss": 1.0253, + "step": 6089 + }, + { + "epoch": 0.6404879908501716, + "grad_norm": 2.8563817058680616, + "learning_rate": 1.459115851168333e-06, + "loss": 0.9864, + "step": 6090 + }, + { + "epoch": 0.6405931612920189, + "grad_norm": 3.028542769912519, + "learning_rate": 1.458356889056857e-06, + "loss": 0.9806, + "step": 6091 + }, + { + "epoch": 0.6406983317338661, + "grad_norm": 2.5809797491743653, + "learning_rate": 1.4575980430966808e-06, + "loss": 1.0162, + "step": 6092 + }, + { + "epoch": 0.6408035021757135, + "grad_norm": 2.040383893422859, + "learning_rate": 1.4568393133724185e-06, + "loss": 0.9846, + "step": 6093 + }, + { + "epoch": 0.6409086726175608, + "grad_norm": 2.4533017761010236, + "learning_rate": 1.4560806999686782e-06, + "loss": 1.0269, + "step": 6094 + }, + { + "epoch": 0.6410138430594081, + "grad_norm": 3.402665196483549, + "learning_rate": 1.4553222029700483e-06, + "loss": 1.0354, + "step": 6095 + }, + { + "epoch": 0.6411190135012554, + "grad_norm": 1.8564212218833198, + "learning_rate": 1.4545638224611091e-06, + "loss": 0.969, + "step": 6096 + }, + { + "epoch": 0.6412241839431028, + "grad_norm": 1.8685811138285715, + "learning_rate": 1.453805558526424e-06, + "loss": 0.9675, + "step": 6097 + }, + { + "epoch": 0.6413293543849501, + "grad_norm": 2.0516034951437496, + "learning_rate": 1.4530474112505462e-06, + "loss": 0.9577, + "step": 6098 + }, + { + "epoch": 0.6414345248267974, + "grad_norm": 2.632085890015541, + "learning_rate": 1.4522893807180163e-06, + "loss": 1.0097, + "step": 6099 + }, + { + "epoch": 0.6415396952686447, + "grad_norm": 2.3357227858145326, + "learning_rate": 1.4515314670133582e-06, + "loss": 0.99, + "step": 6100 + }, + { + "epoch": 0.6416448657104921, + "grad_norm": 2.031648424124929, + "learning_rate": 1.4507736702210872e-06, + "loss": 0.9714, + "step": 6101 + }, + { + "epoch": 0.6417500361523394, + "grad_norm": 2.9746248980463488, + "learning_rate": 1.4500159904257008e-06, + "loss": 0.995, + "step": 6102 + }, + { + "epoch": 0.6418552065941867, + "grad_norm": 2.4550674213700825, + "learning_rate": 
1.4492584277116901e-06, + "loss": 1.0096, + "step": 6103 + }, + { + "epoch": 0.641960377036034, + "grad_norm": 2.5455704526475222, + "learning_rate": 1.4485009821635269e-06, + "loss": 0.9562, + "step": 6104 + }, + { + "epoch": 0.6420655474778814, + "grad_norm": 2.233105772105395, + "learning_rate": 1.4477436538656715e-06, + "loss": 0.9256, + "step": 6105 + }, + { + "epoch": 0.6421707179197287, + "grad_norm": 2.738769620298446, + "learning_rate": 1.446986442902574e-06, + "loss": 1.0144, + "step": 6106 + }, + { + "epoch": 0.642275888361576, + "grad_norm": 1.7815706347021845, + "learning_rate": 1.4462293493586662e-06, + "loss": 0.9717, + "step": 6107 + }, + { + "epoch": 0.6423810588034233, + "grad_norm": 2.0497666168497335, + "learning_rate": 1.445472373318374e-06, + "loss": 0.9561, + "step": 6108 + }, + { + "epoch": 0.6424862292452707, + "grad_norm": 2.6274997137270244, + "learning_rate": 1.444715514866103e-06, + "loss": 1.0196, + "step": 6109 + }, + { + "epoch": 0.642591399687118, + "grad_norm": 3.440273225742282, + "learning_rate": 1.443958774086251e-06, + "loss": 1.0102, + "step": 6110 + }, + { + "epoch": 0.6426965701289653, + "grad_norm": 2.483691809703666, + "learning_rate": 1.443202151063198e-06, + "loss": 1.0265, + "step": 6111 + }, + { + "epoch": 0.6428017405708125, + "grad_norm": 2.447339258191458, + "learning_rate": 1.4424456458813147e-06, + "loss": 1.0198, + "step": 6112 + }, + { + "epoch": 0.6429069110126598, + "grad_norm": 2.47035563695523, + "learning_rate": 1.4416892586249586e-06, + "loss": 1.0044, + "step": 6113 + }, + { + "epoch": 0.6430120814545072, + "grad_norm": 2.1915716837499333, + "learning_rate": 1.4409329893784702e-06, + "loss": 0.9784, + "step": 6114 + }, + { + "epoch": 0.6431172518963545, + "grad_norm": 2.2232422956545825, + "learning_rate": 1.4401768382261813e-06, + "loss": 1.0041, + "step": 6115 + }, + { + "epoch": 0.6432224223382018, + "grad_norm": 2.022035643282418, + "learning_rate": 1.4394208052524062e-06, + "loss": 1.0323, + "step": 6116 + }, + { + "epoch": 0.6433275927800491, + "grad_norm": 2.1999594819259496, + "learning_rate": 1.4386648905414525e-06, + "loss": 0.9813, + "step": 6117 + }, + { + "epoch": 0.6434327632218965, + "grad_norm": 2.4472550369492936, + "learning_rate": 1.4379090941776067e-06, + "loss": 0.9525, + "step": 6118 + }, + { + "epoch": 0.6435379336637438, + "grad_norm": 2.4214125950190817, + "learning_rate": 1.4371534162451487e-06, + "loss": 0.9848, + "step": 6119 + }, + { + "epoch": 0.6436431041055911, + "grad_norm": 2.6024268782469013, + "learning_rate": 1.4363978568283412e-06, + "loss": 1.0264, + "step": 6120 + }, + { + "epoch": 0.6437482745474384, + "grad_norm": 2.842563298958492, + "learning_rate": 1.4356424160114332e-06, + "loss": 0.9535, + "step": 6121 + }, + { + "epoch": 0.6438534449892858, + "grad_norm": 2.086520974335246, + "learning_rate": 1.4348870938786657e-06, + "loss": 0.9933, + "step": 6122 + }, + { + "epoch": 0.6439586154311331, + "grad_norm": 2.6018674216608306, + "learning_rate": 1.434131890514261e-06, + "loss": 0.9847, + "step": 6123 + }, + { + "epoch": 0.6440637858729804, + "grad_norm": 1.7399725974174627, + "learning_rate": 1.4333768060024308e-06, + "loss": 0.9887, + "step": 6124 + }, + { + "epoch": 0.6441689563148277, + "grad_norm": 2.33460686855743, + "learning_rate": 1.4326218404273718e-06, + "loss": 1.0107, + "step": 6125 + }, + { + "epoch": 0.6442741267566751, + "grad_norm": 2.304286713750231, + "learning_rate": 1.4318669938732694e-06, + "loss": 0.9888, + "step": 6126 + }, + { + "epoch": 0.6443792971985224, + 
"grad_norm": 2.617604600587451, + "learning_rate": 1.4311122664242955e-06, + "loss": 0.9804, + "step": 6127 + }, + { + "epoch": 0.6444844676403697, + "grad_norm": 2.884591813362857, + "learning_rate": 1.430357658164606e-06, + "loss": 0.9886, + "step": 6128 + }, + { + "epoch": 0.644589638082217, + "grad_norm": 2.437029381096377, + "learning_rate": 1.4296031691783485e-06, + "loss": 1.0295, + "step": 6129 + }, + { + "epoch": 0.6446948085240644, + "grad_norm": 2.9913792325943303, + "learning_rate": 1.4288487995496508e-06, + "loss": 0.9888, + "step": 6130 + }, + { + "epoch": 0.6447999789659117, + "grad_norm": 2.4797754234444715, + "learning_rate": 1.4280945493626347e-06, + "loss": 0.9409, + "step": 6131 + }, + { + "epoch": 0.644905149407759, + "grad_norm": 3.0033762892288545, + "learning_rate": 1.427340418701402e-06, + "loss": 1.0166, + "step": 6132 + }, + { + "epoch": 0.6450103198496062, + "grad_norm": 2.37731076377192, + "learning_rate": 1.4265864076500465e-06, + "loss": 0.963, + "step": 6133 + }, + { + "epoch": 0.6451154902914535, + "grad_norm": 2.6651860260554954, + "learning_rate": 1.4258325162926441e-06, + "loss": 1.0274, + "step": 6134 + }, + { + "epoch": 0.6452206607333009, + "grad_norm": 2.9869829981562384, + "learning_rate": 1.4250787447132607e-06, + "loss": 1.0177, + "step": 6135 + }, + { + "epoch": 0.6453258311751482, + "grad_norm": 2.170891852808566, + "learning_rate": 1.4243250929959484e-06, + "loss": 0.982, + "step": 6136 + }, + { + "epoch": 0.6454310016169955, + "grad_norm": 2.170159054445325, + "learning_rate": 1.4235715612247435e-06, + "loss": 1.0116, + "step": 6137 + }, + { + "epoch": 0.6455361720588428, + "grad_norm": 2.4494534962313868, + "learning_rate": 1.4228181494836724e-06, + "loss": 0.959, + "step": 6138 + }, + { + "epoch": 0.6456413425006902, + "grad_norm": 1.7490013283349262, + "learning_rate": 1.4220648578567444e-06, + "loss": 1.0183, + "step": 6139 + }, + { + "epoch": 0.6457465129425375, + "grad_norm": 2.8377357573800515, + "learning_rate": 1.4213116864279586e-06, + "loss": 0.977, + "step": 6140 + }, + { + "epoch": 0.6458516833843848, + "grad_norm": 1.933344788178783, + "learning_rate": 1.4205586352813e-06, + "loss": 0.9859, + "step": 6141 + }, + { + "epoch": 0.6459568538262321, + "grad_norm": 2.280844764369398, + "learning_rate": 1.4198057045007384e-06, + "loss": 1.0191, + "step": 6142 + }, + { + "epoch": 0.6460620242680795, + "grad_norm": 2.2048132233498654, + "learning_rate": 1.4190528941702328e-06, + "loss": 1.0413, + "step": 6143 + }, + { + "epoch": 0.6461671947099268, + "grad_norm": 2.4477864525484363, + "learning_rate": 1.4183002043737246e-06, + "loss": 1.0105, + "step": 6144 + }, + { + "epoch": 0.6462723651517741, + "grad_norm": 2.3566748243820625, + "learning_rate": 1.4175476351951484e-06, + "loss": 0.9748, + "step": 6145 + }, + { + "epoch": 0.6463775355936214, + "grad_norm": 2.636120161039787, + "learning_rate": 1.4167951867184187e-06, + "loss": 0.984, + "step": 6146 + }, + { + "epoch": 0.6464827060354688, + "grad_norm": 2.367508226857458, + "learning_rate": 1.4160428590274416e-06, + "loss": 0.9811, + "step": 6147 + }, + { + "epoch": 0.6465878764773161, + "grad_norm": 2.2275791479433167, + "learning_rate": 1.415290652206105e-06, + "loss": 0.9852, + "step": 6148 + }, + { + "epoch": 0.6466930469191634, + "grad_norm": 2.9242032394552138, + "learning_rate": 1.414538566338287e-06, + "loss": 1.0, + "step": 6149 + }, + { + "epoch": 0.6467982173610107, + "grad_norm": 2.200563913281183, + "learning_rate": 1.4137866015078523e-06, + "loss": 0.9766, + "step": 6150 + 
}, + { + "epoch": 0.646903387802858, + "grad_norm": 2.136169934182686, + "learning_rate": 1.4130347577986481e-06, + "loss": 0.9891, + "step": 6151 + }, + { + "epoch": 0.6470085582447054, + "grad_norm": 2.4522731115485916, + "learning_rate": 1.4122830352945133e-06, + "loss": 0.9605, + "step": 6152 + }, + { + "epoch": 0.6471137286865526, + "grad_norm": 1.8397790333899535, + "learning_rate": 1.411531434079268e-06, + "loss": 1.0013, + "step": 6153 + }, + { + "epoch": 0.6472188991283999, + "grad_norm": 2.2803600413168237, + "learning_rate": 1.410779954236725e-06, + "loss": 0.9917, + "step": 6154 + }, + { + "epoch": 0.6473240695702472, + "grad_norm": 2.8291558323999912, + "learning_rate": 1.4100285958506785e-06, + "loss": 1.0154, + "step": 6155 + }, + { + "epoch": 0.6474292400120946, + "grad_norm": 2.737018049431013, + "learning_rate": 1.4092773590049098e-06, + "loss": 0.9962, + "step": 6156 + }, + { + "epoch": 0.6475344104539419, + "grad_norm": 2.5922522490787583, + "learning_rate": 1.4085262437831886e-06, + "loss": 0.9314, + "step": 6157 + }, + { + "epoch": 0.6476395808957892, + "grad_norm": 2.1557844949857463, + "learning_rate": 1.4077752502692704e-06, + "loss": 0.9797, + "step": 6158 + }, + { + "epoch": 0.6477447513376365, + "grad_norm": 2.208795077852058, + "learning_rate": 1.4070243785468974e-06, + "loss": 0.9637, + "step": 6159 + }, + { + "epoch": 0.6478499217794839, + "grad_norm": 2.7684955688133965, + "learning_rate": 1.4062736286997952e-06, + "loss": 0.9932, + "step": 6160 + }, + { + "epoch": 0.6479550922213312, + "grad_norm": 2.7322932255125925, + "learning_rate": 1.4055230008116813e-06, + "loss": 0.9904, + "step": 6161 + }, + { + "epoch": 0.6480602626631785, + "grad_norm": 2.64744883621316, + "learning_rate": 1.404772494966254e-06, + "loss": 1.0088, + "step": 6162 + }, + { + "epoch": 0.6481654331050258, + "grad_norm": 2.5697944667740513, + "learning_rate": 1.4040221112472014e-06, + "loss": 0.9693, + "step": 6163 + }, + { + "epoch": 0.6482706035468732, + "grad_norm": 2.108197664933067, + "learning_rate": 1.4032718497381981e-06, + "loss": 1.032, + "step": 6164 + }, + { + "epoch": 0.6483757739887205, + "grad_norm": 3.875251629143974, + "learning_rate": 1.4025217105229021e-06, + "loss": 1.0005, + "step": 6165 + }, + { + "epoch": 0.6484809444305678, + "grad_norm": 2.3973712746700664, + "learning_rate": 1.4017716936849623e-06, + "loss": 0.9738, + "step": 6166 + }, + { + "epoch": 0.6485861148724151, + "grad_norm": 2.2650362281817866, + "learning_rate": 1.4010217993080076e-06, + "loss": 0.9662, + "step": 6167 + }, + { + "epoch": 0.6486912853142625, + "grad_norm": 2.1269159039754277, + "learning_rate": 1.400272027475662e-06, + "loss": 1.0022, + "step": 6168 + }, + { + "epoch": 0.6487964557561098, + "grad_norm": 2.177929012659701, + "learning_rate": 1.399522378271527e-06, + "loss": 0.9988, + "step": 6169 + }, + { + "epoch": 0.6489016261979571, + "grad_norm": 2.684842639627906, + "learning_rate": 1.3987728517791966e-06, + "loss": 0.9721, + "step": 6170 + }, + { + "epoch": 0.6490067966398044, + "grad_norm": 2.838066575594318, + "learning_rate": 1.3980234480822468e-06, + "loss": 0.9978, + "step": 6171 + }, + { + "epoch": 0.6491119670816518, + "grad_norm": 2.6540051552459465, + "learning_rate": 1.397274167264243e-06, + "loss": 0.9959, + "step": 6172 + }, + { + "epoch": 0.649217137523499, + "grad_norm": 2.4639663876901814, + "learning_rate": 1.3965250094087373e-06, + "loss": 0.9836, + "step": 6173 + }, + { + "epoch": 0.6493223079653463, + "grad_norm": 2.447427698722012, + "learning_rate": 
1.3957759745992637e-06, + "loss": 0.9602, + "step": 6174 + }, + { + "epoch": 0.6494274784071936, + "grad_norm": 2.2679043767007983, + "learning_rate": 1.395027062919348e-06, + "loss": 1.0334, + "step": 6175 + }, + { + "epoch": 0.6495326488490409, + "grad_norm": 3.5789206773428317, + "learning_rate": 1.3942782744524974e-06, + "loss": 1.0175, + "step": 6176 + }, + { + "epoch": 0.6496378192908883, + "grad_norm": 2.729641701499508, + "learning_rate": 1.3935296092822087e-06, + "loss": 1.0013, + "step": 6177 + }, + { + "epoch": 0.6497429897327356, + "grad_norm": 2.3392616870926886, + "learning_rate": 1.392781067491965e-06, + "loss": 0.9527, + "step": 6178 + }, + { + "epoch": 0.6498481601745829, + "grad_norm": 2.969259025385436, + "learning_rate": 1.3920326491652325e-06, + "loss": 0.9911, + "step": 6179 + }, + { + "epoch": 0.6499533306164302, + "grad_norm": 2.055601110395823, + "learning_rate": 1.3912843543854664e-06, + "loss": 0.9653, + "step": 6180 + }, + { + "epoch": 0.6500585010582776, + "grad_norm": 2.808238408960534, + "learning_rate": 1.3905361832361078e-06, + "loss": 0.9675, + "step": 6181 + }, + { + "epoch": 0.6501636715001249, + "grad_norm": 2.2954725127353854, + "learning_rate": 1.3897881358005843e-06, + "loss": 0.9861, + "step": 6182 + }, + { + "epoch": 0.6502688419419722, + "grad_norm": 2.4627639209709113, + "learning_rate": 1.389040212162307e-06, + "loss": 0.9841, + "step": 6183 + }, + { + "epoch": 0.6503740123838195, + "grad_norm": 3.075750096193752, + "learning_rate": 1.3882924124046775e-06, + "loss": 1.0039, + "step": 6184 + }, + { + "epoch": 0.6504791828256669, + "grad_norm": 2.715707058368657, + "learning_rate": 1.387544736611079e-06, + "loss": 0.9638, + "step": 6185 + }, + { + "epoch": 0.6505843532675142, + "grad_norm": 1.977183746259915, + "learning_rate": 1.3867971848648843e-06, + "loss": 0.9912, + "step": 6186 + }, + { + "epoch": 0.6506895237093615, + "grad_norm": 2.7109584352412353, + "learning_rate": 1.386049757249452e-06, + "loss": 1.016, + "step": 6187 + }, + { + "epoch": 0.6507946941512088, + "grad_norm": 2.56279869947222, + "learning_rate": 1.3853024538481241e-06, + "loss": 0.9732, + "step": 6188 + }, + { + "epoch": 0.6508998645930562, + "grad_norm": 2.1123247416434574, + "learning_rate": 1.384555274744233e-06, + "loss": 1.0225, + "step": 6189 + }, + { + "epoch": 0.6510050350349035, + "grad_norm": 3.0471903006110517, + "learning_rate": 1.3838082200210932e-06, + "loss": 0.9687, + "step": 6190 + }, + { + "epoch": 0.6511102054767508, + "grad_norm": 2.272404748311318, + "learning_rate": 1.3830612897620072e-06, + "loss": 1.004, + "step": 6191 + }, + { + "epoch": 0.6512153759185981, + "grad_norm": 2.2705689260324875, + "learning_rate": 1.3823144840502656e-06, + "loss": 0.9655, + "step": 6192 + }, + { + "epoch": 0.6513205463604455, + "grad_norm": 2.549896761841074, + "learning_rate": 1.3815678029691399e-06, + "loss": 0.951, + "step": 6193 + }, + { + "epoch": 0.6514257168022927, + "grad_norm": 2.7098082789675817, + "learning_rate": 1.3808212466018927e-06, + "loss": 0.9638, + "step": 6194 + }, + { + "epoch": 0.65153088724414, + "grad_norm": 2.473211401630005, + "learning_rate": 1.3800748150317709e-06, + "loss": 0.9972, + "step": 6195 + }, + { + "epoch": 0.6516360576859873, + "grad_norm": 2.3649286529172047, + "learning_rate": 1.3793285083420077e-06, + "loss": 0.991, + "step": 6196 + }, + { + "epoch": 0.6517412281278346, + "grad_norm": 2.074820659875855, + "learning_rate": 1.378582326615821e-06, + "loss": 0.9987, + "step": 6197 + }, + { + "epoch": 0.651846398569682, + 
"grad_norm": 2.2908355538023453, + "learning_rate": 1.3778362699364167e-06, + "loss": 0.8914, + "step": 6198 + }, + { + "epoch": 0.6519515690115293, + "grad_norm": 2.495491364615591, + "learning_rate": 1.377090338386985e-06, + "loss": 0.9856, + "step": 6199 + }, + { + "epoch": 0.6520567394533766, + "grad_norm": 1.941710625750233, + "learning_rate": 1.3763445320507034e-06, + "loss": 1.0263, + "step": 6200 + }, + { + "epoch": 0.6521619098952239, + "grad_norm": 2.2537782874692978, + "learning_rate": 1.3755988510107365e-06, + "loss": 0.9837, + "step": 6201 + }, + { + "epoch": 0.6522670803370713, + "grad_norm": 2.14058569239182, + "learning_rate": 1.3748532953502317e-06, + "loss": 0.9417, + "step": 6202 + }, + { + "epoch": 0.6523722507789186, + "grad_norm": 2.2055386916726043, + "learning_rate": 1.3741078651523242e-06, + "loss": 0.9859, + "step": 6203 + }, + { + "epoch": 0.6524774212207659, + "grad_norm": 2.59614689675084, + "learning_rate": 1.3733625605001365e-06, + "loss": 0.9907, + "step": 6204 + }, + { + "epoch": 0.6525825916626132, + "grad_norm": 2.4124820848680453, + "learning_rate": 1.3726173814767763e-06, + "loss": 1.0189, + "step": 6205 + }, + { + "epoch": 0.6526877621044606, + "grad_norm": 2.350472903283974, + "learning_rate": 1.3718723281653357e-06, + "loss": 0.9483, + "step": 6206 + }, + { + "epoch": 0.6527929325463079, + "grad_norm": 2.728569075199262, + "learning_rate": 1.3711274006488935e-06, + "loss": 1.0009, + "step": 6207 + }, + { + "epoch": 0.6528981029881552, + "grad_norm": 2.1797869690723095, + "learning_rate": 1.370382599010515e-06, + "loss": 0.9916, + "step": 6208 + }, + { + "epoch": 0.6530032734300025, + "grad_norm": 2.34014133371477, + "learning_rate": 1.3696379233332518e-06, + "loss": 0.9545, + "step": 6209 + }, + { + "epoch": 0.6531084438718499, + "grad_norm": 2.760835763478809, + "learning_rate": 1.3688933737001425e-06, + "loss": 0.9907, + "step": 6210 + }, + { + "epoch": 0.6532136143136972, + "grad_norm": 2.0489131366244666, + "learning_rate": 1.3681489501942077e-06, + "loss": 0.9922, + "step": 6211 + }, + { + "epoch": 0.6533187847555445, + "grad_norm": 2.74268259260636, + "learning_rate": 1.3674046528984576e-06, + "loss": 0.9946, + "step": 6212 + }, + { + "epoch": 0.6534239551973918, + "grad_norm": 2.3837332766442016, + "learning_rate": 1.3666604818958878e-06, + "loss": 0.9978, + "step": 6213 + }, + { + "epoch": 0.653529125639239, + "grad_norm": 2.286480330832379, + "learning_rate": 1.3659164372694771e-06, + "loss": 1.0212, + "step": 6214 + }, + { + "epoch": 0.6536342960810864, + "grad_norm": 3.110138767109496, + "learning_rate": 1.365172519102195e-06, + "loss": 1.0182, + "step": 6215 + }, + { + "epoch": 0.6537394665229337, + "grad_norm": 2.3763826353787456, + "learning_rate": 1.3644287274769915e-06, + "loss": 0.9838, + "step": 6216 + }, + { + "epoch": 0.653844636964781, + "grad_norm": 2.147543965542234, + "learning_rate": 1.3636850624768065e-06, + "loss": 1.065, + "step": 6217 + }, + { + "epoch": 0.6539498074066283, + "grad_norm": 1.9128638684889196, + "learning_rate": 1.362941524184564e-06, + "loss": 0.99, + "step": 6218 + }, + { + "epoch": 0.6540549778484757, + "grad_norm": 2.701396037832273, + "learning_rate": 1.3621981126831755e-06, + "loss": 1.0051, + "step": 6219 + }, + { + "epoch": 0.654160148290323, + "grad_norm": 2.8040498677343324, + "learning_rate": 1.3614548280555351e-06, + "loss": 1.0138, + "step": 6220 + }, + { + "epoch": 0.6542653187321703, + "grad_norm": 2.0771737084674466, + "learning_rate": 1.3607116703845273e-06, + "loss": 1.0207, + "step": 6221 
+ }, + { + "epoch": 0.6543704891740176, + "grad_norm": 1.906154408918418, + "learning_rate": 1.3599686397530171e-06, + "loss": 1.0037, + "step": 6222 + }, + { + "epoch": 0.654475659615865, + "grad_norm": 2.6242007085780337, + "learning_rate": 1.35922573624386e-06, + "loss": 0.9953, + "step": 6223 + }, + { + "epoch": 0.6545808300577123, + "grad_norm": 2.536900445914889, + "learning_rate": 1.3584829599398958e-06, + "loss": 0.9774, + "step": 6224 + }, + { + "epoch": 0.6546860004995596, + "grad_norm": 2.3679575872686933, + "learning_rate": 1.3577403109239485e-06, + "loss": 0.9633, + "step": 6225 + }, + { + "epoch": 0.6547911709414069, + "grad_norm": 2.7750626080040086, + "learning_rate": 1.35699778927883e-06, + "loss": 0.9824, + "step": 6226 + }, + { + "epoch": 0.6548963413832543, + "grad_norm": 2.2512171030808386, + "learning_rate": 1.3562553950873377e-06, + "loss": 0.9846, + "step": 6227 + }, + { + "epoch": 0.6550015118251016, + "grad_norm": 3.2875448840376817, + "learning_rate": 1.3555131284322532e-06, + "loss": 0.9589, + "step": 6228 + }, + { + "epoch": 0.6551066822669489, + "grad_norm": 2.026779321691147, + "learning_rate": 1.3547709893963462e-06, + "loss": 0.9613, + "step": 6229 + }, + { + "epoch": 0.6552118527087962, + "grad_norm": 2.4724040803859197, + "learning_rate": 1.3540289780623697e-06, + "loss": 0.985, + "step": 6230 + }, + { + "epoch": 0.6553170231506436, + "grad_norm": 2.210229193121631, + "learning_rate": 1.3532870945130642e-06, + "loss": 1.0203, + "step": 6231 + }, + { + "epoch": 0.6554221935924909, + "grad_norm": 2.7088718437080894, + "learning_rate": 1.3525453388311554e-06, + "loss": 0.9994, + "step": 6232 + }, + { + "epoch": 0.6555273640343382, + "grad_norm": 2.6674805083220607, + "learning_rate": 1.3518037110993565e-06, + "loss": 1.0023, + "step": 6233 + }, + { + "epoch": 0.6556325344761854, + "grad_norm": 2.4368790272797347, + "learning_rate": 1.3510622114003619e-06, + "loss": 1.024, + "step": 6234 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 2.355980409234641, + "learning_rate": 1.3503208398168567e-06, + "loss": 1.0132, + "step": 6235 + }, + { + "epoch": 0.6558428753598801, + "grad_norm": 2.617783917085451, + "learning_rate": 1.349579596431509e-06, + "loss": 1.0071, + "step": 6236 + }, + { + "epoch": 0.6559480458017274, + "grad_norm": 1.908042391800999, + "learning_rate": 1.3488384813269726e-06, + "loss": 0.9537, + "step": 6237 + }, + { + "epoch": 0.6560532162435747, + "grad_norm": 2.4517991712108955, + "learning_rate": 1.3480974945858887e-06, + "loss": 0.9943, + "step": 6238 + }, + { + "epoch": 0.656158386685422, + "grad_norm": 2.125250466725107, + "learning_rate": 1.347356636290882e-06, + "loss": 0.9884, + "step": 6239 + }, + { + "epoch": 0.6562635571272694, + "grad_norm": 2.9442428748019256, + "learning_rate": 1.3466159065245637e-06, + "loss": 0.9988, + "step": 6240 + }, + { + "epoch": 0.6563687275691167, + "grad_norm": 2.1219839011084813, + "learning_rate": 1.3458753053695332e-06, + "loss": 0.9689, + "step": 6241 + }, + { + "epoch": 0.656473898010964, + "grad_norm": 2.175678143174604, + "learning_rate": 1.3451348329083702e-06, + "loss": 0.9788, + "step": 6242 + }, + { + "epoch": 0.6565790684528113, + "grad_norm": 1.8922658899883003, + "learning_rate": 1.344394489223646e-06, + "loss": 0.9898, + "step": 6243 + }, + { + "epoch": 0.6566842388946587, + "grad_norm": 2.6214577423300742, + "learning_rate": 1.3436542743979125e-06, + "loss": 0.9652, + "step": 6244 + }, + { + "epoch": 0.656789409336506, + "grad_norm": 2.105636623831596, + "learning_rate": 
1.3429141885137097e-06, + "loss": 0.9862, + "step": 6245 + }, + { + "epoch": 0.6568945797783533, + "grad_norm": 1.9675801964204271, + "learning_rate": 1.342174231653564e-06, + "loss": 0.936, + "step": 6246 + }, + { + "epoch": 0.6569997502202006, + "grad_norm": 2.1697661785388473, + "learning_rate": 1.3414344038999862e-06, + "loss": 0.9785, + "step": 6247 + }, + { + "epoch": 0.657104920662048, + "grad_norm": 2.187917791791845, + "learning_rate": 1.340694705335472e-06, + "loss": 1.0148, + "step": 6248 + }, + { + "epoch": 0.6572100911038953, + "grad_norm": 3.026719311049528, + "learning_rate": 1.3399551360425035e-06, + "loss": 0.9711, + "step": 6249 + }, + { + "epoch": 0.6573152615457426, + "grad_norm": 1.8530024264442755, + "learning_rate": 1.33921569610355e-06, + "loss": 0.9588, + "step": 6250 + }, + { + "epoch": 0.6574204319875899, + "grad_norm": 2.1627719960544014, + "learning_rate": 1.3384763856010626e-06, + "loss": 0.9773, + "step": 6251 + }, + { + "epoch": 0.6575256024294373, + "grad_norm": 2.049389943315943, + "learning_rate": 1.3377372046174826e-06, + "loss": 0.9599, + "step": 6252 + }, + { + "epoch": 0.6576307728712846, + "grad_norm": 2.0628952108415333, + "learning_rate": 1.3369981532352317e-06, + "loss": 1.0007, + "step": 6253 + }, + { + "epoch": 0.6577359433131319, + "grad_norm": 2.491700162689158, + "learning_rate": 1.3362592315367212e-06, + "loss": 0.9925, + "step": 6254 + }, + { + "epoch": 0.6578411137549791, + "grad_norm": 2.2480692531605677, + "learning_rate": 1.3355204396043467e-06, + "loss": 0.9865, + "step": 6255 + }, + { + "epoch": 0.6579462841968264, + "grad_norm": 2.1560294381554073, + "learning_rate": 1.3347817775204903e-06, + "loss": 0.9463, + "step": 6256 + }, + { + "epoch": 0.6580514546386738, + "grad_norm": 2.1594218843164397, + "learning_rate": 1.3340432453675173e-06, + "loss": 0.9732, + "step": 6257 + }, + { + "epoch": 0.6581566250805211, + "grad_norm": 2.3800077094356666, + "learning_rate": 1.3333048432277779e-06, + "loss": 1.0126, + "step": 6258 + }, + { + "epoch": 0.6582617955223684, + "grad_norm": 2.109956422004809, + "learning_rate": 1.3325665711836138e-06, + "loss": 0.9533, + "step": 6259 + }, + { + "epoch": 0.6583669659642157, + "grad_norm": 3.0683493261098493, + "learning_rate": 1.331828429317345e-06, + "loss": 0.9883, + "step": 6260 + }, + { + "epoch": 0.6584721364060631, + "grad_norm": 2.686877306764361, + "learning_rate": 1.3310904177112819e-06, + "loss": 0.9882, + "step": 6261 + }, + { + "epoch": 0.6585773068479104, + "grad_norm": 3.1631141233592492, + "learning_rate": 1.3303525364477166e-06, + "loss": 1.0159, + "step": 6262 + }, + { + "epoch": 0.6586824772897577, + "grad_norm": 3.0131960777287388, + "learning_rate": 1.3296147856089298e-06, + "loss": 0.9658, + "step": 6263 + }, + { + "epoch": 0.658787647731605, + "grad_norm": 2.449844623908453, + "learning_rate": 1.328877165277187e-06, + "loss": 0.9998, + "step": 6264 + }, + { + "epoch": 0.6588928181734524, + "grad_norm": 2.452655678014848, + "learning_rate": 1.328139675534737e-06, + "loss": 0.9983, + "step": 6265 + }, + { + "epoch": 0.6589979886152997, + "grad_norm": 2.2755161044714, + "learning_rate": 1.3274023164638178e-06, + "loss": 1.0014, + "step": 6266 + }, + { + "epoch": 0.659103159057147, + "grad_norm": 1.9204300885679346, + "learning_rate": 1.326665088146648e-06, + "loss": 0.9859, + "step": 6267 + }, + { + "epoch": 0.6592083294989943, + "grad_norm": 3.1297354355899274, + "learning_rate": 1.325927990665436e-06, + "loss": 0.9901, + "step": 6268 + }, + { + "epoch": 0.6593134999408417, + 
"grad_norm": 2.6490272337574448, + "learning_rate": 1.325191024102373e-06, + "loss": 1.006, + "step": 6269 + }, + { + "epoch": 0.659418670382689, + "grad_norm": 3.235289643820615, + "learning_rate": 1.3244541885396384e-06, + "loss": 0.9851, + "step": 6270 + }, + { + "epoch": 0.6595238408245363, + "grad_norm": 2.232938536691584, + "learning_rate": 1.3237174840593927e-06, + "loss": 0.9571, + "step": 6271 + }, + { + "epoch": 0.6596290112663836, + "grad_norm": 2.7467215459350838, + "learning_rate": 1.3229809107437852e-06, + "loss": 0.9742, + "step": 6272 + }, + { + "epoch": 0.659734181708231, + "grad_norm": 3.265655439864346, + "learning_rate": 1.3222444686749508e-06, + "loss": 0.9426, + "step": 6273 + }, + { + "epoch": 0.6598393521500783, + "grad_norm": 2.7142378977870782, + "learning_rate": 1.3215081579350058e-06, + "loss": 0.9626, + "step": 6274 + }, + { + "epoch": 0.6599445225919255, + "grad_norm": 2.424253609885278, + "learning_rate": 1.3207719786060575e-06, + "loss": 0.9681, + "step": 6275 + }, + { + "epoch": 0.6600496930337728, + "grad_norm": 2.650710054297971, + "learning_rate": 1.3200359307701926e-06, + "loss": 1.0027, + "step": 6276 + }, + { + "epoch": 0.6601548634756201, + "grad_norm": 2.106049814182672, + "learning_rate": 1.319300014509488e-06, + "loss": 0.9976, + "step": 6277 + }, + { + "epoch": 0.6602600339174675, + "grad_norm": 3.0478589082735597, + "learning_rate": 1.318564229906005e-06, + "loss": 0.9812, + "step": 6278 + }, + { + "epoch": 0.6603652043593148, + "grad_norm": 2.644175851321169, + "learning_rate": 1.317828577041787e-06, + "loss": 0.9984, + "step": 6279 + }, + { + "epoch": 0.6604703748011621, + "grad_norm": 4.149275333858121, + "learning_rate": 1.317093055998867e-06, + "loss": 0.9905, + "step": 6280 + }, + { + "epoch": 0.6605755452430094, + "grad_norm": 2.1753727776576413, + "learning_rate": 1.3163576668592581e-06, + "loss": 0.9645, + "step": 6281 + }, + { + "epoch": 0.6606807156848568, + "grad_norm": 2.1321309814458433, + "learning_rate": 1.315622409704967e-06, + "loss": 0.9764, + "step": 6282 + }, + { + "epoch": 0.6607858861267041, + "grad_norm": 2.3284143235104104, + "learning_rate": 1.3148872846179761e-06, + "loss": 0.9685, + "step": 6283 + }, + { + "epoch": 0.6608910565685514, + "grad_norm": 2.20797024529712, + "learning_rate": 1.31415229168026e-06, + "loss": 0.9727, + "step": 6284 + }, + { + "epoch": 0.6609962270103987, + "grad_norm": 2.27464739045528, + "learning_rate": 1.3134174309737752e-06, + "loss": 0.9636, + "step": 6285 + }, + { + "epoch": 0.661101397452246, + "grad_norm": 2.564977023089282, + "learning_rate": 1.312682702580464e-06, + "loss": 0.9836, + "step": 6286 + }, + { + "epoch": 0.6612065678940934, + "grad_norm": 2.088470460153833, + "learning_rate": 1.3119481065822559e-06, + "loss": 0.8773, + "step": 6287 + }, + { + "epoch": 0.6613117383359407, + "grad_norm": 2.191541507414501, + "learning_rate": 1.3112136430610623e-06, + "loss": 1.0141, + "step": 6288 + }, + { + "epoch": 0.661416908777788, + "grad_norm": 2.459055525272844, + "learning_rate": 1.310479312098783e-06, + "loss": 0.9724, + "step": 6289 + }, + { + "epoch": 0.6615220792196354, + "grad_norm": 2.5763375289572004, + "learning_rate": 1.3097451137772999e-06, + "loss": 1.0105, + "step": 6290 + }, + { + "epoch": 0.6616272496614827, + "grad_norm": 3.1403073156867753, + "learning_rate": 1.3090110481784831e-06, + "loss": 0.9903, + "step": 6291 + }, + { + "epoch": 0.66173242010333, + "grad_norm": 2.718007886394719, + "learning_rate": 1.3082771153841872e-06, + "loss": 1.0049, + "step": 6292 + }, + 
{ + "epoch": 0.6618375905451773, + "grad_norm": 2.551223778632464, + "learning_rate": 1.3075433154762496e-06, + "loss": 0.9796, + "step": 6293 + }, + { + "epoch": 0.6619427609870246, + "grad_norm": 2.1469053961592754, + "learning_rate": 1.3068096485364967e-06, + "loss": 1.0102, + "step": 6294 + }, + { + "epoch": 0.6620479314288719, + "grad_norm": 3.4328998554698007, + "learning_rate": 1.306076114646735e-06, + "loss": 0.9514, + "step": 6295 + }, + { + "epoch": 0.6621531018707192, + "grad_norm": 2.9569815422885153, + "learning_rate": 1.3053427138887631e-06, + "loss": 0.9742, + "step": 6296 + }, + { + "epoch": 0.6622582723125665, + "grad_norm": 2.8570811225791637, + "learning_rate": 1.3046094463443582e-06, + "loss": 1.0205, + "step": 6297 + }, + { + "epoch": 0.6623634427544138, + "grad_norm": 2.44062115758859, + "learning_rate": 1.3038763120952871e-06, + "loss": 1.004, + "step": 6298 + }, + { + "epoch": 0.6624686131962612, + "grad_norm": 2.1860706563332344, + "learning_rate": 1.303143311223298e-06, + "loss": 0.9852, + "step": 6299 + }, + { + "epoch": 0.6625737836381085, + "grad_norm": 2.305746587101714, + "learning_rate": 1.302410443810127e-06, + "loss": 0.9965, + "step": 6300 + }, + { + "epoch": 0.6626789540799558, + "grad_norm": 1.9272014050310644, + "learning_rate": 1.3016777099374962e-06, + "loss": 1.025, + "step": 6301 + }, + { + "epoch": 0.6627841245218031, + "grad_norm": 2.647801095541516, + "learning_rate": 1.3009451096871084e-06, + "loss": 0.9724, + "step": 6302 + }, + { + "epoch": 0.6628892949636505, + "grad_norm": 2.479510774074513, + "learning_rate": 1.3002126431406565e-06, + "loss": 0.9775, + "step": 6303 + }, + { + "epoch": 0.6629944654054978, + "grad_norm": 3.0571345818594478, + "learning_rate": 1.2994803103798131e-06, + "loss": 0.986, + "step": 6304 + }, + { + "epoch": 0.6630996358473451, + "grad_norm": 2.7508548007010054, + "learning_rate": 1.2987481114862427e-06, + "loss": 0.9855, + "step": 6305 + }, + { + "epoch": 0.6632048062891924, + "grad_norm": 1.9774859354202081, + "learning_rate": 1.2980160465415891e-06, + "loss": 0.9723, + "step": 6306 + }, + { + "epoch": 0.6633099767310398, + "grad_norm": 2.2732675979550114, + "learning_rate": 1.2972841156274843e-06, + "loss": 0.9814, + "step": 6307 + }, + { + "epoch": 0.6634151471728871, + "grad_norm": 2.362143185441358, + "learning_rate": 1.2965523188255438e-06, + "loss": 1.0208, + "step": 6308 + }, + { + "epoch": 0.6635203176147344, + "grad_norm": 2.2701352939481096, + "learning_rate": 1.2958206562173664e-06, + "loss": 0.9388, + "step": 6309 + }, + { + "epoch": 0.6636254880565817, + "grad_norm": 2.3921549016140102, + "learning_rate": 1.2950891278845423e-06, + "loss": 0.9988, + "step": 6310 + }, + { + "epoch": 0.663730658498429, + "grad_norm": 2.573032844055122, + "learning_rate": 1.2943577339086395e-06, + "loss": 0.9928, + "step": 6311 + }, + { + "epoch": 0.6638358289402764, + "grad_norm": 1.896933627943948, + "learning_rate": 1.2936264743712159e-06, + "loss": 0.9814, + "step": 6312 + }, + { + "epoch": 0.6639409993821237, + "grad_norm": 2.511447955098937, + "learning_rate": 1.292895349353811e-06, + "loss": 0.9846, + "step": 6313 + }, + { + "epoch": 0.664046169823971, + "grad_norm": 2.391646179219941, + "learning_rate": 1.2921643589379517e-06, + "loss": 0.9703, + "step": 6314 + }, + { + "epoch": 0.6641513402658183, + "grad_norm": 2.067946485876376, + "learning_rate": 1.2914335032051502e-06, + "loss": 0.971, + "step": 6315 + }, + { + "epoch": 0.6642565107076656, + "grad_norm": 3.1972490953679165, + "learning_rate": 
1.2907027822369006e-06, + "loss": 0.9421, + "step": 6316 + }, + { + "epoch": 0.6643616811495129, + "grad_norm": 1.7933848682194085, + "learning_rate": 1.289972196114686e-06, + "loss": 1.0276, + "step": 6317 + }, + { + "epoch": 0.6644668515913602, + "grad_norm": 2.1050494682029797, + "learning_rate": 1.2892417449199696e-06, + "loss": 0.9682, + "step": 6318 + }, + { + "epoch": 0.6645720220332075, + "grad_norm": 2.5224497949789235, + "learning_rate": 1.2885114287342058e-06, + "loss": 1.0111, + "step": 6319 + }, + { + "epoch": 0.6646771924750549, + "grad_norm": 1.9853269766464046, + "learning_rate": 1.287781247638828e-06, + "loss": 0.9859, + "step": 6320 + }, + { + "epoch": 0.6647823629169022, + "grad_norm": 3.571360120955266, + "learning_rate": 1.2870512017152586e-06, + "loss": 1.0006, + "step": 6321 + }, + { + "epoch": 0.6648875333587495, + "grad_norm": 2.8210890079031303, + "learning_rate": 1.286321291044902e-06, + "loss": 0.9834, + "step": 6322 + }, + { + "epoch": 0.6649927038005968, + "grad_norm": 2.1350217532955544, + "learning_rate": 1.2855915157091498e-06, + "loss": 1.0235, + "step": 6323 + }, + { + "epoch": 0.6650978742424442, + "grad_norm": 2.939945404933564, + "learning_rate": 1.2848618757893782e-06, + "loss": 0.986, + "step": 6324 + }, + { + "epoch": 0.6652030446842915, + "grad_norm": 2.8261629770042602, + "learning_rate": 1.284132371366946e-06, + "loss": 1.0038, + "step": 6325 + }, + { + "epoch": 0.6653082151261388, + "grad_norm": 2.52942309852291, + "learning_rate": 1.2834030025232006e-06, + "loss": 0.9817, + "step": 6326 + }, + { + "epoch": 0.6654133855679861, + "grad_norm": 2.432243507022762, + "learning_rate": 1.2826737693394693e-06, + "loss": 1.0329, + "step": 6327 + }, + { + "epoch": 0.6655185560098335, + "grad_norm": 2.2735077443497937, + "learning_rate": 1.2819446718970713e-06, + "loss": 1.0334, + "step": 6328 + }, + { + "epoch": 0.6656237264516808, + "grad_norm": 2.6901417303834467, + "learning_rate": 1.2812157102773043e-06, + "loss": 1.0319, + "step": 6329 + }, + { + "epoch": 0.6657288968935281, + "grad_norm": 1.9151652735311289, + "learning_rate": 1.2804868845614527e-06, + "loss": 1.0053, + "step": 6330 + }, + { + "epoch": 0.6658340673353754, + "grad_norm": 2.923158306662884, + "learning_rate": 1.279758194830788e-06, + "loss": 0.963, + "step": 6331 + }, + { + "epoch": 0.6659392377772227, + "grad_norm": 3.174548375700343, + "learning_rate": 1.2790296411665618e-06, + "loss": 0.9674, + "step": 6332 + }, + { + "epoch": 0.6660444082190701, + "grad_norm": 3.0051084482405717, + "learning_rate": 1.2783012236500173e-06, + "loss": 0.9453, + "step": 6333 + }, + { + "epoch": 0.6661495786609174, + "grad_norm": 2.594123710120902, + "learning_rate": 1.2775729423623759e-06, + "loss": 0.9764, + "step": 6334 + }, + { + "epoch": 0.6662547491027647, + "grad_norm": 2.5531587705993126, + "learning_rate": 1.2768447973848485e-06, + "loss": 0.9683, + "step": 6335 + }, + { + "epoch": 0.6663599195446119, + "grad_norm": 3.1748751741089167, + "learning_rate": 1.276116788798627e-06, + "loss": 1.02, + "step": 6336 + }, + { + "epoch": 0.6664650899864593, + "grad_norm": 2.7632150853823148, + "learning_rate": 1.2753889166848909e-06, + "loss": 0.9777, + "step": 6337 + }, + { + "epoch": 0.6665702604283066, + "grad_norm": 2.0037733131489106, + "learning_rate": 1.274661181124805e-06, + "loss": 0.9881, + "step": 6338 + }, + { + "epoch": 0.6666754308701539, + "grad_norm": 2.4827855341616885, + "learning_rate": 1.2739335821995153e-06, + "loss": 0.94, + "step": 6339 + }, + { + "epoch": 0.6667806013120012, + 
"grad_norm": 2.8119810771826517, + "learning_rate": 1.2732061199901563e-06, + "loss": 0.994, + "step": 6340 + }, + { + "epoch": 0.6668857717538486, + "grad_norm": 2.752059269835914, + "learning_rate": 1.2724787945778427e-06, + "loss": 1.0225, + "step": 6341 + }, + { + "epoch": 0.6669909421956959, + "grad_norm": 2.2241542929333216, + "learning_rate": 1.271751606043682e-06, + "loss": 0.9708, + "step": 6342 + }, + { + "epoch": 0.6670961126375432, + "grad_norm": 2.6235709049395997, + "learning_rate": 1.2710245544687568e-06, + "loss": 1.0189, + "step": 6343 + }, + { + "epoch": 0.6672012830793905, + "grad_norm": 3.156266897222367, + "learning_rate": 1.2702976399341422e-06, + "loss": 0.9985, + "step": 6344 + }, + { + "epoch": 0.6673064535212379, + "grad_norm": 2.230980642118251, + "learning_rate": 1.2695708625208933e-06, + "loss": 0.9875, + "step": 6345 + }, + { + "epoch": 0.6674116239630852, + "grad_norm": 2.5767378584379155, + "learning_rate": 1.2688442223100494e-06, + "loss": 0.9804, + "step": 6346 + }, + { + "epoch": 0.6675167944049325, + "grad_norm": 2.029400978615484, + "learning_rate": 1.268117719382641e-06, + "loss": 0.9311, + "step": 6347 + }, + { + "epoch": 0.6676219648467798, + "grad_norm": 2.6045821189170235, + "learning_rate": 1.2673913538196753e-06, + "loss": 1.0034, + "step": 6348 + }, + { + "epoch": 0.6677271352886271, + "grad_norm": 2.0673514221546965, + "learning_rate": 1.26666512570215e-06, + "loss": 0.9702, + "step": 6349 + }, + { + "epoch": 0.6678323057304745, + "grad_norm": 2.227452564501476, + "learning_rate": 1.265939035111043e-06, + "loss": 0.9654, + "step": 6350 + }, + { + "epoch": 0.6679374761723218, + "grad_norm": 2.389880074299462, + "learning_rate": 1.26521308212732e-06, + "loss": 1.024, + "step": 6351 + }, + { + "epoch": 0.6680426466141691, + "grad_norm": 1.9298756073215733, + "learning_rate": 1.2644872668319317e-06, + "loss": 1.0022, + "step": 6352 + }, + { + "epoch": 0.6681478170560164, + "grad_norm": 2.8622091569435577, + "learning_rate": 1.2637615893058098e-06, + "loss": 0.9935, + "step": 6353 + }, + { + "epoch": 0.6682529874978638, + "grad_norm": 3.031543303955931, + "learning_rate": 1.263036049629875e-06, + "loss": 0.9945, + "step": 6354 + }, + { + "epoch": 0.6683581579397111, + "grad_norm": 3.1179126127334835, + "learning_rate": 1.262310647885028e-06, + "loss": 1.0373, + "step": 6355 + }, + { + "epoch": 0.6684633283815583, + "grad_norm": 2.301408947362086, + "learning_rate": 1.2615853841521602e-06, + "loss": 1.0232, + "step": 6356 + }, + { + "epoch": 0.6685684988234056, + "grad_norm": 2.9564009207787585, + "learning_rate": 1.2608602585121419e-06, + "loss": 1.0009, + "step": 6357 + }, + { + "epoch": 0.668673669265253, + "grad_norm": 2.338028385808441, + "learning_rate": 1.2601352710458314e-06, + "loss": 0.9979, + "step": 6358 + }, + { + "epoch": 0.6687788397071003, + "grad_norm": 1.9749844430021521, + "learning_rate": 1.2594104218340686e-06, + "loss": 0.9774, + "step": 6359 + }, + { + "epoch": 0.6688840101489476, + "grad_norm": 2.0803838048430623, + "learning_rate": 1.2586857109576814e-06, + "loss": 1.0137, + "step": 6360 + }, + { + "epoch": 0.6689891805907949, + "grad_norm": 2.561718404866887, + "learning_rate": 1.257961138497481e-06, + "loss": 1.0203, + "step": 6361 + }, + { + "epoch": 0.6690943510326423, + "grad_norm": 2.110231753776184, + "learning_rate": 1.2572367045342615e-06, + "loss": 0.9843, + "step": 6362 + }, + { + "epoch": 0.6691995214744896, + "grad_norm": 3.728842150551063, + "learning_rate": 1.256512409148804e-06, + "loss": 1.0121, + "step": 
6363 + }, + { + "epoch": 0.6693046919163369, + "grad_norm": 2.6450699087235745, + "learning_rate": 1.2557882524218722e-06, + "loss": 1.0254, + "step": 6364 + }, + { + "epoch": 0.6694098623581842, + "grad_norm": 3.3862188687250474, + "learning_rate": 1.2550642344342155e-06, + "loss": 0.9904, + "step": 6365 + }, + { + "epoch": 0.6695150328000316, + "grad_norm": 2.6560070020655164, + "learning_rate": 1.2543403552665684e-06, + "loss": 1.001, + "step": 6366 + }, + { + "epoch": 0.6696202032418789, + "grad_norm": 2.066321011865616, + "learning_rate": 1.2536166149996476e-06, + "loss": 1.0294, + "step": 6367 + }, + { + "epoch": 0.6697253736837262, + "grad_norm": 1.841460947605558, + "learning_rate": 1.252893013714157e-06, + "loss": 0.955, + "step": 6368 + }, + { + "epoch": 0.6698305441255735, + "grad_norm": 2.6195237655803427, + "learning_rate": 1.2521695514907817e-06, + "loss": 0.981, + "step": 6369 + }, + { + "epoch": 0.6699357145674208, + "grad_norm": 2.5282441482759155, + "learning_rate": 1.2514462284101969e-06, + "loss": 1.0073, + "step": 6370 + }, + { + "epoch": 0.6700408850092682, + "grad_norm": 2.420722358809797, + "learning_rate": 1.2507230445530554e-06, + "loss": 1.0199, + "step": 6371 + }, + { + "epoch": 0.6701460554511155, + "grad_norm": 3.210358669515261, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.935, + "step": 6372 + }, + { + "epoch": 0.6702512258929628, + "grad_norm": 2.6066278908101244, + "learning_rate": 1.2492770948316548e-06, + "loss": 0.9807, + "step": 6373 + }, + { + "epoch": 0.6703563963348101, + "grad_norm": 2.166982928665858, + "learning_rate": 1.2485543291286292e-06, + "loss": 1.0028, + "step": 6374 + }, + { + "epoch": 0.6704615667766575, + "grad_norm": 2.3061126697659953, + "learning_rate": 1.2478317029715182e-06, + "loss": 0.972, + "step": 6375 + }, + { + "epoch": 0.6705667372185048, + "grad_norm": 2.574290187989157, + "learning_rate": 1.2471092164408985e-06, + "loss": 0.9746, + "step": 6376 + }, + { + "epoch": 0.670671907660352, + "grad_norm": 2.5423268036429754, + "learning_rate": 1.2463868696173351e-06, + "loss": 1.0209, + "step": 6377 + }, + { + "epoch": 0.6707770781021993, + "grad_norm": 2.2164659344968767, + "learning_rate": 1.245664662581372e-06, + "loss": 0.9704, + "step": 6378 + }, + { + "epoch": 0.6708822485440467, + "grad_norm": 2.582570427543782, + "learning_rate": 1.2449425954135452e-06, + "loss": 0.9712, + "step": 6379 + }, + { + "epoch": 0.670987418985894, + "grad_norm": 2.124683883692389, + "learning_rate": 1.2442206681943685e-06, + "loss": 0.9725, + "step": 6380 + }, + { + "epoch": 0.6710925894277413, + "grad_norm": 2.3639142162437095, + "learning_rate": 1.2434988810043416e-06, + "loss": 0.9681, + "step": 6381 + }, + { + "epoch": 0.6711977598695886, + "grad_norm": 2.1442126123743024, + "learning_rate": 1.2427772339239502e-06, + "loss": 1.0037, + "step": 6382 + }, + { + "epoch": 0.671302930311436, + "grad_norm": 2.625018523809141, + "learning_rate": 1.2420557270336638e-06, + "loss": 1.0121, + "step": 6383 + }, + { + "epoch": 0.6714081007532833, + "grad_norm": 2.4345832011147217, + "learning_rate": 1.241334360413937e-06, + "loss": 0.972, + "step": 6384 + }, + { + "epoch": 0.6715132711951306, + "grad_norm": 2.7727329895954993, + "learning_rate": 1.2406131341452054e-06, + "loss": 1.0481, + "step": 6385 + }, + { + "epoch": 0.6716184416369779, + "grad_norm": 2.7456790530416173, + "learning_rate": 1.2398920483078939e-06, + "loss": 0.9972, + "step": 6386 + }, + { + "epoch": 0.6717236120788252, + "grad_norm": 2.2868278468077765, + "learning_rate": 
1.2391711029824065e-06, + "loss": 0.981, + "step": 6387 + }, + { + "epoch": 0.6718287825206726, + "grad_norm": 2.193127747109827, + "learning_rate": 1.2384502982491359e-06, + "loss": 0.9802, + "step": 6388 + }, + { + "epoch": 0.6719339529625199, + "grad_norm": 2.525039802068027, + "learning_rate": 1.2377296341884578e-06, + "loss": 1.0594, + "step": 6389 + }, + { + "epoch": 0.6720391234043672, + "grad_norm": 2.2688658169002793, + "learning_rate": 1.2370091108807307e-06, + "loss": 0.9756, + "step": 6390 + }, + { + "epoch": 0.6721442938462145, + "grad_norm": 2.8518070010439383, + "learning_rate": 1.2362887284062994e-06, + "loss": 1.0145, + "step": 6391 + }, + { + "epoch": 0.6722494642880619, + "grad_norm": 3.120055638002566, + "learning_rate": 1.23556848684549e-06, + "loss": 1.0178, + "step": 6392 + }, + { + "epoch": 0.6723546347299092, + "grad_norm": 2.677188223326482, + "learning_rate": 1.2348483862786188e-06, + "loss": 0.9663, + "step": 6393 + }, + { + "epoch": 0.6724598051717565, + "grad_norm": 2.764351849759641, + "learning_rate": 1.2341284267859796e-06, + "loss": 0.9915, + "step": 6394 + }, + { + "epoch": 0.6725649756136038, + "grad_norm": 1.9868623247964174, + "learning_rate": 1.2334086084478553e-06, + "loss": 0.9902, + "step": 6395 + }, + { + "epoch": 0.6726701460554512, + "grad_norm": 1.88432409309354, + "learning_rate": 1.2326889313445095e-06, + "loss": 1.0214, + "step": 6396 + }, + { + "epoch": 0.6727753164972984, + "grad_norm": 2.505160726077182, + "learning_rate": 1.2319693955561926e-06, + "loss": 0.996, + "step": 6397 + }, + { + "epoch": 0.6728804869391457, + "grad_norm": 2.4305679692895326, + "learning_rate": 1.2312500011631396e-06, + "loss": 0.9957, + "step": 6398 + }, + { + "epoch": 0.672985657380993, + "grad_norm": 2.26838492918931, + "learning_rate": 1.2305307482455661e-06, + "loss": 0.9879, + "step": 6399 + }, + { + "epoch": 0.6730908278228404, + "grad_norm": 2.7005160503685772, + "learning_rate": 1.2298116368836772e-06, + "loss": 1.0049, + "step": 6400 + }, + { + "epoch": 0.6731959982646877, + "grad_norm": 2.5263479523166814, + "learning_rate": 1.2290926671576567e-06, + "loss": 1.0359, + "step": 6401 + }, + { + "epoch": 0.673301168706535, + "grad_norm": 2.1790008720432605, + "learning_rate": 1.2283738391476766e-06, + "loss": 1.0109, + "step": 6402 + }, + { + "epoch": 0.6734063391483823, + "grad_norm": 2.6192166675598467, + "learning_rate": 1.2276551529338929e-06, + "loss": 0.9518, + "step": 6403 + }, + { + "epoch": 0.6735115095902297, + "grad_norm": 2.905972519031942, + "learning_rate": 1.2269366085964424e-06, + "loss": 1.0174, + "step": 6404 + }, + { + "epoch": 0.673616680032077, + "grad_norm": 2.9312474143114664, + "learning_rate": 1.2262182062154498e-06, + "loss": 1.0203, + "step": 6405 + }, + { + "epoch": 0.6737218504739243, + "grad_norm": 2.394870761083195, + "learning_rate": 1.225499945871022e-06, + "loss": 0.9592, + "step": 6406 + }, + { + "epoch": 0.6738270209157716, + "grad_norm": 2.025708578404046, + "learning_rate": 1.2247818276432522e-06, + "loss": 0.9798, + "step": 6407 + }, + { + "epoch": 0.673932191357619, + "grad_norm": 2.183993520411737, + "learning_rate": 1.2240638516122135e-06, + "loss": 0.9284, + "step": 6408 + }, + { + "epoch": 0.6740373617994663, + "grad_norm": 1.9540801713327498, + "learning_rate": 1.2233460178579683e-06, + "loss": 0.9748, + "step": 6409 + }, + { + "epoch": 0.6741425322413136, + "grad_norm": 2.397767392403271, + "learning_rate": 1.2226283264605587e-06, + "loss": 0.9958, + "step": 6410 + }, + { + "epoch": 0.6742477026831609, + 
"grad_norm": 2.63791023453177, + "learning_rate": 1.2219107775000136e-06, + "loss": 0.9764, + "step": 6411 + }, + { + "epoch": 0.6743528731250082, + "grad_norm": 2.6612509498444563, + "learning_rate": 1.2211933710563462e-06, + "loss": 0.9665, + "step": 6412 + }, + { + "epoch": 0.6744580435668556, + "grad_norm": 2.1922298975484, + "learning_rate": 1.2204761072095511e-06, + "loss": 1.0004, + "step": 6413 + }, + { + "epoch": 0.6745632140087029, + "grad_norm": 2.3286284733059666, + "learning_rate": 1.219758986039611e-06, + "loss": 0.9977, + "step": 6414 + }, + { + "epoch": 0.6746683844505502, + "grad_norm": 2.8936932199687746, + "learning_rate": 1.2190420076264877e-06, + "loss": 1.0052, + "step": 6415 + }, + { + "epoch": 0.6747735548923975, + "grad_norm": 2.9466430859373043, + "learning_rate": 1.2183251720501317e-06, + "loss": 1.0172, + "step": 6416 + }, + { + "epoch": 0.6748787253342448, + "grad_norm": 2.6112081025461276, + "learning_rate": 1.2176084793904764e-06, + "loss": 1.026, + "step": 6417 + }, + { + "epoch": 0.6749838957760921, + "grad_norm": 2.623855271718345, + "learning_rate": 1.2168919297274368e-06, + "loss": 0.998, + "step": 6418 + }, + { + "epoch": 0.6750890662179394, + "grad_norm": 2.4575121259688033, + "learning_rate": 1.2161755231409142e-06, + "loss": 0.9709, + "step": 6419 + }, + { + "epoch": 0.6751942366597867, + "grad_norm": 2.5971196149724416, + "learning_rate": 1.2154592597107942e-06, + "loss": 0.9818, + "step": 6420 + }, + { + "epoch": 0.675299407101634, + "grad_norm": 1.8871909749453315, + "learning_rate": 1.214743139516946e-06, + "loss": 1.0128, + "step": 6421 + }, + { + "epoch": 0.6754045775434814, + "grad_norm": 2.2047792064214824, + "learning_rate": 1.2140271626392215e-06, + "loss": 0.9645, + "step": 6422 + }, + { + "epoch": 0.6755097479853287, + "grad_norm": 2.1961869799992435, + "learning_rate": 1.2133113291574586e-06, + "loss": 0.9703, + "step": 6423 + }, + { + "epoch": 0.675614918427176, + "grad_norm": 2.958708144601371, + "learning_rate": 1.212595639151477e-06, + "loss": 0.9813, + "step": 6424 + }, + { + "epoch": 0.6757200888690233, + "grad_norm": 2.1479518310219854, + "learning_rate": 1.211880092701083e-06, + "loss": 1.0119, + "step": 6425 + }, + { + "epoch": 0.6758252593108707, + "grad_norm": 2.390810440217156, + "learning_rate": 1.2111646898860654e-06, + "loss": 0.9381, + "step": 6426 + }, + { + "epoch": 0.675930429752718, + "grad_norm": 2.2064670963516835, + "learning_rate": 1.2104494307861963e-06, + "loss": 0.9644, + "step": 6427 + }, + { + "epoch": 0.6760356001945653, + "grad_norm": 2.429723636327678, + "learning_rate": 1.2097343154812332e-06, + "loss": 0.9703, + "step": 6428 + }, + { + "epoch": 0.6761407706364126, + "grad_norm": 3.083948528744446, + "learning_rate": 1.2090193440509173e-06, + "loss": 1.0232, + "step": 6429 + }, + { + "epoch": 0.67624594107826, + "grad_norm": 2.9234421217471187, + "learning_rate": 1.208304516574974e-06, + "loss": 0.9884, + "step": 6430 + }, + { + "epoch": 0.6763511115201073, + "grad_norm": 1.6972930816606224, + "learning_rate": 1.2075898331331112e-06, + "loss": 1.0111, + "step": 6431 + }, + { + "epoch": 0.6764562819619546, + "grad_norm": 3.532197380851382, + "learning_rate": 1.206875293805021e-06, + "loss": 1.0103, + "step": 6432 + }, + { + "epoch": 0.6765614524038019, + "grad_norm": 2.534725775044316, + "learning_rate": 1.206160898670381e-06, + "loss": 0.9684, + "step": 6433 + }, + { + "epoch": 0.6766666228456493, + "grad_norm": 2.6453642866329408, + "learning_rate": 1.2054466478088515e-06, + "loss": 1.0008, + "step": 
6434 + }, + { + "epoch": 0.6767717932874966, + "grad_norm": 2.312434495761649, + "learning_rate": 1.2047325413000782e-06, + "loss": 0.9743, + "step": 6435 + }, + { + "epoch": 0.6768769637293439, + "grad_norm": 1.9592460033323447, + "learning_rate": 1.2040185792236874e-06, + "loss": 1.004, + "step": 6436 + }, + { + "epoch": 0.6769821341711912, + "grad_norm": 1.8537125451220031, + "learning_rate": 1.2033047616592938e-06, + "loss": 0.9589, + "step": 6437 + }, + { + "epoch": 0.6770873046130385, + "grad_norm": 2.3821524850998137, + "learning_rate": 1.2025910886864914e-06, + "loss": 1.0142, + "step": 6438 + }, + { + "epoch": 0.6771924750548858, + "grad_norm": 2.6671405915081805, + "learning_rate": 1.2018775603848613e-06, + "loss": 0.9977, + "step": 6439 + }, + { + "epoch": 0.6772976454967331, + "grad_norm": 2.9782145090252388, + "learning_rate": 1.201164176833968e-06, + "loss": 0.9522, + "step": 6440 + }, + { + "epoch": 0.6774028159385804, + "grad_norm": 2.3275648903630803, + "learning_rate": 1.2004509381133577e-06, + "loss": 0.9778, + "step": 6441 + }, + { + "epoch": 0.6775079863804278, + "grad_norm": 2.2558601171743953, + "learning_rate": 1.1997378443025633e-06, + "loss": 1.0001, + "step": 6442 + }, + { + "epoch": 0.6776131568222751, + "grad_norm": 2.7943480960191405, + "learning_rate": 1.1990248954811002e-06, + "loss": 1.0329, + "step": 6443 + }, + { + "epoch": 0.6777183272641224, + "grad_norm": 1.9980446949092163, + "learning_rate": 1.1983120917284682e-06, + "loss": 1.0, + "step": 6444 + }, + { + "epoch": 0.6778234977059697, + "grad_norm": 2.5703128757840967, + "learning_rate": 1.1975994331241491e-06, + "loss": 1.0233, + "step": 6445 + }, + { + "epoch": 0.677928668147817, + "grad_norm": 2.454703747591987, + "learning_rate": 1.1968869197476116e-06, + "loss": 0.9941, + "step": 6446 + }, + { + "epoch": 0.6780338385896644, + "grad_norm": 2.1653067794036995, + "learning_rate": 1.1961745516783044e-06, + "loss": 0.9694, + "step": 6447 + }, + { + "epoch": 0.6781390090315117, + "grad_norm": 2.2038359344425396, + "learning_rate": 1.1954623289956633e-06, + "loss": 0.977, + "step": 6448 + }, + { + "epoch": 0.678244179473359, + "grad_norm": 2.693813467095616, + "learning_rate": 1.1947502517791073e-06, + "loss": 0.984, + "step": 6449 + }, + { + "epoch": 0.6783493499152063, + "grad_norm": 2.1406308204159874, + "learning_rate": 1.194038320108037e-06, + "loss": 0.9603, + "step": 6450 + }, + { + "epoch": 0.6784545203570537, + "grad_norm": 2.1294455644301076, + "learning_rate": 1.1933265340618389e-06, + "loss": 0.9735, + "step": 6451 + }, + { + "epoch": 0.678559690798901, + "grad_norm": 2.103773447891833, + "learning_rate": 1.192614893719884e-06, + "loss": 0.9834, + "step": 6452 + }, + { + "epoch": 0.6786648612407483, + "grad_norm": 2.0922753017568336, + "learning_rate": 1.1919033991615234e-06, + "loss": 1.0113, + "step": 6453 + }, + { + "epoch": 0.6787700316825956, + "grad_norm": 1.9815603762507759, + "learning_rate": 1.1911920504660963e-06, + "loss": 0.9569, + "step": 6454 + }, + { + "epoch": 0.678875202124443, + "grad_norm": 1.797769460062252, + "learning_rate": 1.190480847712922e-06, + "loss": 0.9873, + "step": 6455 + }, + { + "epoch": 0.6789803725662903, + "grad_norm": 2.003693077137846, + "learning_rate": 1.1897697909813058e-06, + "loss": 1.0133, + "step": 6456 + }, + { + "epoch": 0.6790855430081376, + "grad_norm": 2.1003840733140686, + "learning_rate": 1.1890588803505362e-06, + "loss": 0.9632, + "step": 6457 + }, + { + "epoch": 0.6791907134499848, + "grad_norm": 2.080990514432014, + "learning_rate": 
1.1883481158998862e-06, + "loss": 0.9783, + "step": 6458 + }, + { + "epoch": 0.6792958838918322, + "grad_norm": 3.374047031904028, + "learning_rate": 1.1876374977086094e-06, + "loss": 1.0202, + "step": 6459 + }, + { + "epoch": 0.6794010543336795, + "grad_norm": 2.4938920235535047, + "learning_rate": 1.1869270258559477e-06, + "loss": 1.008, + "step": 6460 + }, + { + "epoch": 0.6795062247755268, + "grad_norm": 2.186419415119328, + "learning_rate": 1.1862167004211217e-06, + "loss": 0.9835, + "step": 6461 + }, + { + "epoch": 0.6796113952173741, + "grad_norm": 2.9582048165354378, + "learning_rate": 1.1855065214833394e-06, + "loss": 0.9733, + "step": 6462 + }, + { + "epoch": 0.6797165656592214, + "grad_norm": 3.8575153613107016, + "learning_rate": 1.1847964891217923e-06, + "loss": 1.0115, + "step": 6463 + }, + { + "epoch": 0.6798217361010688, + "grad_norm": 2.242861038730955, + "learning_rate": 1.1840866034156526e-06, + "loss": 0.9677, + "step": 6464 + }, + { + "epoch": 0.6799269065429161, + "grad_norm": 2.0912260201336648, + "learning_rate": 1.1833768644440787e-06, + "loss": 0.9556, + "step": 6465 + }, + { + "epoch": 0.6800320769847634, + "grad_norm": 3.3182406948209993, + "learning_rate": 1.1826672722862137e-06, + "loss": 0.9788, + "step": 6466 + }, + { + "epoch": 0.6801372474266107, + "grad_norm": 2.887753294344934, + "learning_rate": 1.1819578270211801e-06, + "loss": 0.9623, + "step": 6467 + }, + { + "epoch": 0.6802424178684581, + "grad_norm": 3.1634544519317966, + "learning_rate": 1.1812485287280886e-06, + "loss": 1.0159, + "step": 6468 + }, + { + "epoch": 0.6803475883103054, + "grad_norm": 2.210225867067167, + "learning_rate": 1.1805393774860296e-06, + "loss": 0.9579, + "step": 6469 + }, + { + "epoch": 0.6804527587521527, + "grad_norm": 2.3215390072281874, + "learning_rate": 1.1798303733740801e-06, + "loss": 0.9572, + "step": 6470 + }, + { + "epoch": 0.680557929194, + "grad_norm": 2.531063961551066, + "learning_rate": 1.1791215164712993e-06, + "loss": 1.0024, + "step": 6471 + }, + { + "epoch": 0.6806630996358474, + "grad_norm": 2.6959358550758883, + "learning_rate": 1.1784128068567316e-06, + "loss": 0.996, + "step": 6472 + }, + { + "epoch": 0.6807682700776947, + "grad_norm": 2.0196092769679828, + "learning_rate": 1.1777042446094011e-06, + "loss": 0.9526, + "step": 6473 + }, + { + "epoch": 0.680873440519542, + "grad_norm": 2.020236502773026, + "learning_rate": 1.1769958298083192e-06, + "loss": 0.9758, + "step": 6474 + }, + { + "epoch": 0.6809786109613893, + "grad_norm": 2.1941180121097825, + "learning_rate": 1.176287562532481e-06, + "loss": 0.9882, + "step": 6475 + }, + { + "epoch": 0.6810837814032367, + "grad_norm": 2.44848887745427, + "learning_rate": 1.1755794428608614e-06, + "loss": 1.0088, + "step": 6476 + }, + { + "epoch": 0.681188951845084, + "grad_norm": 1.6532880562870076, + "learning_rate": 1.1748714708724232e-06, + "loss": 0.9774, + "step": 6477 + }, + { + "epoch": 0.6812941222869312, + "grad_norm": 2.6504903012905494, + "learning_rate": 1.1741636466461093e-06, + "loss": 0.9639, + "step": 6478 + }, + { + "epoch": 0.6813992927287785, + "grad_norm": 2.1833707527018653, + "learning_rate": 1.173455970260848e-06, + "loss": 0.9325, + "step": 6479 + }, + { + "epoch": 0.6815044631706259, + "grad_norm": 1.8991991840090796, + "learning_rate": 1.1727484417955512e-06, + "loss": 0.9709, + "step": 6480 + }, + { + "epoch": 0.6816096336124732, + "grad_norm": 2.7755476284341594, + "learning_rate": 1.1720410613291144e-06, + "loss": 1.0033, + "step": 6481 + }, + { + "epoch": 0.6817148040543205, + 
"grad_norm": 2.4340897257786587, + "learning_rate": 1.1713338289404152e-06, + "loss": 0.9761, + "step": 6482 + }, + { + "epoch": 0.6818199744961678, + "grad_norm": 2.6322074888977873, + "learning_rate": 1.1706267447083145e-06, + "loss": 0.9855, + "step": 6483 + }, + { + "epoch": 0.6819251449380151, + "grad_norm": 2.5435211848927963, + "learning_rate": 1.169919808711659e-06, + "loss": 0.9968, + "step": 6484 + }, + { + "epoch": 0.6820303153798625, + "grad_norm": 2.3401833639741376, + "learning_rate": 1.1692130210292767e-06, + "loss": 0.9764, + "step": 6485 + }, + { + "epoch": 0.6821354858217098, + "grad_norm": 2.757772162036146, + "learning_rate": 1.1685063817399818e-06, + "loss": 0.9863, + "step": 6486 + }, + { + "epoch": 0.6822406562635571, + "grad_norm": 2.2309395979337774, + "learning_rate": 1.167799890922568e-06, + "loss": 0.9525, + "step": 6487 + }, + { + "epoch": 0.6823458267054044, + "grad_norm": 2.2761181614261123, + "learning_rate": 1.167093548655815e-06, + "loss": 0.9839, + "step": 6488 + }, + { + "epoch": 0.6824509971472518, + "grad_norm": 2.4073807837683474, + "learning_rate": 1.1663873550184864e-06, + "loss": 0.9943, + "step": 6489 + }, + { + "epoch": 0.6825561675890991, + "grad_norm": 2.344425968364666, + "learning_rate": 1.1656813100893271e-06, + "loss": 1.0342, + "step": 6490 + }, + { + "epoch": 0.6826613380309464, + "grad_norm": 3.3083451221904077, + "learning_rate": 1.1649754139470679e-06, + "loss": 0.9833, + "step": 6491 + }, + { + "epoch": 0.6827665084727937, + "grad_norm": 2.101235305234725, + "learning_rate": 1.16426966667042e-06, + "loss": 1.0127, + "step": 6492 + }, + { + "epoch": 0.6828716789146411, + "grad_norm": 2.4853559361171267, + "learning_rate": 1.1635640683380803e-06, + "loss": 0.9932, + "step": 6493 + }, + { + "epoch": 0.6829768493564884, + "grad_norm": 2.823621227962317, + "learning_rate": 1.1628586190287289e-06, + "loss": 0.9721, + "step": 6494 + }, + { + "epoch": 0.6830820197983357, + "grad_norm": 2.1446739627758924, + "learning_rate": 1.1621533188210296e-06, + "loss": 0.9461, + "step": 6495 + }, + { + "epoch": 0.683187190240183, + "grad_norm": 2.5010687220751877, + "learning_rate": 1.1614481677936274e-06, + "loss": 0.9768, + "step": 6496 + }, + { + "epoch": 0.6832923606820304, + "grad_norm": 2.5827579517567134, + "learning_rate": 1.1607431660251523e-06, + "loss": 0.9719, + "step": 6497 + }, + { + "epoch": 0.6833975311238777, + "grad_norm": 2.163533594993246, + "learning_rate": 1.160038313594219e-06, + "loss": 0.9844, + "step": 6498 + }, + { + "epoch": 0.6835027015657249, + "grad_norm": 1.6505556607260339, + "learning_rate": 1.1593336105794222e-06, + "loss": 0.9816, + "step": 6499 + }, + { + "epoch": 0.6836078720075722, + "grad_norm": 2.5798234985766753, + "learning_rate": 1.158629057059343e-06, + "loss": 0.9543, + "step": 6500 + }, + { + "epoch": 0.6837130424494196, + "grad_norm": 2.847502464596311, + "learning_rate": 1.1579246531125435e-06, + "loss": 0.9713, + "step": 6501 + }, + { + "epoch": 0.6838182128912669, + "grad_norm": 2.664122177395312, + "learning_rate": 1.1572203988175706e-06, + "loss": 1.0001, + "step": 6502 + }, + { + "epoch": 0.6839233833331142, + "grad_norm": 2.3631903419020603, + "learning_rate": 1.1565162942529553e-06, + "loss": 0.9806, + "step": 6503 + }, + { + "epoch": 0.6840285537749615, + "grad_norm": 2.140387924285255, + "learning_rate": 1.155812339497209e-06, + "loss": 1.0032, + "step": 6504 + }, + { + "epoch": 0.6841337242168088, + "grad_norm": 2.481719404717142, + "learning_rate": 1.1551085346288296e-06, + "loss": 1.0173, + 
"step": 6505 + }, + { + "epoch": 0.6842388946586562, + "grad_norm": 2.508106903316508, + "learning_rate": 1.154404879726294e-06, + "loss": 0.9962, + "step": 6506 + }, + { + "epoch": 0.6843440651005035, + "grad_norm": 3.216269422744731, + "learning_rate": 1.1537013748680694e-06, + "loss": 1.0013, + "step": 6507 + }, + { + "epoch": 0.6844492355423508, + "grad_norm": 2.0004248330968206, + "learning_rate": 1.152998020132599e-06, + "loss": 1.0044, + "step": 6508 + }, + { + "epoch": 0.6845544059841981, + "grad_norm": 1.933485559067847, + "learning_rate": 1.1522948155983143e-06, + "loss": 0.9502, + "step": 6509 + }, + { + "epoch": 0.6846595764260455, + "grad_norm": 2.6662844264965972, + "learning_rate": 1.1515917613436258e-06, + "loss": 0.9901, + "step": 6510 + }, + { + "epoch": 0.6847647468678928, + "grad_norm": 1.9043626170295296, + "learning_rate": 1.1508888574469308e-06, + "loss": 0.9483, + "step": 6511 + }, + { + "epoch": 0.6848699173097401, + "grad_norm": 2.889546124540106, + "learning_rate": 1.1501861039866095e-06, + "loss": 0.9652, + "step": 6512 + }, + { + "epoch": 0.6849750877515874, + "grad_norm": 2.297426343613282, + "learning_rate": 1.1494835010410222e-06, + "loss": 0.9644, + "step": 6513 + }, + { + "epoch": 0.6850802581934348, + "grad_norm": 2.5762672587290227, + "learning_rate": 1.1487810486885164e-06, + "loss": 0.919, + "step": 6514 + }, + { + "epoch": 0.6851854286352821, + "grad_norm": 3.1574284790806653, + "learning_rate": 1.1480787470074197e-06, + "loss": 0.9916, + "step": 6515 + }, + { + "epoch": 0.6852905990771294, + "grad_norm": 2.7014959221760697, + "learning_rate": 1.147376596076045e-06, + "loss": 0.9377, + "step": 6516 + }, + { + "epoch": 0.6853957695189767, + "grad_norm": 2.5585301645643783, + "learning_rate": 1.146674595972688e-06, + "loss": 0.9876, + "step": 6517 + }, + { + "epoch": 0.6855009399608241, + "grad_norm": 2.2873581952413944, + "learning_rate": 1.1459727467756257e-06, + "loss": 1.0167, + "step": 6518 + }, + { + "epoch": 0.6856061104026713, + "grad_norm": 2.0427362279367567, + "learning_rate": 1.1452710485631216e-06, + "loss": 1.0053, + "step": 6519 + }, + { + "epoch": 0.6857112808445186, + "grad_norm": 1.6816352696983445, + "learning_rate": 1.1445695014134175e-06, + "loss": 0.9847, + "step": 6520 + }, + { + "epoch": 0.6858164512863659, + "grad_norm": 2.8015068825343077, + "learning_rate": 1.1438681054047454e-06, + "loss": 0.9762, + "step": 6521 + }, + { + "epoch": 0.6859216217282132, + "grad_norm": 2.8607684877029493, + "learning_rate": 1.143166860615313e-06, + "loss": 0.9794, + "step": 6522 + }, + { + "epoch": 0.6860267921700606, + "grad_norm": 1.8264702204384304, + "learning_rate": 1.1424657671233175e-06, + "loss": 1.0061, + "step": 6523 + }, + { + "epoch": 0.6861319626119079, + "grad_norm": 2.5820509572402357, + "learning_rate": 1.1417648250069332e-06, + "loss": 1.0065, + "step": 6524 + }, + { + "epoch": 0.6862371330537552, + "grad_norm": 2.131759160671369, + "learning_rate": 1.1410640343443222e-06, + "loss": 0.9638, + "step": 6525 + }, + { + "epoch": 0.6863423034956025, + "grad_norm": 2.427313320161486, + "learning_rate": 1.140363395213629e-06, + "loss": 0.9769, + "step": 6526 + }, + { + "epoch": 0.6864474739374499, + "grad_norm": 2.762517347392917, + "learning_rate": 1.139662907692978e-06, + "loss": 1.0102, + "step": 6527 + }, + { + "epoch": 0.6865526443792972, + "grad_norm": 1.928573110997746, + "learning_rate": 1.1389625718604816e-06, + "loss": 0.9984, + "step": 6528 + }, + { + "epoch": 0.6866578148211445, + "grad_norm": 2.890165560340471, + 
"learning_rate": 1.1382623877942291e-06, + "loss": 1.0161, + "step": 6529 + }, + { + "epoch": 0.6867629852629918, + "grad_norm": 2.287013440890568, + "learning_rate": 1.1375623555723008e-06, + "loss": 1.0233, + "step": 6530 + }, + { + "epoch": 0.6868681557048392, + "grad_norm": 1.865259627410808, + "learning_rate": 1.1368624752727529e-06, + "loss": 0.9824, + "step": 6531 + }, + { + "epoch": 0.6869733261466865, + "grad_norm": 2.865076892904702, + "learning_rate": 1.1361627469736286e-06, + "loss": 0.9781, + "step": 6532 + }, + { + "epoch": 0.6870784965885338, + "grad_norm": 1.8570863276832512, + "learning_rate": 1.1354631707529532e-06, + "loss": 0.9515, + "step": 6533 + }, + { + "epoch": 0.6871836670303811, + "grad_norm": 2.1759244504535022, + "learning_rate": 1.1347637466887324e-06, + "loss": 0.9344, + "step": 6534 + }, + { + "epoch": 0.6872888374722285, + "grad_norm": 2.696477773718701, + "learning_rate": 1.134064474858961e-06, + "loss": 0.9849, + "step": 6535 + }, + { + "epoch": 0.6873940079140758, + "grad_norm": 2.0827168686133066, + "learning_rate": 1.1333653553416107e-06, + "loss": 0.948, + "step": 6536 + }, + { + "epoch": 0.6874991783559231, + "grad_norm": 2.5449063654411237, + "learning_rate": 1.1326663882146407e-06, + "loss": 0.9875, + "step": 6537 + }, + { + "epoch": 0.6876043487977704, + "grad_norm": 2.1105732172457667, + "learning_rate": 1.1319675735559894e-06, + "loss": 1.0455, + "step": 6538 + }, + { + "epoch": 0.6877095192396177, + "grad_norm": 2.6112809368518044, + "learning_rate": 1.1312689114435806e-06, + "loss": 0.9998, + "step": 6539 + }, + { + "epoch": 0.687814689681465, + "grad_norm": 2.6962129988071917, + "learning_rate": 1.130570401955322e-06, + "loss": 0.974, + "step": 6540 + }, + { + "epoch": 0.6879198601233123, + "grad_norm": 2.3841373338690954, + "learning_rate": 1.129872045169101e-06, + "loss": 0.9668, + "step": 6541 + }, + { + "epoch": 0.6880250305651596, + "grad_norm": 2.00322624680125, + "learning_rate": 1.1291738411627913e-06, + "loss": 0.9997, + "step": 6542 + }, + { + "epoch": 0.688130201007007, + "grad_norm": 2.391244626370067, + "learning_rate": 1.1284757900142451e-06, + "loss": 0.9691, + "step": 6543 + }, + { + "epoch": 0.6882353714488543, + "grad_norm": 1.7816408797129248, + "learning_rate": 1.1277778918013046e-06, + "loss": 1.0039, + "step": 6544 + }, + { + "epoch": 0.6883405418907016, + "grad_norm": 1.9496538742999459, + "learning_rate": 1.127080146601788e-06, + "loss": 0.9769, + "step": 6545 + }, + { + "epoch": 0.6884457123325489, + "grad_norm": 2.7789172568168166, + "learning_rate": 1.1263825544935015e-06, + "loss": 0.9584, + "step": 6546 + }, + { + "epoch": 0.6885508827743962, + "grad_norm": 2.116800668579364, + "learning_rate": 1.1256851155542297e-06, + "loss": 0.9908, + "step": 6547 + }, + { + "epoch": 0.6886560532162436, + "grad_norm": 2.942405980467766, + "learning_rate": 1.1249878298617436e-06, + "loss": 0.9636, + "step": 6548 + }, + { + "epoch": 0.6887612236580909, + "grad_norm": 2.213773047892503, + "learning_rate": 1.124290697493797e-06, + "loss": 0.9824, + "step": 6549 + }, + { + "epoch": 0.6888663940999382, + "grad_norm": 2.1291578558657394, + "learning_rate": 1.1235937185281234e-06, + "loss": 0.9922, + "step": 6550 + }, + { + "epoch": 0.6889715645417855, + "grad_norm": 2.724578811906178, + "learning_rate": 1.1228968930424433e-06, + "loss": 0.9929, + "step": 6551 + }, + { + "epoch": 0.6890767349836329, + "grad_norm": 2.0628702408800677, + "learning_rate": 1.1222002211144567e-06, + "loss": 0.9745, + "step": 6552 + }, + { + "epoch": 
0.6891819054254802, + "grad_norm": 2.1610050278428083, + "learning_rate": 1.1215037028218484e-06, + "loss": 1.005, + "step": 6553 + }, + { + "epoch": 0.6892870758673275, + "grad_norm": 2.7787423829228874, + "learning_rate": 1.1208073382422866e-06, + "loss": 1.0103, + "step": 6554 + }, + { + "epoch": 0.6893922463091748, + "grad_norm": 2.3348483217669633, + "learning_rate": 1.1201111274534198e-06, + "loss": 0.9533, + "step": 6555 + }, + { + "epoch": 0.6894974167510222, + "grad_norm": 2.11428494125051, + "learning_rate": 1.1194150705328825e-06, + "loss": 1.0074, + "step": 6556 + }, + { + "epoch": 0.6896025871928695, + "grad_norm": 2.424483283901889, + "learning_rate": 1.1187191675582878e-06, + "loss": 1.0236, + "step": 6557 + }, + { + "epoch": 0.6897077576347168, + "grad_norm": 2.041193732298862, + "learning_rate": 1.1180234186072379e-06, + "loss": 0.9219, + "step": 6558 + }, + { + "epoch": 0.6898129280765641, + "grad_norm": 2.7736259010967075, + "learning_rate": 1.1173278237573113e-06, + "loss": 0.9917, + "step": 6559 + }, + { + "epoch": 0.6899180985184113, + "grad_norm": 2.479292231955384, + "learning_rate": 1.1166323830860745e-06, + "loss": 0.9836, + "step": 6560 + }, + { + "epoch": 0.6900232689602587, + "grad_norm": 2.286822176950764, + "learning_rate": 1.1159370966710723e-06, + "loss": 0.9802, + "step": 6561 + }, + { + "epoch": 0.690128439402106, + "grad_norm": 1.7894272908985807, + "learning_rate": 1.1152419645898355e-06, + "loss": 0.9678, + "step": 6562 + }, + { + "epoch": 0.6902336098439533, + "grad_norm": 3.264730137920692, + "learning_rate": 1.114546986919878e-06, + "loss": 1.0223, + "step": 6563 + }, + { + "epoch": 0.6903387802858006, + "grad_norm": 2.1228771331780125, + "learning_rate": 1.1138521637386928e-06, + "loss": 0.9932, + "step": 6564 + }, + { + "epoch": 0.690443950727648, + "grad_norm": 2.83441954898417, + "learning_rate": 1.1131574951237607e-06, + "loss": 0.9904, + "step": 6565 + }, + { + "epoch": 0.6905491211694953, + "grad_norm": 2.3866086648057, + "learning_rate": 1.112462981152539e-06, + "loss": 0.9736, + "step": 6566 + }, + { + "epoch": 0.6906542916113426, + "grad_norm": 2.0765111597644133, + "learning_rate": 1.1117686219024756e-06, + "loss": 0.9855, + "step": 6567 + }, + { + "epoch": 0.6907594620531899, + "grad_norm": 1.9430515714587882, + "learning_rate": 1.1110744174509952e-06, + "loss": 0.9586, + "step": 6568 + }, + { + "epoch": 0.6908646324950373, + "grad_norm": 2.722560114399417, + "learning_rate": 1.1103803678755058e-06, + "loss": 0.944, + "step": 6569 + }, + { + "epoch": 0.6909698029368846, + "grad_norm": 2.1402961489701267, + "learning_rate": 1.109686473253401e-06, + "loss": 0.995, + "step": 6570 + }, + { + "epoch": 0.6910749733787319, + "grad_norm": 2.7043321565248677, + "learning_rate": 1.1089927336620531e-06, + "loss": 1.0437, + "step": 6571 + }, + { + "epoch": 0.6911801438205792, + "grad_norm": 2.434297743887707, + "learning_rate": 1.108299149178823e-06, + "loss": 0.9765, + "step": 6572 + }, + { + "epoch": 0.6912853142624266, + "grad_norm": 1.8390287310817142, + "learning_rate": 1.107605719881048e-06, + "loss": 0.9612, + "step": 6573 + }, + { + "epoch": 0.6913904847042739, + "grad_norm": 2.270753478584313, + "learning_rate": 1.1069124458460528e-06, + "loss": 0.8822, + "step": 6574 + }, + { + "epoch": 0.6914956551461212, + "grad_norm": 2.1749999487851523, + "learning_rate": 1.1062193271511408e-06, + "loss": 0.9452, + "step": 6575 + }, + { + "epoch": 0.6916008255879685, + "grad_norm": 2.202741500698131, + "learning_rate": 1.1055263638736008e-06, + "loss": 
0.9954, + "step": 6576 + }, + { + "epoch": 0.6917059960298159, + "grad_norm": 3.070731100268695, + "learning_rate": 1.1048335560907047e-06, + "loss": 1.0214, + "step": 6577 + }, + { + "epoch": 0.6918111664716632, + "grad_norm": 2.7005615503787426, + "learning_rate": 1.1041409038797047e-06, + "loss": 1.0112, + "step": 6578 + }, + { + "epoch": 0.6919163369135105, + "grad_norm": 2.7644388909239135, + "learning_rate": 1.1034484073178377e-06, + "loss": 0.9633, + "step": 6579 + }, + { + "epoch": 0.6920215073553577, + "grad_norm": 2.01408411568597, + "learning_rate": 1.1027560664823208e-06, + "loss": 1.0022, + "step": 6580 + }, + { + "epoch": 0.692126677797205, + "grad_norm": 2.4415146472180593, + "learning_rate": 1.102063881450358e-06, + "loss": 0.9889, + "step": 6581 + }, + { + "epoch": 0.6922318482390524, + "grad_norm": 2.1342536088258277, + "learning_rate": 1.1013718522991315e-06, + "loss": 1.0117, + "step": 6582 + }, + { + "epoch": 0.6923370186808997, + "grad_norm": 2.4132414775288296, + "learning_rate": 1.100679979105809e-06, + "loss": 0.9468, + "step": 6583 + }, + { + "epoch": 0.692442189122747, + "grad_norm": 2.2328463531395295, + "learning_rate": 1.0999882619475382e-06, + "loss": 0.9449, + "step": 6584 + }, + { + "epoch": 0.6925473595645943, + "grad_norm": 2.3588285134864524, + "learning_rate": 1.0992967009014522e-06, + "loss": 0.9846, + "step": 6585 + }, + { + "epoch": 0.6926525300064417, + "grad_norm": 2.3173283765307326, + "learning_rate": 1.0986052960446658e-06, + "loss": 1.011, + "step": 6586 + }, + { + "epoch": 0.692757700448289, + "grad_norm": 2.300519093867645, + "learning_rate": 1.0979140474542743e-06, + "loss": 1.0277, + "step": 6587 + }, + { + "epoch": 0.6928628708901363, + "grad_norm": 2.392157404408049, + "learning_rate": 1.0972229552073594e-06, + "loss": 0.9473, + "step": 6588 + }, + { + "epoch": 0.6929680413319836, + "grad_norm": 2.5842800419125784, + "learning_rate": 1.0965320193809808e-06, + "loss": 0.9278, + "step": 6589 + }, + { + "epoch": 0.693073211773831, + "grad_norm": 1.891437035514997, + "learning_rate": 1.0958412400521851e-06, + "loss": 0.9947, + "step": 6590 + }, + { + "epoch": 0.6931783822156783, + "grad_norm": 2.6484100957216365, + "learning_rate": 1.0951506172979998e-06, + "loss": 1.0276, + "step": 6591 + }, + { + "epoch": 0.6932835526575256, + "grad_norm": 2.0840632815442985, + "learning_rate": 1.0944601511954328e-06, + "loss": 0.993, + "step": 6592 + }, + { + "epoch": 0.6933887230993729, + "grad_norm": 2.956513595108928, + "learning_rate": 1.0937698418214784e-06, + "loss": 1.0338, + "step": 6593 + }, + { + "epoch": 0.6934938935412203, + "grad_norm": 1.9243522516943474, + "learning_rate": 1.0930796892531092e-06, + "loss": 0.9915, + "step": 6594 + }, + { + "epoch": 0.6935990639830676, + "grad_norm": 2.1698046683173136, + "learning_rate": 1.0923896935672856e-06, + "loss": 0.9747, + "step": 6595 + }, + { + "epoch": 0.6937042344249149, + "grad_norm": 2.5425082569792092, + "learning_rate": 1.0916998548409449e-06, + "loss": 0.9799, + "step": 6596 + }, + { + "epoch": 0.6938094048667622, + "grad_norm": 2.577847600360515, + "learning_rate": 1.0910101731510113e-06, + "loss": 0.9921, + "step": 6597 + }, + { + "epoch": 0.6939145753086096, + "grad_norm": 1.8248209965382798, + "learning_rate": 1.090320648574388e-06, + "loss": 0.969, + "step": 6598 + }, + { + "epoch": 0.6940197457504569, + "grad_norm": 1.7602736317885663, + "learning_rate": 1.0896312811879634e-06, + "loss": 0.9835, + "step": 6599 + }, + { + "epoch": 0.6941249161923041, + "grad_norm": 2.958715201212913, + 
"learning_rate": 1.0889420710686077e-06, + "loss": 0.9786, + "step": 6600 + }, + { + "epoch": 0.6942300866341514, + "grad_norm": 2.029850889738827, + "learning_rate": 1.0882530182931717e-06, + "loss": 0.9684, + "step": 6601 + }, + { + "epoch": 0.6943352570759987, + "grad_norm": 2.032629169019649, + "learning_rate": 1.0875641229384918e-06, + "loss": 0.9604, + "step": 6602 + }, + { + "epoch": 0.6944404275178461, + "grad_norm": 2.6284812089984864, + "learning_rate": 1.0868753850813826e-06, + "loss": 0.9786, + "step": 6603 + }, + { + "epoch": 0.6945455979596934, + "grad_norm": 2.0592326838128967, + "learning_rate": 1.0861868047986473e-06, + "loss": 0.942, + "step": 6604 + }, + { + "epoch": 0.6946507684015407, + "grad_norm": 1.850794804171779, + "learning_rate": 1.0854983821670665e-06, + "loss": 0.9761, + "step": 6605 + }, + { + "epoch": 0.694755938843388, + "grad_norm": 2.5669897777189434, + "learning_rate": 1.0848101172634028e-06, + "loss": 1.021, + "step": 6606 + }, + { + "epoch": 0.6948611092852354, + "grad_norm": 2.6120005510022666, + "learning_rate": 1.0841220101644063e-06, + "loss": 1.0407, + "step": 6607 + }, + { + "epoch": 0.6949662797270827, + "grad_norm": 2.119486727780465, + "learning_rate": 1.0834340609468022e-06, + "loss": 1.0054, + "step": 6608 + }, + { + "epoch": 0.69507145016893, + "grad_norm": 2.2087002638922493, + "learning_rate": 1.0827462696873065e-06, + "loss": 0.9827, + "step": 6609 + }, + { + "epoch": 0.6951766206107773, + "grad_norm": 2.0129214447538137, + "learning_rate": 1.0820586364626103e-06, + "loss": 0.9977, + "step": 6610 + }, + { + "epoch": 0.6952817910526247, + "grad_norm": 2.199166327772394, + "learning_rate": 1.0813711613493922e-06, + "loss": 0.9758, + "step": 6611 + }, + { + "epoch": 0.695386961494472, + "grad_norm": 3.246740679995433, + "learning_rate": 1.080683844424309e-06, + "loss": 0.9628, + "step": 6612 + }, + { + "epoch": 0.6954921319363193, + "grad_norm": 2.2023486928412592, + "learning_rate": 1.0799966857640027e-06, + "loss": 0.9798, + "step": 6613 + }, + { + "epoch": 0.6955973023781666, + "grad_norm": 2.548062183974951, + "learning_rate": 1.0793096854450979e-06, + "loss": 0.9919, + "step": 6614 + }, + { + "epoch": 0.695702472820014, + "grad_norm": 3.1434898532802324, + "learning_rate": 1.0786228435441984e-06, + "loss": 1.0162, + "step": 6615 + }, + { + "epoch": 0.6958076432618613, + "grad_norm": 2.310428364039209, + "learning_rate": 1.0779361601378946e-06, + "loss": 0.9854, + "step": 6616 + }, + { + "epoch": 0.6959128137037086, + "grad_norm": 2.688905242521263, + "learning_rate": 1.077249635302754e-06, + "loss": 1.0092, + "step": 6617 + }, + { + "epoch": 0.6960179841455559, + "grad_norm": 2.192140291940976, + "learning_rate": 1.0765632691153333e-06, + "loss": 0.9757, + "step": 6618 + }, + { + "epoch": 0.6961231545874033, + "grad_norm": 3.107859618660363, + "learning_rate": 1.0758770616521646e-06, + "loss": 0.9874, + "step": 6619 + }, + { + "epoch": 0.6962283250292506, + "grad_norm": 1.8725342958729618, + "learning_rate": 1.0751910129897678e-06, + "loss": 0.9366, + "step": 6620 + }, + { + "epoch": 0.6963334954710978, + "grad_norm": 2.3738236012939358, + "learning_rate": 1.0745051232046399e-06, + "loss": 0.9436, + "step": 6621 + }, + { + "epoch": 0.6964386659129451, + "grad_norm": 2.438216862745785, + "learning_rate": 1.0738193923732648e-06, + "loss": 0.9679, + "step": 6622 + }, + { + "epoch": 0.6965438363547924, + "grad_norm": 2.1364655426341947, + "learning_rate": 1.0731338205721072e-06, + "loss": 0.9434, + "step": 6623 + }, + { + "epoch": 
0.6966490067966398, + "grad_norm": 3.5579171402660625, + "learning_rate": 1.0724484078776121e-06, + "loss": 1.0199, + "step": 6624 + }, + { + "epoch": 0.6967541772384871, + "grad_norm": 3.215088447682091, + "learning_rate": 1.0717631543662098e-06, + "loss": 0.9623, + "step": 6625 + }, + { + "epoch": 0.6968593476803344, + "grad_norm": 2.7336030353955234, + "learning_rate": 1.07107806011431e-06, + "loss": 1.0457, + "step": 6626 + }, + { + "epoch": 0.6969645181221817, + "grad_norm": 1.8278294216486561, + "learning_rate": 1.0703931251983068e-06, + "loss": 0.9591, + "step": 6627 + }, + { + "epoch": 0.6970696885640291, + "grad_norm": 2.316066385624242, + "learning_rate": 1.0697083496945766e-06, + "loss": 0.9709, + "step": 6628 + }, + { + "epoch": 0.6971748590058764, + "grad_norm": 2.7923043921417308, + "learning_rate": 1.0690237336794753e-06, + "loss": 1.0243, + "step": 6629 + }, + { + "epoch": 0.6972800294477237, + "grad_norm": 2.0978384840603974, + "learning_rate": 1.0683392772293446e-06, + "loss": 0.9568, + "step": 6630 + }, + { + "epoch": 0.697385199889571, + "grad_norm": 2.465884366366025, + "learning_rate": 1.0676549804205048e-06, + "loss": 0.9826, + "step": 6631 + }, + { + "epoch": 0.6974903703314184, + "grad_norm": 2.218104914348849, + "learning_rate": 1.0669708433292628e-06, + "loss": 0.9982, + "step": 6632 + }, + { + "epoch": 0.6975955407732657, + "grad_norm": 2.3301810291362233, + "learning_rate": 1.0662868660319031e-06, + "loss": 0.9371, + "step": 6633 + }, + { + "epoch": 0.697700711215113, + "grad_norm": 2.1441525236150634, + "learning_rate": 1.0656030486046965e-06, + "loss": 0.9976, + "step": 6634 + }, + { + "epoch": 0.6978058816569603, + "grad_norm": 2.557259001781231, + "learning_rate": 1.0649193911238918e-06, + "loss": 1.0343, + "step": 6635 + }, + { + "epoch": 0.6979110520988077, + "grad_norm": 2.5638852296182266, + "learning_rate": 1.064235893665723e-06, + "loss": 1.0068, + "step": 6636 + }, + { + "epoch": 0.698016222540655, + "grad_norm": 2.584073642881432, + "learning_rate": 1.063552556306406e-06, + "loss": 0.9623, + "step": 6637 + }, + { + "epoch": 0.6981213929825023, + "grad_norm": 2.3545330761686962, + "learning_rate": 1.0628693791221373e-06, + "loss": 1.0107, + "step": 6638 + }, + { + "epoch": 0.6982265634243496, + "grad_norm": 1.945064530584297, + "learning_rate": 1.0621863621890976e-06, + "loss": 0.9861, + "step": 6639 + }, + { + "epoch": 0.698331733866197, + "grad_norm": 2.0623306033113145, + "learning_rate": 1.061503505583447e-06, + "loss": 0.9795, + "step": 6640 + }, + { + "epoch": 0.6984369043080442, + "grad_norm": 2.077722057621392, + "learning_rate": 1.0608208093813299e-06, + "loss": 0.9851, + "step": 6641 + }, + { + "epoch": 0.6985420747498915, + "grad_norm": 2.304749737440805, + "learning_rate": 1.0601382736588735e-06, + "loss": 0.9362, + "step": 6642 + }, + { + "epoch": 0.6986472451917388, + "grad_norm": 2.6096925989976385, + "learning_rate": 1.059455898492184e-06, + "loss": 1.0156, + "step": 6643 + }, + { + "epoch": 0.6987524156335861, + "grad_norm": 2.201610817630821, + "learning_rate": 1.0587736839573525e-06, + "loss": 0.9828, + "step": 6644 + }, + { + "epoch": 0.6988575860754335, + "grad_norm": 1.8056953171438117, + "learning_rate": 1.058091630130451e-06, + "loss": 0.9644, + "step": 6645 + }, + { + "epoch": 0.6989627565172808, + "grad_norm": 1.966122087361216, + "learning_rate": 1.0574097370875346e-06, + "loss": 0.997, + "step": 6646 + }, + { + "epoch": 0.6990679269591281, + "grad_norm": 2.879109107208205, + "learning_rate": 1.0567280049046383e-06, + 
"loss": 0.9991, + "step": 6647 + }, + { + "epoch": 0.6991730974009754, + "grad_norm": 2.8573357829655404, + "learning_rate": 1.056046433657782e-06, + "loss": 1.0177, + "step": 6648 + }, + { + "epoch": 0.6992782678428228, + "grad_norm": 2.8507110133723863, + "learning_rate": 1.0553650234229642e-06, + "loss": 0.9884, + "step": 6649 + }, + { + "epoch": 0.6993834382846701, + "grad_norm": 1.8137801088961556, + "learning_rate": 1.054683774276169e-06, + "loss": 0.9959, + "step": 6650 + }, + { + "epoch": 0.6994886087265174, + "grad_norm": 2.6038698060831513, + "learning_rate": 1.0540026862933612e-06, + "loss": 1.0342, + "step": 6651 + }, + { + "epoch": 0.6995937791683647, + "grad_norm": 3.033587811054729, + "learning_rate": 1.0533217595504859e-06, + "loss": 0.9652, + "step": 6652 + }, + { + "epoch": 0.6996989496102121, + "grad_norm": 2.1979861729861923, + "learning_rate": 1.0526409941234728e-06, + "loss": 0.9613, + "step": 6653 + }, + { + "epoch": 0.6998041200520594, + "grad_norm": 1.958123885683921, + "learning_rate": 1.0519603900882322e-06, + "loss": 0.9348, + "step": 6654 + }, + { + "epoch": 0.6999092904939067, + "grad_norm": 2.6695995950616047, + "learning_rate": 1.0512799475206576e-06, + "loss": 1.0061, + "step": 6655 + }, + { + "epoch": 0.700014460935754, + "grad_norm": 2.3933085010130206, + "learning_rate": 1.050599666496623e-06, + "loss": 0.9999, + "step": 6656 + }, + { + "epoch": 0.7001196313776014, + "grad_norm": 2.4484506390441987, + "learning_rate": 1.0499195470919844e-06, + "loss": 0.9801, + "step": 6657 + }, + { + "epoch": 0.7002248018194487, + "grad_norm": 2.10318195808962, + "learning_rate": 1.0492395893825804e-06, + "loss": 0.9614, + "step": 6658 + }, + { + "epoch": 0.700329972261296, + "grad_norm": 2.354963154046998, + "learning_rate": 1.0485597934442323e-06, + "loss": 0.9967, + "step": 6659 + }, + { + "epoch": 0.7004351427031433, + "grad_norm": 2.4833792329948308, + "learning_rate": 1.0478801593527436e-06, + "loss": 0.9805, + "step": 6660 + }, + { + "epoch": 0.7005403131449905, + "grad_norm": 2.319460276654493, + "learning_rate": 1.0472006871838963e-06, + "loss": 0.9926, + "step": 6661 + }, + { + "epoch": 0.7006454835868379, + "grad_norm": 2.6492931586569033, + "learning_rate": 1.0465213770134591e-06, + "loss": 1.0135, + "step": 6662 + }, + { + "epoch": 0.7007506540286852, + "grad_norm": 3.0029420857299183, + "learning_rate": 1.0458422289171786e-06, + "loss": 0.9789, + "step": 6663 + }, + { + "epoch": 0.7008558244705325, + "grad_norm": 1.924682101684733, + "learning_rate": 1.0451632429707856e-06, + "loss": 0.9996, + "step": 6664 + }, + { + "epoch": 0.7009609949123798, + "grad_norm": 2.5935534112052725, + "learning_rate": 1.0444844192499938e-06, + "loss": 0.9762, + "step": 6665 + }, + { + "epoch": 0.7010661653542272, + "grad_norm": 2.4891701580984837, + "learning_rate": 1.043805757830495e-06, + "loss": 0.9801, + "step": 6666 + }, + { + "epoch": 0.7011713357960745, + "grad_norm": 3.171560808484776, + "learning_rate": 1.0431272587879662e-06, + "loss": 0.9521, + "step": 6667 + }, + { + "epoch": 0.7012765062379218, + "grad_norm": 2.360067551193987, + "learning_rate": 1.0424489221980657e-06, + "loss": 0.9547, + "step": 6668 + }, + { + "epoch": 0.7013816766797691, + "grad_norm": 2.136337297008342, + "learning_rate": 1.0417707481364338e-06, + "loss": 0.9991, + "step": 6669 + }, + { + "epoch": 0.7014868471216165, + "grad_norm": 2.070557413137076, + "learning_rate": 1.0410927366786904e-06, + "loss": 0.9918, + "step": 6670 + }, + { + "epoch": 0.7015920175634638, + "grad_norm": 
2.0832696047671546, + "learning_rate": 1.0404148879004415e-06, + "loss": 1.0046, + "step": 6671 + }, + { + "epoch": 0.7016971880053111, + "grad_norm": 3.3334838695797115, + "learning_rate": 1.0397372018772694e-06, + "loss": 0.996, + "step": 6672 + }, + { + "epoch": 0.7018023584471584, + "grad_norm": 3.2162155727197406, + "learning_rate": 1.0390596786847435e-06, + "loss": 0.9941, + "step": 6673 + }, + { + "epoch": 0.7019075288890058, + "grad_norm": 2.9010911694149053, + "learning_rate": 1.0383823183984133e-06, + "loss": 1.0102, + "step": 6674 + }, + { + "epoch": 0.7020126993308531, + "grad_norm": 1.9879071940536923, + "learning_rate": 1.0377051210938077e-06, + "loss": 1.0026, + "step": 6675 + }, + { + "epoch": 0.7021178697727004, + "grad_norm": 2.911710480192307, + "learning_rate": 1.0370280868464405e-06, + "loss": 1.016, + "step": 6676 + }, + { + "epoch": 0.7022230402145477, + "grad_norm": 2.47624061900751, + "learning_rate": 1.0363512157318076e-06, + "loss": 1.0157, + "step": 6677 + }, + { + "epoch": 0.7023282106563951, + "grad_norm": 2.7429443996244687, + "learning_rate": 1.0356745078253833e-06, + "loss": 0.9474, + "step": 6678 + }, + { + "epoch": 0.7024333810982424, + "grad_norm": 1.6759259613705626, + "learning_rate": 1.0349979632026272e-06, + "loss": 0.9684, + "step": 6679 + }, + { + "epoch": 0.7025385515400897, + "grad_norm": 2.1862048185264458, + "learning_rate": 1.0343215819389782e-06, + "loss": 1.0272, + "step": 6680 + }, + { + "epoch": 0.702643721981937, + "grad_norm": 2.8521745744648492, + "learning_rate": 1.0336453641098584e-06, + "loss": 0.9889, + "step": 6681 + }, + { + "epoch": 0.7027488924237842, + "grad_norm": 2.6870393262822474, + "learning_rate": 1.0329693097906714e-06, + "loss": 1.0578, + "step": 6682 + }, + { + "epoch": 0.7028540628656316, + "grad_norm": 2.6616207546943484, + "learning_rate": 1.0322934190568037e-06, + "loss": 0.9533, + "step": 6683 + }, + { + "epoch": 0.7029592333074789, + "grad_norm": 1.7603354792631345, + "learning_rate": 1.0316176919836207e-06, + "loss": 0.9708, + "step": 6684 + }, + { + "epoch": 0.7030644037493262, + "grad_norm": 2.3418486553788216, + "learning_rate": 1.0309421286464724e-06, + "loss": 0.9603, + "step": 6685 + }, + { + "epoch": 0.7031695741911735, + "grad_norm": 2.6938896407952133, + "learning_rate": 1.030266729120688e-06, + "loss": 1.0238, + "step": 6686 + }, + { + "epoch": 0.7032747446330209, + "grad_norm": 2.035299190868889, + "learning_rate": 1.0295914934815806e-06, + "loss": 0.9898, + "step": 6687 + }, + { + "epoch": 0.7033799150748682, + "grad_norm": 2.0038429608901978, + "learning_rate": 1.0289164218044452e-06, + "loss": 0.9689, + "step": 6688 + }, + { + "epoch": 0.7034850855167155, + "grad_norm": 2.4755202430037913, + "learning_rate": 1.0282415141645554e-06, + "loss": 0.9555, + "step": 6689 + }, + { + "epoch": 0.7035902559585628, + "grad_norm": 3.468729544937565, + "learning_rate": 1.02756677063717e-06, + "loss": 0.998, + "step": 6690 + }, + { + "epoch": 0.7036954264004102, + "grad_norm": 2.6675685372943, + "learning_rate": 1.0268921912975288e-06, + "loss": 1.0056, + "step": 6691 + }, + { + "epoch": 0.7038005968422575, + "grad_norm": 2.500797177465148, + "learning_rate": 1.0262177762208508e-06, + "loss": 0.9976, + "step": 6692 + }, + { + "epoch": 0.7039057672841048, + "grad_norm": 1.843407519469925, + "learning_rate": 1.0255435254823404e-06, + "loss": 0.9219, + "step": 6693 + }, + { + "epoch": 0.7040109377259521, + "grad_norm": 2.646055721221685, + "learning_rate": 1.02486943915718e-06, + "loss": 0.9785, + "step": 6694 + }, 
+ { + "epoch": 0.7041161081677995, + "grad_norm": 2.4428289065934554, + "learning_rate": 1.0241955173205366e-06, + "loss": 1.0164, + "step": 6695 + }, + { + "epoch": 0.7042212786096468, + "grad_norm": 2.4160293322018522, + "learning_rate": 1.0235217600475569e-06, + "loss": 1.0088, + "step": 6696 + }, + { + "epoch": 0.7043264490514941, + "grad_norm": 2.6869630614056756, + "learning_rate": 1.0228481674133719e-06, + "loss": 0.9751, + "step": 6697 + }, + { + "epoch": 0.7044316194933414, + "grad_norm": 2.230312539280299, + "learning_rate": 1.0221747394930904e-06, + "loss": 1.0008, + "step": 6698 + }, + { + "epoch": 0.7045367899351888, + "grad_norm": 2.218987057193651, + "learning_rate": 1.0215014763618054e-06, + "loss": 0.9617, + "step": 6699 + }, + { + "epoch": 0.7046419603770361, + "grad_norm": 2.5556328251578284, + "learning_rate": 1.020828378094592e-06, + "loss": 1.0079, + "step": 6700 + }, + { + "epoch": 0.7047471308188834, + "grad_norm": 2.1111202262894917, + "learning_rate": 1.0201554447665044e-06, + "loss": 0.9656, + "step": 6701 + }, + { + "epoch": 0.7048523012607306, + "grad_norm": 2.3057500589924422, + "learning_rate": 1.0194826764525811e-06, + "loss": 0.9667, + "step": 6702 + }, + { + "epoch": 0.7049574717025779, + "grad_norm": 2.438881329131322, + "learning_rate": 1.01881007322784e-06, + "loss": 1.0032, + "step": 6703 + }, + { + "epoch": 0.7050626421444253, + "grad_norm": 1.9015647486903267, + "learning_rate": 1.0181376351672817e-06, + "loss": 0.9893, + "step": 6704 + }, + { + "epoch": 0.7051678125862726, + "grad_norm": 2.3923551756484422, + "learning_rate": 1.0174653623458886e-06, + "loss": 0.987, + "step": 6705 + }, + { + "epoch": 0.7052729830281199, + "grad_norm": 2.4314151560926165, + "learning_rate": 1.0167932548386253e-06, + "loss": 1.0085, + "step": 6706 + }, + { + "epoch": 0.7053781534699672, + "grad_norm": 1.781610905130927, + "learning_rate": 1.016121312720436e-06, + "loss": 0.9624, + "step": 6707 + }, + { + "epoch": 0.7054833239118146, + "grad_norm": 2.3479526084995106, + "learning_rate": 1.0154495360662464e-06, + "loss": 1.0281, + "step": 6708 + }, + { + "epoch": 0.7055884943536619, + "grad_norm": 1.787929279606114, + "learning_rate": 1.0147779249509662e-06, + "loss": 0.9802, + "step": 6709 + }, + { + "epoch": 0.7056936647955092, + "grad_norm": 2.5085426770017407, + "learning_rate": 1.014106479449485e-06, + "loss": 0.9289, + "step": 6710 + }, + { + "epoch": 0.7057988352373565, + "grad_norm": 2.2725044679555713, + "learning_rate": 1.0134351996366749e-06, + "loss": 0.9641, + "step": 6711 + }, + { + "epoch": 0.7059040056792039, + "grad_norm": 2.364113081705728, + "learning_rate": 1.0127640855873874e-06, + "loss": 0.9668, + "step": 6712 + }, + { + "epoch": 0.7060091761210512, + "grad_norm": 3.297691941833725, + "learning_rate": 1.0120931373764572e-06, + "loss": 1.0055, + "step": 6713 + }, + { + "epoch": 0.7061143465628985, + "grad_norm": 2.1743176526856316, + "learning_rate": 1.011422355078702e-06, + "loss": 0.993, + "step": 6714 + }, + { + "epoch": 0.7062195170047458, + "grad_norm": 2.8680555666625676, + "learning_rate": 1.0107517387689168e-06, + "loss": 1.0052, + "step": 6715 + }, + { + "epoch": 0.7063246874465932, + "grad_norm": 2.3554559333451874, + "learning_rate": 1.0100812885218824e-06, + "loss": 0.9962, + "step": 6716 + }, + { + "epoch": 0.7064298578884405, + "grad_norm": 2.094277187480027, + "learning_rate": 1.0094110044123578e-06, + "loss": 0.9985, + "step": 6717 + }, + { + "epoch": 0.7065350283302878, + "grad_norm": 2.515371556231332, + "learning_rate": 
1.0087408865150852e-06, + "loss": 1.0295, + "step": 6718 + }, + { + "epoch": 0.7066401987721351, + "grad_norm": 3.0627093185609064, + "learning_rate": 1.0080709349047885e-06, + "loss": 0.9567, + "step": 6719 + }, + { + "epoch": 0.7067453692139825, + "grad_norm": 2.8156486671988894, + "learning_rate": 1.007401149656173e-06, + "loss": 0.986, + "step": 6720 + }, + { + "epoch": 0.7068505396558298, + "grad_norm": 2.2979514616422376, + "learning_rate": 1.0067315308439235e-06, + "loss": 1.0037, + "step": 6721 + }, + { + "epoch": 0.706955710097677, + "grad_norm": 3.19623467418787, + "learning_rate": 1.0060620785427083e-06, + "loss": 1.021, + "step": 6722 + }, + { + "epoch": 0.7070608805395243, + "grad_norm": 2.3549166089077667, + "learning_rate": 1.0053927928271775e-06, + "loss": 0.9937, + "step": 6723 + }, + { + "epoch": 0.7071660509813716, + "grad_norm": 2.822764978609374, + "learning_rate": 1.00472367377196e-06, + "loss": 1.0179, + "step": 6724 + }, + { + "epoch": 0.707271221423219, + "grad_norm": 2.659932951697124, + "learning_rate": 1.0040547214516698e-06, + "loss": 0.9869, + "step": 6725 + }, + { + "epoch": 0.7073763918650663, + "grad_norm": 3.185470330945337, + "learning_rate": 1.0033859359408977e-06, + "loss": 0.9823, + "step": 6726 + }, + { + "epoch": 0.7074815623069136, + "grad_norm": 2.609628396699824, + "learning_rate": 1.00271731731422e-06, + "loss": 0.9795, + "step": 6727 + }, + { + "epoch": 0.7075867327487609, + "grad_norm": 1.9750399309722628, + "learning_rate": 1.0020488656461936e-06, + "loss": 0.9989, + "step": 6728 + }, + { + "epoch": 0.7076919031906083, + "grad_norm": 2.7806599676410335, + "learning_rate": 1.001380581011354e-06, + "loss": 1.015, + "step": 6729 + }, + { + "epoch": 0.7077970736324556, + "grad_norm": 2.4361344876429527, + "learning_rate": 1.0007124634842227e-06, + "loss": 0.9652, + "step": 6730 + }, + { + "epoch": 0.7079022440743029, + "grad_norm": 2.232929886411633, + "learning_rate": 1.0000445131392975e-06, + "loss": 0.9799, + "step": 6731 + }, + { + "epoch": 0.7080074145161502, + "grad_norm": 2.1763015794617924, + "learning_rate": 9.993767300510613e-07, + "loss": 0.9916, + "step": 6732 + }, + { + "epoch": 0.7081125849579976, + "grad_norm": 2.054775097950991, + "learning_rate": 9.987091142939766e-07, + "loss": 0.9821, + "step": 6733 + }, + { + "epoch": 0.7082177553998449, + "grad_norm": 3.014980389348752, + "learning_rate": 9.980416659424894e-07, + "loss": 0.9988, + "step": 6734 + }, + { + "epoch": 0.7083229258416922, + "grad_norm": 2.2677152224867507, + "learning_rate": 9.97374385071023e-07, + "loss": 0.9856, + "step": 6735 + }, + { + "epoch": 0.7084280962835395, + "grad_norm": 2.235863059602925, + "learning_rate": 9.967072717539852e-07, + "loss": 0.9791, + "step": 6736 + }, + { + "epoch": 0.7085332667253869, + "grad_norm": 2.2178818893528867, + "learning_rate": 9.960403260657658e-07, + "loss": 0.9635, + "step": 6737 + }, + { + "epoch": 0.7086384371672342, + "grad_norm": 2.2511311829581184, + "learning_rate": 9.953735480807322e-07, + "loss": 0.9609, + "step": 6738 + }, + { + "epoch": 0.7087436076090815, + "grad_norm": 2.385032459651381, + "learning_rate": 9.947069378732372e-07, + "loss": 0.9845, + "step": 6739 + }, + { + "epoch": 0.7088487780509288, + "grad_norm": 1.9674072764447825, + "learning_rate": 9.940404955176114e-07, + "loss": 0.9351, + "step": 6740 + }, + { + "epoch": 0.7089539484927762, + "grad_norm": 2.4743599480951404, + "learning_rate": 9.933742210881688e-07, + "loss": 0.9413, + "step": 6741 + }, + { + "epoch": 0.7090591189346235, + "grad_norm": 
2.0926323669690037, + "learning_rate": 9.927081146592058e-07, + "loss": 1.0089, + "step": 6742 + }, + { + "epoch": 0.7091642893764707, + "grad_norm": 2.5941026743219453, + "learning_rate": 9.920421763049957e-07, + "loss": 1.0047, + "step": 6743 + }, + { + "epoch": 0.709269459818318, + "grad_norm": 2.8240047908465216, + "learning_rate": 9.913764060997982e-07, + "loss": 0.972, + "step": 6744 + }, + { + "epoch": 0.7093746302601653, + "grad_norm": 2.284340896157482, + "learning_rate": 9.90710804117849e-07, + "loss": 0.9517, + "step": 6745 + }, + { + "epoch": 0.7094798007020127, + "grad_norm": 2.508946771542956, + "learning_rate": 9.900453704333718e-07, + "loss": 0.9857, + "step": 6746 + }, + { + "epoch": 0.70958497114386, + "grad_norm": 2.836809183347872, + "learning_rate": 9.893801051205643e-07, + "loss": 0.9961, + "step": 6747 + }, + { + "epoch": 0.7096901415857073, + "grad_norm": 2.2219358660428927, + "learning_rate": 9.88715008253611e-07, + "loss": 0.9792, + "step": 6748 + }, + { + "epoch": 0.7097953120275546, + "grad_norm": 2.7231291268770046, + "learning_rate": 9.880500799066734e-07, + "loss": 1.0135, + "step": 6749 + }, + { + "epoch": 0.709900482469402, + "grad_norm": 2.2397683772339847, + "learning_rate": 9.873853201538972e-07, + "loss": 0.9937, + "step": 6750 + }, + { + "epoch": 0.7100056529112493, + "grad_norm": 2.4531999818310926, + "learning_rate": 9.86720729069409e-07, + "loss": 0.9821, + "step": 6751 + }, + { + "epoch": 0.7101108233530966, + "grad_norm": 2.895236414074528, + "learning_rate": 9.860563067273142e-07, + "loss": 0.9877, + "step": 6752 + }, + { + "epoch": 0.7102159937949439, + "grad_norm": 2.142507657879859, + "learning_rate": 9.853920532017027e-07, + "loss": 1.0331, + "step": 6753 + }, + { + "epoch": 0.7103211642367913, + "grad_norm": 2.1738833228712044, + "learning_rate": 9.847279685666425e-07, + "loss": 0.9896, + "step": 6754 + }, + { + "epoch": 0.7104263346786386, + "grad_norm": 4.407965262891168, + "learning_rate": 9.840640528961849e-07, + "loss": 1.0162, + "step": 6755 + }, + { + "epoch": 0.7105315051204859, + "grad_norm": 3.380279233475454, + "learning_rate": 9.834003062643616e-07, + "loss": 0.9877, + "step": 6756 + }, + { + "epoch": 0.7106366755623332, + "grad_norm": 2.2302312037350207, + "learning_rate": 9.82736728745186e-07, + "loss": 0.9992, + "step": 6757 + }, + { + "epoch": 0.7107418460041806, + "grad_norm": 2.0718248730773996, + "learning_rate": 9.82073320412652e-07, + "loss": 0.9631, + "step": 6758 + }, + { + "epoch": 0.7108470164460279, + "grad_norm": 2.766749356649403, + "learning_rate": 9.814100813407326e-07, + "loss": 1.0268, + "step": 6759 + }, + { + "epoch": 0.7109521868878752, + "grad_norm": 2.7394051021844597, + "learning_rate": 9.807470116033879e-07, + "loss": 0.9684, + "step": 6760 + }, + { + "epoch": 0.7110573573297225, + "grad_norm": 2.576083662654087, + "learning_rate": 9.800841112745524e-07, + "loss": 0.9926, + "step": 6761 + }, + { + "epoch": 0.7111625277715699, + "grad_norm": 1.998383130036782, + "learning_rate": 9.794213804281463e-07, + "loss": 0.9218, + "step": 6762 + }, + { + "epoch": 0.7112676982134171, + "grad_norm": 2.613732532405646, + "learning_rate": 9.78758819138068e-07, + "loss": 1.0229, + "step": 6763 + }, + { + "epoch": 0.7113728686552644, + "grad_norm": 2.421476761648599, + "learning_rate": 9.780964274781984e-07, + "loss": 1.0062, + "step": 6764 + }, + { + "epoch": 0.7114780390971117, + "grad_norm": 2.13715013448252, + "learning_rate": 9.774342055224006e-07, + "loss": 1.0194, + "step": 6765 + }, + { + "epoch": 
0.711583209538959, + "grad_norm": 2.3786521850695044, + "learning_rate": 9.76772153344516e-07, + "loss": 0.9514, + "step": 6766 + }, + { + "epoch": 0.7116883799808064, + "grad_norm": 2.2713978142889677, + "learning_rate": 9.761102710183698e-07, + "loss": 0.9796, + "step": 6767 + }, + { + "epoch": 0.7117935504226537, + "grad_norm": 2.165799232271969, + "learning_rate": 9.754485586177648e-07, + "loss": 0.9959, + "step": 6768 + }, + { + "epoch": 0.711898720864501, + "grad_norm": 1.8842675327288372, + "learning_rate": 9.747870162164903e-07, + "loss": 1.0099, + "step": 6769 + }, + { + "epoch": 0.7120038913063483, + "grad_norm": 1.9989490315740839, + "learning_rate": 9.741256438883108e-07, + "loss": 0.9783, + "step": 6770 + }, + { + "epoch": 0.7121090617481957, + "grad_norm": 3.0192380084749053, + "learning_rate": 9.734644417069764e-07, + "loss": 0.9645, + "step": 6771 + }, + { + "epoch": 0.712214232190043, + "grad_norm": 2.5407462322629355, + "learning_rate": 9.728034097462144e-07, + "loss": 0.9896, + "step": 6772 + }, + { + "epoch": 0.7123194026318903, + "grad_norm": 2.4113869913201023, + "learning_rate": 9.721425480797358e-07, + "loss": 0.9704, + "step": 6773 + }, + { + "epoch": 0.7124245730737376, + "grad_norm": 2.1235306653145734, + "learning_rate": 9.714818567812329e-07, + "loss": 1.0002, + "step": 6774 + }, + { + "epoch": 0.712529743515585, + "grad_norm": 2.141043980222307, + "learning_rate": 9.708213359243762e-07, + "loss": 0.9717, + "step": 6775 + }, + { + "epoch": 0.7126349139574323, + "grad_norm": 2.246816431557711, + "learning_rate": 9.701609855828202e-07, + "loss": 0.9864, + "step": 6776 + }, + { + "epoch": 0.7127400843992796, + "grad_norm": 2.2375134836918664, + "learning_rate": 9.695008058301978e-07, + "loss": 0.9901, + "step": 6777 + }, + { + "epoch": 0.7128452548411269, + "grad_norm": 2.9228849737443614, + "learning_rate": 9.688407967401248e-07, + "loss": 0.9737, + "step": 6778 + }, + { + "epoch": 0.7129504252829743, + "grad_norm": 2.8328425019223253, + "learning_rate": 9.681809583861982e-07, + "loss": 0.992, + "step": 6779 + }, + { + "epoch": 0.7130555957248216, + "grad_norm": 2.7116111720143743, + "learning_rate": 9.675212908419937e-07, + "loss": 0.9816, + "step": 6780 + }, + { + "epoch": 0.7131607661666689, + "grad_norm": 2.5984134719702983, + "learning_rate": 9.668617941810708e-07, + "loss": 0.9631, + "step": 6781 + }, + { + "epoch": 0.7132659366085162, + "grad_norm": 1.9628235706550243, + "learning_rate": 9.662024684769658e-07, + "loss": 0.9091, + "step": 6782 + }, + { + "epoch": 0.7133711070503634, + "grad_norm": 2.953890424536143, + "learning_rate": 9.655433138032022e-07, + "loss": 0.9763, + "step": 6783 + }, + { + "epoch": 0.7134762774922108, + "grad_norm": 1.873848783671468, + "learning_rate": 9.648843302332786e-07, + "loss": 0.9946, + "step": 6784 + }, + { + "epoch": 0.7135814479340581, + "grad_norm": 2.812780437855715, + "learning_rate": 9.642255178406782e-07, + "loss": 0.9467, + "step": 6785 + }, + { + "epoch": 0.7136866183759054, + "grad_norm": 2.0608044605763083, + "learning_rate": 9.635668766988618e-07, + "loss": 0.9883, + "step": 6786 + }, + { + "epoch": 0.7137917888177527, + "grad_norm": 2.38366077999555, + "learning_rate": 9.629084068812742e-07, + "loss": 0.9696, + "step": 6787 + }, + { + "epoch": 0.7138969592596001, + "grad_norm": 2.7383376317409844, + "learning_rate": 9.622501084613407e-07, + "loss": 0.9579, + "step": 6788 + }, + { + "epoch": 0.7140021297014474, + "grad_norm": 3.313699091586876, + "learning_rate": 9.615919815124647e-07, + "loss": 0.9894, + 
"step": 6789 + }, + { + "epoch": 0.7141073001432947, + "grad_norm": 2.0105423002645364, + "learning_rate": 9.609340261080343e-07, + "loss": 0.9843, + "step": 6790 + }, + { + "epoch": 0.714212470585142, + "grad_norm": 3.588279258448056, + "learning_rate": 9.602762423214146e-07, + "loss": 1.0235, + "step": 6791 + }, + { + "epoch": 0.7143176410269894, + "grad_norm": 2.430588136927392, + "learning_rate": 9.596186302259563e-07, + "loss": 1.0016, + "step": 6792 + }, + { + "epoch": 0.7144228114688367, + "grad_norm": 2.1694545531760423, + "learning_rate": 9.589611898949868e-07, + "loss": 0.9588, + "step": 6793 + }, + { + "epoch": 0.714527981910684, + "grad_norm": 2.4149358677574773, + "learning_rate": 9.583039214018152e-07, + "loss": 0.9814, + "step": 6794 + }, + { + "epoch": 0.7146331523525313, + "grad_norm": 2.272690222416524, + "learning_rate": 9.576468248197335e-07, + "loss": 0.9828, + "step": 6795 + }, + { + "epoch": 0.7147383227943787, + "grad_norm": 2.4404468267875363, + "learning_rate": 9.569899002220104e-07, + "loss": 0.9592, + "step": 6796 + }, + { + "epoch": 0.714843493236226, + "grad_norm": 2.481227510357708, + "learning_rate": 9.563331476819019e-07, + "loss": 0.9432, + "step": 6797 + }, + { + "epoch": 0.7149486636780733, + "grad_norm": 2.1951734767832924, + "learning_rate": 9.55676567272638e-07, + "loss": 1.0097, + "step": 6798 + }, + { + "epoch": 0.7150538341199206, + "grad_norm": 2.4639332073414137, + "learning_rate": 9.550201590674343e-07, + "loss": 0.9852, + "step": 6799 + }, + { + "epoch": 0.715159004561768, + "grad_norm": 2.1444939700922254, + "learning_rate": 9.54363923139484e-07, + "loss": 0.9932, + "step": 6800 + }, + { + "epoch": 0.7152641750036153, + "grad_norm": 2.9435311101102988, + "learning_rate": 9.53707859561963e-07, + "loss": 0.9227, + "step": 6801 + }, + { + "epoch": 0.7153693454454626, + "grad_norm": 2.7613347609604024, + "learning_rate": 9.530519684080289e-07, + "loss": 1.0551, + "step": 6802 + }, + { + "epoch": 0.7154745158873099, + "grad_norm": 3.138642130105925, + "learning_rate": 9.523962497508163e-07, + "loss": 0.9815, + "step": 6803 + }, + { + "epoch": 0.7155796863291571, + "grad_norm": 2.6016824562031577, + "learning_rate": 9.517407036634449e-07, + "loss": 0.9368, + "step": 6804 + }, + { + "epoch": 0.7156848567710045, + "grad_norm": 2.9399166189880686, + "learning_rate": 9.510853302190107e-07, + "loss": 0.9755, + "step": 6805 + }, + { + "epoch": 0.7157900272128518, + "grad_norm": 2.3854715791155123, + "learning_rate": 9.504301294905966e-07, + "loss": 1.0069, + "step": 6806 + }, + { + "epoch": 0.7158951976546991, + "grad_norm": 2.3367169758597504, + "learning_rate": 9.497751015512593e-07, + "loss": 0.9986, + "step": 6807 + }, + { + "epoch": 0.7160003680965464, + "grad_norm": 2.8377663825658592, + "learning_rate": 9.491202464740415e-07, + "loss": 1.0053, + "step": 6808 + }, + { + "epoch": 0.7161055385383938, + "grad_norm": 2.390482144562161, + "learning_rate": 9.484655643319643e-07, + "loss": 0.9992, + "step": 6809 + }, + { + "epoch": 0.7162107089802411, + "grad_norm": 2.942094456519339, + "learning_rate": 9.478110551980274e-07, + "loss": 1.0179, + "step": 6810 + }, + { + "epoch": 0.7163158794220884, + "grad_norm": 2.360695205905589, + "learning_rate": 9.471567191452175e-07, + "loss": 1.0237, + "step": 6811 + }, + { + "epoch": 0.7164210498639357, + "grad_norm": 2.403024984734448, + "learning_rate": 9.465025562464952e-07, + "loss": 0.99, + "step": 6812 + }, + { + "epoch": 0.7165262203057831, + "grad_norm": 2.3364437770617554, + "learning_rate": 
9.458485665748071e-07, + "loss": 0.9672, + "step": 6813 + }, + { + "epoch": 0.7166313907476304, + "grad_norm": 2.158609622375604, + "learning_rate": 9.451947502030759e-07, + "loss": 0.9517, + "step": 6814 + }, + { + "epoch": 0.7167365611894777, + "grad_norm": 2.1055972278729715, + "learning_rate": 9.445411072042083e-07, + "loss": 0.964, + "step": 6815 + }, + { + "epoch": 0.716841731631325, + "grad_norm": 2.6035591950083155, + "learning_rate": 9.438876376510911e-07, + "loss": 0.9857, + "step": 6816 + }, + { + "epoch": 0.7169469020731724, + "grad_norm": 2.834112744073984, + "learning_rate": 9.432343416165899e-07, + "loss": 1.0336, + "step": 6817 + }, + { + "epoch": 0.7170520725150197, + "grad_norm": 2.479880902942861, + "learning_rate": 9.425812191735539e-07, + "loss": 1.0126, + "step": 6818 + }, + { + "epoch": 0.717157242956867, + "grad_norm": 1.8559851857923326, + "learning_rate": 9.419282703948085e-07, + "loss": 0.9631, + "step": 6819 + }, + { + "epoch": 0.7172624133987143, + "grad_norm": 1.8082543081529459, + "learning_rate": 9.412754953531664e-07, + "loss": 0.99, + "step": 6820 + }, + { + "epoch": 0.7173675838405617, + "grad_norm": 2.2523841295243123, + "learning_rate": 9.406228941214143e-07, + "loss": 0.9886, + "step": 6821 + }, + { + "epoch": 0.717472754282409, + "grad_norm": 2.457867045065379, + "learning_rate": 9.399704667723239e-07, + "loss": 1.0095, + "step": 6822 + }, + { + "epoch": 0.7175779247242563, + "grad_norm": 3.355058501321726, + "learning_rate": 9.393182133786443e-07, + "loss": 1.0292, + "step": 6823 + }, + { + "epoch": 0.7176830951661035, + "grad_norm": 2.3934295140430466, + "learning_rate": 9.386661340131078e-07, + "loss": 0.9797, + "step": 6824 + }, + { + "epoch": 0.7177882656079508, + "grad_norm": 4.421047798769204, + "learning_rate": 9.380142287484273e-07, + "loss": 0.9442, + "step": 6825 + }, + { + "epoch": 0.7178934360497982, + "grad_norm": 2.1419410112275603, + "learning_rate": 9.373624976572931e-07, + "loss": 1.0034, + "step": 6826 + }, + { + "epoch": 0.7179986064916455, + "grad_norm": 2.446158128193031, + "learning_rate": 9.367109408123803e-07, + "loss": 0.9845, + "step": 6827 + }, + { + "epoch": 0.7181037769334928, + "grad_norm": 2.8329699515453965, + "learning_rate": 9.3605955828634e-07, + "loss": 1.0246, + "step": 6828 + }, + { + "epoch": 0.7182089473753401, + "grad_norm": 2.780651175983165, + "learning_rate": 9.354083501518097e-07, + "loss": 1.0073, + "step": 6829 + }, + { + "epoch": 0.7183141178171875, + "grad_norm": 2.4642646857763753, + "learning_rate": 9.347573164814025e-07, + "loss": 0.9969, + "step": 6830 + }, + { + "epoch": 0.7184192882590348, + "grad_norm": 2.8457696249018243, + "learning_rate": 9.34106457347713e-07, + "loss": 1.0158, + "step": 6831 + }, + { + "epoch": 0.7185244587008821, + "grad_norm": 2.5543405389378275, + "learning_rate": 9.334557728233185e-07, + "loss": 0.9736, + "step": 6832 + }, + { + "epoch": 0.7186296291427294, + "grad_norm": 1.8641262528029885, + "learning_rate": 9.328052629807729e-07, + "loss": 0.98, + "step": 6833 + }, + { + "epoch": 0.7187347995845768, + "grad_norm": 2.5847950268121416, + "learning_rate": 9.32154927892617e-07, + "loss": 0.9441, + "step": 6834 + }, + { + "epoch": 0.7188399700264241, + "grad_norm": 2.129043724063401, + "learning_rate": 9.315047676313648e-07, + "loss": 0.9554, + "step": 6835 + }, + { + "epoch": 0.7189451404682714, + "grad_norm": 2.324839688594734, + "learning_rate": 9.308547822695166e-07, + "loss": 1.0237, + "step": 6836 + }, + { + "epoch": 0.7190503109101187, + "grad_norm": 
1.9345404654516318, + "learning_rate": 9.302049718795489e-07, + "loss": 0.9591, + "step": 6837 + }, + { + "epoch": 0.719155481351966, + "grad_norm": 1.7931929866422445, + "learning_rate": 9.295553365339213e-07, + "loss": 0.9902, + "step": 6838 + }, + { + "epoch": 0.7192606517938134, + "grad_norm": 2.244608243473127, + "learning_rate": 9.289058763050743e-07, + "loss": 0.9788, + "step": 6839 + }, + { + "epoch": 0.7193658222356607, + "grad_norm": 2.381440258340103, + "learning_rate": 9.282565912654257e-07, + "loss": 0.9811, + "step": 6840 + }, + { + "epoch": 0.719470992677508, + "grad_norm": 2.645643314379612, + "learning_rate": 9.276074814873778e-07, + "loss": 0.9656, + "step": 6841 + }, + { + "epoch": 0.7195761631193553, + "grad_norm": 2.2621136426873374, + "learning_rate": 9.26958547043309e-07, + "loss": 0.9748, + "step": 6842 + }, + { + "epoch": 0.7196813335612027, + "grad_norm": 2.3967219964591155, + "learning_rate": 9.263097880055835e-07, + "loss": 0.9744, + "step": 6843 + }, + { + "epoch": 0.7197865040030499, + "grad_norm": 3.4195444597169553, + "learning_rate": 9.256612044465407e-07, + "loss": 1.0398, + "step": 6844 + }, + { + "epoch": 0.7198916744448972, + "grad_norm": 1.8349056835941067, + "learning_rate": 9.250127964385045e-07, + "loss": 0.9782, + "step": 6845 + }, + { + "epoch": 0.7199968448867445, + "grad_norm": 3.080343200685503, + "learning_rate": 9.243645640537755e-07, + "loss": 0.9775, + "step": 6846 + }, + { + "epoch": 0.7201020153285919, + "grad_norm": 2.207569417842817, + "learning_rate": 9.237165073646376e-07, + "loss": 0.9864, + "step": 6847 + }, + { + "epoch": 0.7202071857704392, + "grad_norm": 1.9935368383262349, + "learning_rate": 9.230686264433547e-07, + "loss": 0.962, + "step": 6848 + }, + { + "epoch": 0.7203123562122865, + "grad_norm": 3.0768788487950114, + "learning_rate": 9.224209213621693e-07, + "loss": 0.9913, + "step": 6849 + }, + { + "epoch": 0.7204175266541338, + "grad_norm": 3.593231825237885, + "learning_rate": 9.217733921933073e-07, + "loss": 1.0568, + "step": 6850 + }, + { + "epoch": 0.7205226970959812, + "grad_norm": 2.35712856290603, + "learning_rate": 9.21126039008971e-07, + "loss": 1.0247, + "step": 6851 + }, + { + "epoch": 0.7206278675378285, + "grad_norm": 2.060779548724852, + "learning_rate": 9.204788618813468e-07, + "loss": 0.986, + "step": 6852 + }, + { + "epoch": 0.7207330379796758, + "grad_norm": 1.8881478246910306, + "learning_rate": 9.198318608826001e-07, + "loss": 0.9918, + "step": 6853 + }, + { + "epoch": 0.7208382084215231, + "grad_norm": 1.9601162572993023, + "learning_rate": 9.191850360848756e-07, + "loss": 0.9542, + "step": 6854 + }, + { + "epoch": 0.7209433788633705, + "grad_norm": 1.982841411610949, + "learning_rate": 9.185383875603004e-07, + "loss": 1.0137, + "step": 6855 + }, + { + "epoch": 0.7210485493052178, + "grad_norm": 2.5279779328379943, + "learning_rate": 9.178919153809787e-07, + "loss": 1.0106, + "step": 6856 + }, + { + "epoch": 0.7211537197470651, + "grad_norm": 3.6580469329228826, + "learning_rate": 9.172456196190002e-07, + "loss": 0.988, + "step": 6857 + }, + { + "epoch": 0.7212588901889124, + "grad_norm": 2.1551774200448075, + "learning_rate": 9.165995003464295e-07, + "loss": 0.998, + "step": 6858 + }, + { + "epoch": 0.7213640606307598, + "grad_norm": 2.7261206696045357, + "learning_rate": 9.15953557635316e-07, + "loss": 0.9893, + "step": 6859 + }, + { + "epoch": 0.7214692310726071, + "grad_norm": 2.651636066544782, + "learning_rate": 9.153077915576849e-07, + "loss": 0.9691, + "step": 6860 + }, + { + "epoch": 
0.7215744015144544, + "grad_norm": 3.4772896538999896, + "learning_rate": 9.146622021855455e-07, + "loss": 1.0076, + "step": 6861 + }, + { + "epoch": 0.7216795719563017, + "grad_norm": 2.378757397637525, + "learning_rate": 9.140167895908867e-07, + "loss": 1.0138, + "step": 6862 + }, + { + "epoch": 0.721784742398149, + "grad_norm": 1.9309802042614799, + "learning_rate": 9.133715538456753e-07, + "loss": 0.9607, + "step": 6863 + }, + { + "epoch": 0.7218899128399964, + "grad_norm": 2.4260010858714534, + "learning_rate": 9.12726495021862e-07, + "loss": 0.995, + "step": 6864 + }, + { + "epoch": 0.7219950832818436, + "grad_norm": 2.755072867504919, + "learning_rate": 9.12081613191374e-07, + "loss": 0.9962, + "step": 6865 + }, + { + "epoch": 0.7221002537236909, + "grad_norm": 2.318026979345631, + "learning_rate": 9.114369084261215e-07, + "loss": 0.9444, + "step": 6866 + }, + { + "epoch": 0.7222054241655382, + "grad_norm": 2.6082295680717023, + "learning_rate": 9.107923807979948e-07, + "loss": 1.0035, + "step": 6867 + }, + { + "epoch": 0.7223105946073856, + "grad_norm": 2.3969924183402185, + "learning_rate": 9.101480303788623e-07, + "loss": 0.9672, + "step": 6868 + }, + { + "epoch": 0.7224157650492329, + "grad_norm": 2.649977228442466, + "learning_rate": 9.095038572405751e-07, + "loss": 1.029, + "step": 6869 + }, + { + "epoch": 0.7225209354910802, + "grad_norm": 2.0300044348072186, + "learning_rate": 9.088598614549629e-07, + "loss": 0.9735, + "step": 6870 + }, + { + "epoch": 0.7226261059329275, + "grad_norm": 1.9663312006008373, + "learning_rate": 9.082160430938375e-07, + "loss": 0.9986, + "step": 6871 + }, + { + "epoch": 0.7227312763747749, + "grad_norm": 2.09299815663197, + "learning_rate": 9.075724022289878e-07, + "loss": 1.0071, + "step": 6872 + }, + { + "epoch": 0.7228364468166222, + "grad_norm": 2.8696877774023886, + "learning_rate": 9.069289389321864e-07, + "loss": 0.9965, + "step": 6873 + }, + { + "epoch": 0.7229416172584695, + "grad_norm": 1.8588664189025497, + "learning_rate": 9.062856532751832e-07, + "loss": 0.9622, + "step": 6874 + }, + { + "epoch": 0.7230467877003168, + "grad_norm": 2.477197723105134, + "learning_rate": 9.056425453297099e-07, + "loss": 0.9691, + "step": 6875 + }, + { + "epoch": 0.7231519581421642, + "grad_norm": 3.2970809410578554, + "learning_rate": 9.04999615167479e-07, + "loss": 0.9987, + "step": 6876 + }, + { + "epoch": 0.7232571285840115, + "grad_norm": 3.0574404452300183, + "learning_rate": 9.043568628601807e-07, + "loss": 1.0004, + "step": 6877 + }, + { + "epoch": 0.7233622990258588, + "grad_norm": 2.3164982947871096, + "learning_rate": 9.037142884794881e-07, + "loss": 0.9689, + "step": 6878 + }, + { + "epoch": 0.7234674694677061, + "grad_norm": 1.705348459141377, + "learning_rate": 9.030718920970513e-07, + "loss": 0.99, + "step": 6879 + }, + { + "epoch": 0.7235726399095535, + "grad_norm": 1.8734205753292643, + "learning_rate": 9.024296737845056e-07, + "loss": 1.0241, + "step": 6880 + }, + { + "epoch": 0.7236778103514008, + "grad_norm": 2.226219420147319, + "learning_rate": 9.017876336134615e-07, + "loss": 0.9465, + "step": 6881 + }, + { + "epoch": 0.7237829807932481, + "grad_norm": 2.3638808966827827, + "learning_rate": 9.011457716555108e-07, + "loss": 0.9356, + "step": 6882 + }, + { + "epoch": 0.7238881512350954, + "grad_norm": 2.274163453037241, + "learning_rate": 9.005040879822269e-07, + "loss": 0.9991, + "step": 6883 + }, + { + "epoch": 0.7239933216769427, + "grad_norm": 2.3270361322264423, + "learning_rate": 8.998625826651624e-07, + "loss": 0.9753, + 
"step": 6884 + }, + { + "epoch": 0.72409849211879, + "grad_norm": 2.380779478990076, + "learning_rate": 8.992212557758515e-07, + "loss": 0.9749, + "step": 6885 + }, + { + "epoch": 0.7242036625606373, + "grad_norm": 2.5279858961997146, + "learning_rate": 8.985801073858047e-07, + "loss": 1.0178, + "step": 6886 + }, + { + "epoch": 0.7243088330024846, + "grad_norm": 2.264259809683486, + "learning_rate": 8.979391375665169e-07, + "loss": 0.9952, + "step": 6887 + }, + { + "epoch": 0.7244140034443319, + "grad_norm": 2.3465532374579468, + "learning_rate": 8.972983463894599e-07, + "loss": 0.9717, + "step": 6888 + }, + { + "epoch": 0.7245191738861793, + "grad_norm": 2.682561586616032, + "learning_rate": 8.966577339260874e-07, + "loss": 0.9911, + "step": 6889 + }, + { + "epoch": 0.7246243443280266, + "grad_norm": 3.0023908265682917, + "learning_rate": 8.960173002478336e-07, + "loss": 0.9918, + "step": 6890 + }, + { + "epoch": 0.7247295147698739, + "grad_norm": 2.2766548656250625, + "learning_rate": 8.953770454261102e-07, + "loss": 0.9655, + "step": 6891 + }, + { + "epoch": 0.7248346852117212, + "grad_norm": 2.83034363859963, + "learning_rate": 8.947369695323113e-07, + "loss": 0.9809, + "step": 6892 + }, + { + "epoch": 0.7249398556535686, + "grad_norm": 2.4788072341230527, + "learning_rate": 8.940970726378106e-07, + "loss": 1.0237, + "step": 6893 + }, + { + "epoch": 0.7250450260954159, + "grad_norm": 2.711458177748327, + "learning_rate": 8.934573548139621e-07, + "loss": 0.9478, + "step": 6894 + }, + { + "epoch": 0.7251501965372632, + "grad_norm": 2.9684298621784375, + "learning_rate": 8.928178161320977e-07, + "loss": 1.0247, + "step": 6895 + }, + { + "epoch": 0.7252553669791105, + "grad_norm": 1.6958237240093583, + "learning_rate": 8.921784566635328e-07, + "loss": 1.0107, + "step": 6896 + }, + { + "epoch": 0.7253605374209579, + "grad_norm": 2.608563466979351, + "learning_rate": 8.915392764795592e-07, + "loss": 0.9734, + "step": 6897 + }, + { + "epoch": 0.7254657078628052, + "grad_norm": 2.465172273583475, + "learning_rate": 8.909002756514509e-07, + "loss": 0.9729, + "step": 6898 + }, + { + "epoch": 0.7255708783046525, + "grad_norm": 2.179996715866093, + "learning_rate": 8.902614542504631e-07, + "loss": 0.9561, + "step": 6899 + }, + { + "epoch": 0.7256760487464998, + "grad_norm": 1.9710483091820064, + "learning_rate": 8.896228123478268e-07, + "loss": 0.9673, + "step": 6900 + }, + { + "epoch": 0.7257812191883471, + "grad_norm": 2.3895996851177923, + "learning_rate": 8.889843500147577e-07, + "loss": 0.9992, + "step": 6901 + }, + { + "epoch": 0.7258863896301945, + "grad_norm": 2.4070929276655026, + "learning_rate": 8.883460673224478e-07, + "loss": 1.0059, + "step": 6902 + }, + { + "epoch": 0.7259915600720418, + "grad_norm": 2.52826203988613, + "learning_rate": 8.877079643420708e-07, + "loss": 0.9768, + "step": 6903 + }, + { + "epoch": 0.7260967305138891, + "grad_norm": 2.6173671239261616, + "learning_rate": 8.870700411447817e-07, + "loss": 0.9739, + "step": 6904 + }, + { + "epoch": 0.7262019009557364, + "grad_norm": 2.022268173256108, + "learning_rate": 8.864322978017114e-07, + "loss": 0.9741, + "step": 6905 + }, + { + "epoch": 0.7263070713975837, + "grad_norm": 3.336022838388577, + "learning_rate": 8.857947343839749e-07, + "loss": 0.9924, + "step": 6906 + }, + { + "epoch": 0.726412241839431, + "grad_norm": 3.1049563608044135, + "learning_rate": 8.851573509626649e-07, + "loss": 0.9864, + "step": 6907 + }, + { + "epoch": 0.7265174122812783, + "grad_norm": 2.2360381413354036, + "learning_rate": 
8.845201476088558e-07, + "loss": 0.9817, + "step": 6908 + }, + { + "epoch": 0.7266225827231256, + "grad_norm": 2.431440384321841, + "learning_rate": 8.838831243935988e-07, + "loss": 1.006, + "step": 6909 + }, + { + "epoch": 0.726727753164973, + "grad_norm": 2.672270703566811, + "learning_rate": 8.832462813879289e-07, + "loss": 0.993, + "step": 6910 + }, + { + "epoch": 0.7268329236068203, + "grad_norm": 2.3586907446017147, + "learning_rate": 8.826096186628568e-07, + "loss": 0.9827, + "step": 6911 + }, + { + "epoch": 0.7269380940486676, + "grad_norm": 2.3158398975904197, + "learning_rate": 8.819731362893769e-07, + "loss": 0.9767, + "step": 6912 + }, + { + "epoch": 0.7270432644905149, + "grad_norm": 2.7399363636718146, + "learning_rate": 8.813368343384621e-07, + "loss": 1.014, + "step": 6913 + }, + { + "epoch": 0.7271484349323623, + "grad_norm": 2.308433695829236, + "learning_rate": 8.807007128810638e-07, + "loss": 1.0087, + "step": 6914 + }, + { + "epoch": 0.7272536053742096, + "grad_norm": 3.0347699529423333, + "learning_rate": 8.800647719881153e-07, + "loss": 0.9805, + "step": 6915 + }, + { + "epoch": 0.7273587758160569, + "grad_norm": 2.387065454939189, + "learning_rate": 8.794290117305296e-07, + "loss": 0.9813, + "step": 6916 + }, + { + "epoch": 0.7274639462579042, + "grad_norm": 2.8287122692548468, + "learning_rate": 8.787934321791972e-07, + "loss": 0.9892, + "step": 6917 + }, + { + "epoch": 0.7275691166997516, + "grad_norm": 2.377152435206834, + "learning_rate": 8.781580334049919e-07, + "loss": 0.9678, + "step": 6918 + }, + { + "epoch": 0.7276742871415989, + "grad_norm": 3.7839274906805667, + "learning_rate": 8.77522815478764e-07, + "loss": 1.0138, + "step": 6919 + }, + { + "epoch": 0.7277794575834462, + "grad_norm": 2.3333914671728175, + "learning_rate": 8.768877784713458e-07, + "loss": 1.0377, + "step": 6920 + }, + { + "epoch": 0.7278846280252935, + "grad_norm": 2.4232211693938637, + "learning_rate": 8.762529224535496e-07, + "loss": 0.9752, + "step": 6921 + }, + { + "epoch": 0.7279897984671408, + "grad_norm": 2.3166357731448946, + "learning_rate": 8.756182474961666e-07, + "loss": 0.9823, + "step": 6922 + }, + { + "epoch": 0.7280949689089882, + "grad_norm": 2.599714079191184, + "learning_rate": 8.749837536699671e-07, + "loss": 1.0219, + "step": 6923 + }, + { + "epoch": 0.7282001393508355, + "grad_norm": 2.758481393241142, + "learning_rate": 8.743494410457032e-07, + "loss": 1.0016, + "step": 6924 + }, + { + "epoch": 0.7283053097926828, + "grad_norm": 2.801903529605539, + "learning_rate": 8.737153096941045e-07, + "loss": 0.9807, + "step": 6925 + }, + { + "epoch": 0.72841048023453, + "grad_norm": 2.132005701744301, + "learning_rate": 8.730813596858823e-07, + "loss": 1.0284, + "step": 6926 + }, + { + "epoch": 0.7285156506763774, + "grad_norm": 2.645513420655628, + "learning_rate": 8.724475910917274e-07, + "loss": 1.0083, + "step": 6927 + }, + { + "epoch": 0.7286208211182247, + "grad_norm": 2.7201477772204816, + "learning_rate": 8.718140039823086e-07, + "loss": 0.9533, + "step": 6928 + }, + { + "epoch": 0.728725991560072, + "grad_norm": 1.9346726122158562, + "learning_rate": 8.711805984282767e-07, + "loss": 0.972, + "step": 6929 + }, + { + "epoch": 0.7288311620019193, + "grad_norm": 2.5900806982874673, + "learning_rate": 8.70547374500261e-07, + "loss": 1.0068, + "step": 6930 + }, + { + "epoch": 0.7289363324437667, + "grad_norm": 2.269593721723228, + "learning_rate": 8.699143322688719e-07, + "loss": 0.982, + "step": 6931 + }, + { + "epoch": 0.729041502885614, + "grad_norm": 
3.3136833165123636, + "learning_rate": 8.692814718046979e-07, + "loss": 0.9883, + "step": 6932 + }, + { + "epoch": 0.7291466733274613, + "grad_norm": 2.360602310285996, + "learning_rate": 8.686487931783067e-07, + "loss": 0.9928, + "step": 6933 + }, + { + "epoch": 0.7292518437693086, + "grad_norm": 2.231311892385931, + "learning_rate": 8.680162964602479e-07, + "loss": 1.0011, + "step": 6934 + }, + { + "epoch": 0.729357014211156, + "grad_norm": 2.2504241332774653, + "learning_rate": 8.673839817210497e-07, + "loss": 0.9558, + "step": 6935 + }, + { + "epoch": 0.7294621846530033, + "grad_norm": 1.758728210506665, + "learning_rate": 8.66751849031221e-07, + "loss": 0.9341, + "step": 6936 + }, + { + "epoch": 0.7295673550948506, + "grad_norm": 2.618947911589737, + "learning_rate": 8.661198984612476e-07, + "loss": 0.9822, + "step": 6937 + }, + { + "epoch": 0.7296725255366979, + "grad_norm": 2.6255176262261877, + "learning_rate": 8.654881300815981e-07, + "loss": 0.9986, + "step": 6938 + }, + { + "epoch": 0.7297776959785452, + "grad_norm": 2.858506049180034, + "learning_rate": 8.648565439627205e-07, + "loss": 0.9505, + "step": 6939 + }, + { + "epoch": 0.7298828664203926, + "grad_norm": 2.0177047337756906, + "learning_rate": 8.642251401750395e-07, + "loss": 0.9817, + "step": 6940 + }, + { + "epoch": 0.7299880368622399, + "grad_norm": 2.414021437249756, + "learning_rate": 8.635939187889633e-07, + "loss": 0.9816, + "step": 6941 + }, + { + "epoch": 0.7300932073040872, + "grad_norm": 2.398138605350569, + "learning_rate": 8.629628798748763e-07, + "loss": 0.9552, + "step": 6942 + }, + { + "epoch": 0.7301983777459345, + "grad_norm": 3.202783456390097, + "learning_rate": 8.623320235031452e-07, + "loss": 0.98, + "step": 6943 + }, + { + "epoch": 0.7303035481877819, + "grad_norm": 1.9008924047025504, + "learning_rate": 8.617013497441154e-07, + "loss": 0.9407, + "step": 6944 + }, + { + "epoch": 0.7304087186296292, + "grad_norm": 2.0495458812583873, + "learning_rate": 8.610708586681127e-07, + "loss": 1.0026, + "step": 6945 + }, + { + "epoch": 0.7305138890714764, + "grad_norm": 2.281006292586853, + "learning_rate": 8.604405503454399e-07, + "loss": 0.968, + "step": 6946 + }, + { + "epoch": 0.7306190595133237, + "grad_norm": 2.8232403754432376, + "learning_rate": 8.598104248463823e-07, + "loss": 0.9745, + "step": 6947 + }, + { + "epoch": 0.7307242299551711, + "grad_norm": 3.0990957513005486, + "learning_rate": 8.591804822412048e-07, + "loss": 0.9712, + "step": 6948 + }, + { + "epoch": 0.7308294003970184, + "grad_norm": 2.948097662188505, + "learning_rate": 8.585507226001488e-07, + "loss": 0.972, + "step": 6949 + }, + { + "epoch": 0.7309345708388657, + "grad_norm": 2.4648039136536304, + "learning_rate": 8.579211459934394e-07, + "loss": 0.9927, + "step": 6950 + }, + { + "epoch": 0.731039741280713, + "grad_norm": 3.8091080476673755, + "learning_rate": 8.572917524912777e-07, + "loss": 0.9932, + "step": 6951 + }, + { + "epoch": 0.7311449117225604, + "grad_norm": 2.2912070211331597, + "learning_rate": 8.566625421638464e-07, + "loss": 1.0069, + "step": 6952 + }, + { + "epoch": 0.7312500821644077, + "grad_norm": 2.769316491698749, + "learning_rate": 8.560335150813081e-07, + "loss": 0.9821, + "step": 6953 + }, + { + "epoch": 0.731355252606255, + "grad_norm": 4.834866721749002, + "learning_rate": 8.554046713138034e-07, + "loss": 0.96, + "step": 6954 + }, + { + "epoch": 0.7314604230481023, + "grad_norm": 2.262988850257133, + "learning_rate": 8.54776010931454e-07, + "loss": 0.97, + "step": 6955 + }, + { + "epoch": 
0.7315655934899497, + "grad_norm": 2.4207174619006913, + "learning_rate": 8.54147534004359e-07, + "loss": 0.9787, + "step": 6956 + }, + { + "epoch": 0.731670763931797, + "grad_norm": 2.6566870598983052, + "learning_rate": 8.535192406025997e-07, + "loss": 0.9971, + "step": 6957 + }, + { + "epoch": 0.7317759343736443, + "grad_norm": 2.3775122168529137, + "learning_rate": 8.52891130796235e-07, + "loss": 0.9409, + "step": 6958 + }, + { + "epoch": 0.7318811048154916, + "grad_norm": 2.5975495470680183, + "learning_rate": 8.522632046553056e-07, + "loss": 1.0159, + "step": 6959 + }, + { + "epoch": 0.731986275257339, + "grad_norm": 2.551497070086668, + "learning_rate": 8.516354622498279e-07, + "loss": 0.9776, + "step": 6960 + }, + { + "epoch": 0.7320914456991863, + "grad_norm": 2.030647013013727, + "learning_rate": 8.510079036498012e-07, + "loss": 0.9495, + "step": 6961 + }, + { + "epoch": 0.7321966161410336, + "grad_norm": 2.530716610861503, + "learning_rate": 8.503805289252037e-07, + "loss": 1.0091, + "step": 6962 + }, + { + "epoch": 0.7323017865828809, + "grad_norm": 2.8009461610833113, + "learning_rate": 8.497533381459914e-07, + "loss": 1.0142, + "step": 6963 + }, + { + "epoch": 0.7324069570247282, + "grad_norm": 2.7103271804572553, + "learning_rate": 8.491263313821021e-07, + "loss": 0.9479, + "step": 6964 + }, + { + "epoch": 0.7325121274665756, + "grad_norm": 3.0620007557697995, + "learning_rate": 8.484995087034506e-07, + "loss": 0.9684, + "step": 6965 + }, + { + "epoch": 0.7326172979084229, + "grad_norm": 2.310056025863538, + "learning_rate": 8.47872870179933e-07, + "loss": 1.0122, + "step": 6966 + }, + { + "epoch": 0.7327224683502701, + "grad_norm": 2.2752199543586933, + "learning_rate": 8.472464158814256e-07, + "loss": 0.9815, + "step": 6967 + }, + { + "epoch": 0.7328276387921174, + "grad_norm": 2.499533329716282, + "learning_rate": 8.466201458777809e-07, + "loss": 0.9966, + "step": 6968 + }, + { + "epoch": 0.7329328092339648, + "grad_norm": 3.0605576051148433, + "learning_rate": 8.459940602388345e-07, + "loss": 0.9442, + "step": 6969 + }, + { + "epoch": 0.7330379796758121, + "grad_norm": 3.0303954415055334, + "learning_rate": 8.453681590343979e-07, + "loss": 1.012, + "step": 6970 + }, + { + "epoch": 0.7331431501176594, + "grad_norm": 2.3529258666071855, + "learning_rate": 8.447424423342665e-07, + "loss": 1.0134, + "step": 6971 + }, + { + "epoch": 0.7332483205595067, + "grad_norm": 1.653851753950748, + "learning_rate": 8.441169102082106e-07, + "loss": 0.9661, + "step": 6972 + }, + { + "epoch": 0.733353491001354, + "grad_norm": 2.3418082947379775, + "learning_rate": 8.434915627259832e-07, + "loss": 1.0021, + "step": 6973 + }, + { + "epoch": 0.7334586614432014, + "grad_norm": 3.8928938699639724, + "learning_rate": 8.428663999573142e-07, + "loss": 0.9314, + "step": 6974 + }, + { + "epoch": 0.7335638318850487, + "grad_norm": 3.099834663643491, + "learning_rate": 8.422414219719147e-07, + "loss": 0.9996, + "step": 6975 + }, + { + "epoch": 0.733669002326896, + "grad_norm": 1.8811632859332335, + "learning_rate": 8.416166288394751e-07, + "loss": 0.9822, + "step": 6976 + }, + { + "epoch": 0.7337741727687433, + "grad_norm": 2.5292346499214986, + "learning_rate": 8.409920206296635e-07, + "loss": 1.0268, + "step": 6977 + }, + { + "epoch": 0.7338793432105907, + "grad_norm": 2.3412938045005416, + "learning_rate": 8.4036759741213e-07, + "loss": 0.9831, + "step": 6978 + }, + { + "epoch": 0.733984513652438, + "grad_norm": 2.3394006760695123, + "learning_rate": 8.39743359256501e-07, + "loss": 0.9884, + 
"step": 6979 + }, + { + "epoch": 0.7340896840942853, + "grad_norm": 1.9244006117602965, + "learning_rate": 8.39119306232385e-07, + "loss": 0.9443, + "step": 6980 + }, + { + "epoch": 0.7341948545361326, + "grad_norm": 2.478475134799539, + "learning_rate": 8.384954384093682e-07, + "loss": 0.9719, + "step": 6981 + }, + { + "epoch": 0.73430002497798, + "grad_norm": 3.0512883023668933, + "learning_rate": 8.378717558570182e-07, + "loss": 0.969, + "step": 6982 + }, + { + "epoch": 0.7344051954198273, + "grad_norm": 3.1758406178171295, + "learning_rate": 8.37248258644879e-07, + "loss": 0.9856, + "step": 6983 + }, + { + "epoch": 0.7345103658616746, + "grad_norm": 2.738349461307976, + "learning_rate": 8.366249468424742e-07, + "loss": 0.9978, + "step": 6984 + }, + { + "epoch": 0.7346155363035219, + "grad_norm": 2.321797129857226, + "learning_rate": 8.36001820519311e-07, + "loss": 1.0093, + "step": 6985 + }, + { + "epoch": 0.7347207067453693, + "grad_norm": 2.1761741288187255, + "learning_rate": 8.353788797448703e-07, + "loss": 0.9997, + "step": 6986 + }, + { + "epoch": 0.7348258771872165, + "grad_norm": 2.706187150898931, + "learning_rate": 8.347561245886169e-07, + "loss": 0.9516, + "step": 6987 + }, + { + "epoch": 0.7349310476290638, + "grad_norm": 2.061774866814489, + "learning_rate": 8.341335551199903e-07, + "loss": 0.9517, + "step": 6988 + }, + { + "epoch": 0.7350362180709111, + "grad_norm": 1.9863690617075271, + "learning_rate": 8.335111714084135e-07, + "loss": 0.9947, + "step": 6989 + }, + { + "epoch": 0.7351413885127585, + "grad_norm": 2.197421006470657, + "learning_rate": 8.328889735232876e-07, + "loss": 0.9866, + "step": 6990 + }, + { + "epoch": 0.7352465589546058, + "grad_norm": 3.8124829291968165, + "learning_rate": 8.322669615339909e-07, + "loss": 0.984, + "step": 6991 + }, + { + "epoch": 0.7353517293964531, + "grad_norm": 3.2431775985547375, + "learning_rate": 8.316451355098842e-07, + "loss": 1.0148, + "step": 6992 + }, + { + "epoch": 0.7354568998383004, + "grad_norm": 2.3326759532763464, + "learning_rate": 8.310234955203036e-07, + "loss": 0.9669, + "step": 6993 + }, + { + "epoch": 0.7355620702801478, + "grad_norm": 2.13604126329195, + "learning_rate": 8.304020416345698e-07, + "loss": 1.0038, + "step": 6994 + }, + { + "epoch": 0.7356672407219951, + "grad_norm": 2.5186724644933234, + "learning_rate": 8.297807739219777e-07, + "loss": 1.0161, + "step": 6995 + }, + { + "epoch": 0.7357724111638424, + "grad_norm": 2.8521649286716224, + "learning_rate": 8.291596924518048e-07, + "loss": 1.0343, + "step": 6996 + }, + { + "epoch": 0.7358775816056897, + "grad_norm": 2.7048039596817137, + "learning_rate": 8.285387972933045e-07, + "loss": 0.9705, + "step": 6997 + }, + { + "epoch": 0.735982752047537, + "grad_norm": 2.332648947659513, + "learning_rate": 8.279180885157129e-07, + "loss": 0.9918, + "step": 6998 + }, + { + "epoch": 0.7360879224893844, + "grad_norm": 2.206028825985984, + "learning_rate": 8.272975661882446e-07, + "loss": 0.9916, + "step": 6999 + }, + { + "epoch": 0.7361930929312317, + "grad_norm": 2.8257632521399847, + "learning_rate": 8.266772303800907e-07, + "loss": 0.9935, + "step": 7000 + }, + { + "epoch": 0.736298263373079, + "grad_norm": 1.9735295694119066, + "learning_rate": 8.260570811604252e-07, + "loss": 0.9716, + "step": 7001 + }, + { + "epoch": 0.7364034338149263, + "grad_norm": 2.766348296710771, + "learning_rate": 8.254371185983981e-07, + "loss": 0.9867, + "step": 7002 + }, + { + "epoch": 0.7365086042567737, + "grad_norm": 2.5669860597872582, + "learning_rate": 
8.248173427631406e-07, + "loss": 0.9607, + "step": 7003 + }, + { + "epoch": 0.736613774698621, + "grad_norm": 2.011650558328637, + "learning_rate": 8.241977537237639e-07, + "loss": 0.9772, + "step": 7004 + }, + { + "epoch": 0.7367189451404683, + "grad_norm": 4.378445426045064, + "learning_rate": 8.235783515493545e-07, + "loss": 1.0191, + "step": 7005 + }, + { + "epoch": 0.7368241155823156, + "grad_norm": 3.0930192829825094, + "learning_rate": 8.229591363089826e-07, + "loss": 0.9812, + "step": 7006 + }, + { + "epoch": 0.7369292860241629, + "grad_norm": 2.8858200910410408, + "learning_rate": 8.223401080716934e-07, + "loss": 1.012, + "step": 7007 + }, + { + "epoch": 0.7370344564660102, + "grad_norm": 2.3565977478267763, + "learning_rate": 8.217212669065161e-07, + "loss": 0.9891, + "step": 7008 + }, + { + "epoch": 0.7371396269078575, + "grad_norm": 2.90432653736149, + "learning_rate": 8.21102612882454e-07, + "loss": 0.9853, + "step": 7009 + }, + { + "epoch": 0.7372447973497048, + "grad_norm": 2.636696481329649, + "learning_rate": 8.204841460684934e-07, + "loss": 0.9837, + "step": 7010 + }, + { + "epoch": 0.7373499677915522, + "grad_norm": 2.401750673418409, + "learning_rate": 8.198658665335968e-07, + "loss": 0.9897, + "step": 7011 + }, + { + "epoch": 0.7374551382333995, + "grad_norm": 2.187244354384058, + "learning_rate": 8.192477743467078e-07, + "loss": 0.9739, + "step": 7012 + }, + { + "epoch": 0.7375603086752468, + "grad_norm": 2.504902025352448, + "learning_rate": 8.186298695767494e-07, + "loss": 1.0082, + "step": 7013 + }, + { + "epoch": 0.7376654791170941, + "grad_norm": 2.656973818003429, + "learning_rate": 8.18012152292621e-07, + "loss": 1.0404, + "step": 7014 + }, + { + "epoch": 0.7377706495589414, + "grad_norm": 1.9657665226192536, + "learning_rate": 8.173946225632046e-07, + "loss": 0.9986, + "step": 7015 + }, + { + "epoch": 0.7378758200007888, + "grad_norm": 2.458503304437768, + "learning_rate": 8.16777280457357e-07, + "loss": 0.9499, + "step": 7016 + }, + { + "epoch": 0.7379809904426361, + "grad_norm": 2.8152286475357635, + "learning_rate": 8.1616012604392e-07, + "loss": 1.0263, + "step": 7017 + }, + { + "epoch": 0.7380861608844834, + "grad_norm": 2.7808079810706197, + "learning_rate": 8.155431593917096e-07, + "loss": 0.9589, + "step": 7018 + }, + { + "epoch": 0.7381913313263307, + "grad_norm": 2.2105204331938384, + "learning_rate": 8.149263805695215e-07, + "loss": 1.0189, + "step": 7019 + }, + { + "epoch": 0.7382965017681781, + "grad_norm": 2.152001818336773, + "learning_rate": 8.143097896461327e-07, + "loss": 0.9507, + "step": 7020 + }, + { + "epoch": 0.7384016722100254, + "grad_norm": 2.5837569059762746, + "learning_rate": 8.136933866902957e-07, + "loss": 0.9991, + "step": 7021 + }, + { + "epoch": 0.7385068426518727, + "grad_norm": 2.0815321522556953, + "learning_rate": 8.130771717707476e-07, + "loss": 0.9999, + "step": 7022 + }, + { + "epoch": 0.73861201309372, + "grad_norm": 2.4905114183126327, + "learning_rate": 8.124611449561981e-07, + "loss": 0.9588, + "step": 7023 + }, + { + "epoch": 0.7387171835355674, + "grad_norm": 2.0780844688809545, + "learning_rate": 8.118453063153412e-07, + "loss": 0.9142, + "step": 7024 + }, + { + "epoch": 0.7388223539774147, + "grad_norm": 2.1224882708812336, + "learning_rate": 8.112296559168459e-07, + "loss": 0.9753, + "step": 7025 + }, + { + "epoch": 0.738927524419262, + "grad_norm": 2.8656944207274395, + "learning_rate": 8.106141938293627e-07, + "loss": 0.979, + "step": 7026 + }, + { + "epoch": 0.7390326948611093, + "grad_norm": 
2.5552855508838763, + "learning_rate": 8.099989201215214e-07, + "loss": 0.9649, + "step": 7027 + }, + { + "epoch": 0.7391378653029566, + "grad_norm": 1.591509673958794, + "learning_rate": 8.093838348619282e-07, + "loss": 0.9729, + "step": 7028 + }, + { + "epoch": 0.7392430357448039, + "grad_norm": 2.3016915010058825, + "learning_rate": 8.087689381191713e-07, + "loss": 0.9691, + "step": 7029 + }, + { + "epoch": 0.7393482061866512, + "grad_norm": 2.697962007116117, + "learning_rate": 8.081542299618139e-07, + "loss": 1.0254, + "step": 7030 + }, + { + "epoch": 0.7394533766284985, + "grad_norm": 2.585421932407672, + "learning_rate": 8.075397104584045e-07, + "loss": 0.9704, + "step": 7031 + }, + { + "epoch": 0.7395585470703459, + "grad_norm": 2.370762176558943, + "learning_rate": 8.069253796774639e-07, + "loss": 0.9573, + "step": 7032 + }, + { + "epoch": 0.7396637175121932, + "grad_norm": 2.7384589402328117, + "learning_rate": 8.063112376874965e-07, + "loss": 0.9659, + "step": 7033 + }, + { + "epoch": 0.7397688879540405, + "grad_norm": 2.3648476588909197, + "learning_rate": 8.056972845569833e-07, + "loss": 0.9979, + "step": 7034 + }, + { + "epoch": 0.7398740583958878, + "grad_norm": 2.363044938151124, + "learning_rate": 8.05083520354383e-07, + "loss": 0.9984, + "step": 7035 + }, + { + "epoch": 0.7399792288377351, + "grad_norm": 2.7144839085362498, + "learning_rate": 8.044699451481383e-07, + "loss": 0.98, + "step": 7036 + }, + { + "epoch": 0.7400843992795825, + "grad_norm": 1.991910306378637, + "learning_rate": 8.038565590066652e-07, + "loss": 1.0083, + "step": 7037 + }, + { + "epoch": 0.7401895697214298, + "grad_norm": 2.268190827360275, + "learning_rate": 8.032433619983628e-07, + "loss": 0.9803, + "step": 7038 + }, + { + "epoch": 0.7402947401632771, + "grad_norm": 2.9919518122332582, + "learning_rate": 8.026303541916056e-07, + "loss": 1.0184, + "step": 7039 + }, + { + "epoch": 0.7403999106051244, + "grad_norm": 1.9029884195579985, + "learning_rate": 8.020175356547497e-07, + "loss": 1.0364, + "step": 7040 + }, + { + "epoch": 0.7405050810469718, + "grad_norm": 2.8980438411255025, + "learning_rate": 8.014049064561294e-07, + "loss": 0.9867, + "step": 7041 + }, + { + "epoch": 0.7406102514888191, + "grad_norm": 2.9396255932824475, + "learning_rate": 8.007924666640565e-07, + "loss": 1.0225, + "step": 7042 + }, + { + "epoch": 0.7407154219306664, + "grad_norm": 3.0222446896596415, + "learning_rate": 8.001802163468244e-07, + "loss": 0.9761, + "step": 7043 + }, + { + "epoch": 0.7408205923725137, + "grad_norm": 3.017678825360528, + "learning_rate": 7.995681555727011e-07, + "loss": 1.0636, + "step": 7044 + }, + { + "epoch": 0.7409257628143611, + "grad_norm": 2.240379927077817, + "learning_rate": 7.989562844099396e-07, + "loss": 0.9825, + "step": 7045 + }, + { + "epoch": 0.7410309332562084, + "grad_norm": 3.079831156511648, + "learning_rate": 7.983446029267658e-07, + "loss": 1.0057, + "step": 7046 + }, + { + "epoch": 0.7411361036980557, + "grad_norm": 3.2771958422605723, + "learning_rate": 7.977331111913883e-07, + "loss": 0.9918, + "step": 7047 + }, + { + "epoch": 0.7412412741399029, + "grad_norm": 2.545219586879537, + "learning_rate": 7.971218092719921e-07, + "loss": 0.9816, + "step": 7048 + }, + { + "epoch": 0.7413464445817503, + "grad_norm": 1.8833415674316218, + "learning_rate": 7.965106972367423e-07, + "loss": 0.9534, + "step": 7049 + }, + { + "epoch": 0.7414516150235976, + "grad_norm": 2.2734406517544934, + "learning_rate": 7.958997751537836e-07, + "loss": 0.9753, + "step": 7050 + }, + { + "epoch": 
0.7415567854654449, + "grad_norm": 2.119993555931499, + "learning_rate": 7.952890430912374e-07, + "loss": 0.9472, + "step": 7051 + }, + { + "epoch": 0.7416619559072922, + "grad_norm": 3.0421738792041166, + "learning_rate": 7.946785011172062e-07, + "loss": 1.0013, + "step": 7052 + }, + { + "epoch": 0.7417671263491395, + "grad_norm": 2.473943886643507, + "learning_rate": 7.940681492997678e-07, + "loss": 1.0045, + "step": 7053 + }, + { + "epoch": 0.7418722967909869, + "grad_norm": 3.1688667102775616, + "learning_rate": 7.934579877069848e-07, + "loss": 0.9387, + "step": 7054 + }, + { + "epoch": 0.7419774672328342, + "grad_norm": 2.5868684633157324, + "learning_rate": 7.92848016406893e-07, + "loss": 1.0002, + "step": 7055 + }, + { + "epoch": 0.7420826376746815, + "grad_norm": 2.2449015856520873, + "learning_rate": 7.922382354675079e-07, + "loss": 0.9367, + "step": 7056 + }, + { + "epoch": 0.7421878081165288, + "grad_norm": 3.009362045627099, + "learning_rate": 7.91628644956827e-07, + "loss": 0.9526, + "step": 7057 + }, + { + "epoch": 0.7422929785583762, + "grad_norm": 2.4357656876283156, + "learning_rate": 7.910192449428216e-07, + "loss": 1.0009, + "step": 7058 + }, + { + "epoch": 0.7423981490002235, + "grad_norm": 1.7009514930395582, + "learning_rate": 7.904100354934477e-07, + "loss": 0.9459, + "step": 7059 + }, + { + "epoch": 0.7425033194420708, + "grad_norm": 1.686132473222285, + "learning_rate": 7.898010166766348e-07, + "loss": 0.9756, + "step": 7060 + }, + { + "epoch": 0.7426084898839181, + "grad_norm": 2.580697246706924, + "learning_rate": 7.891921885602946e-07, + "loss": 1.014, + "step": 7061 + }, + { + "epoch": 0.7427136603257655, + "grad_norm": 1.8915629581945324, + "learning_rate": 7.885835512123144e-07, + "loss": 0.9774, + "step": 7062 + }, + { + "epoch": 0.7428188307676128, + "grad_norm": 2.819291640040572, + "learning_rate": 7.879751047005632e-07, + "loss": 0.9752, + "step": 7063 + }, + { + "epoch": 0.7429240012094601, + "grad_norm": 2.019716408325097, + "learning_rate": 7.87366849092888e-07, + "loss": 0.9477, + "step": 7064 + }, + { + "epoch": 0.7430291716513074, + "grad_norm": 2.179030867839558, + "learning_rate": 7.867587844571126e-07, + "loss": 0.9877, + "step": 7065 + }, + { + "epoch": 0.7431343420931548, + "grad_norm": 2.348644789987132, + "learning_rate": 7.861509108610423e-07, + "loss": 0.9943, + "step": 7066 + }, + { + "epoch": 0.7432395125350021, + "grad_norm": 1.9825526177265085, + "learning_rate": 7.855432283724576e-07, + "loss": 0.9825, + "step": 7067 + }, + { + "epoch": 0.7433446829768493, + "grad_norm": 2.171603481982258, + "learning_rate": 7.849357370591229e-07, + "loss": 1.0117, + "step": 7068 + }, + { + "epoch": 0.7434498534186966, + "grad_norm": 2.648552295188202, + "learning_rate": 7.843284369887757e-07, + "loss": 0.9668, + "step": 7069 + }, + { + "epoch": 0.743555023860544, + "grad_norm": 2.849512019966635, + "learning_rate": 7.837213282291365e-07, + "loss": 0.9952, + "step": 7070 + }, + { + "epoch": 0.7436601943023913, + "grad_norm": 2.368367081297725, + "learning_rate": 7.831144108479016e-07, + "loss": 0.9449, + "step": 7071 + }, + { + "epoch": 0.7437653647442386, + "grad_norm": 2.465131065995525, + "learning_rate": 7.825076849127458e-07, + "loss": 1.018, + "step": 7072 + }, + { + "epoch": 0.7438705351860859, + "grad_norm": 2.883105606748885, + "learning_rate": 7.819011504913266e-07, + "loss": 0.9762, + "step": 7073 + }, + { + "epoch": 0.7439757056279332, + "grad_norm": 3.0353049995873707, + "learning_rate": 7.812948076512747e-07, + "loss": 1.005, + "step": 
7074 + }, + { + "epoch": 0.7440808760697806, + "grad_norm": 2.2610828766405118, + "learning_rate": 7.806886564602043e-07, + "loss": 0.9639, + "step": 7075 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 2.1841830172268466, + "learning_rate": 7.800826969857036e-07, + "loss": 0.9841, + "step": 7076 + }, + { + "epoch": 0.7442912169534752, + "grad_norm": 2.129396215048809, + "learning_rate": 7.79476929295343e-07, + "loss": 0.9898, + "step": 7077 + }, + { + "epoch": 0.7443963873953225, + "grad_norm": 2.3303104402152384, + "learning_rate": 7.788713534566714e-07, + "loss": 1.0202, + "step": 7078 + }, + { + "epoch": 0.7445015578371699, + "grad_norm": 2.278648106170102, + "learning_rate": 7.78265969537213e-07, + "loss": 0.9724, + "step": 7079 + }, + { + "epoch": 0.7446067282790172, + "grad_norm": 2.3135184335473524, + "learning_rate": 7.776607776044748e-07, + "loss": 0.9536, + "step": 7080 + }, + { + "epoch": 0.7447118987208645, + "grad_norm": 2.4775548340535365, + "learning_rate": 7.77055777725938e-07, + "loss": 0.9794, + "step": 7081 + }, + { + "epoch": 0.7448170691627118, + "grad_norm": 2.5496006086788356, + "learning_rate": 7.764509699690679e-07, + "loss": 1.017, + "step": 7082 + }, + { + "epoch": 0.7449222396045592, + "grad_norm": 2.1357477443739787, + "learning_rate": 7.758463544013026e-07, + "loss": 0.9812, + "step": 7083 + }, + { + "epoch": 0.7450274100464065, + "grad_norm": 2.5264510633127824, + "learning_rate": 7.752419310900636e-07, + "loss": 0.9786, + "step": 7084 + }, + { + "epoch": 0.7451325804882538, + "grad_norm": 2.7553954412125945, + "learning_rate": 7.746377001027466e-07, + "loss": 0.9899, + "step": 7085 + }, + { + "epoch": 0.7452377509301011, + "grad_norm": 2.1345306806782225, + "learning_rate": 7.740336615067293e-07, + "loss": 0.9952, + "step": 7086 + }, + { + "epoch": 0.7453429213719485, + "grad_norm": 2.0732183389576258, + "learning_rate": 7.734298153693671e-07, + "loss": 0.9757, + "step": 7087 + }, + { + "epoch": 0.7454480918137958, + "grad_norm": 2.513807387074567, + "learning_rate": 7.728261617579922e-07, + "loss": 0.9688, + "step": 7088 + }, + { + "epoch": 0.745553262255643, + "grad_norm": 1.6046019938022835, + "learning_rate": 7.722227007399183e-07, + "loss": 0.9902, + "step": 7089 + }, + { + "epoch": 0.7456584326974903, + "grad_norm": 2.5953536113957365, + "learning_rate": 7.716194323824345e-07, + "loss": 0.9866, + "step": 7090 + }, + { + "epoch": 0.7457636031393376, + "grad_norm": 2.326926439775202, + "learning_rate": 7.710163567528101e-07, + "loss": 0.9848, + "step": 7091 + }, + { + "epoch": 0.745868773581185, + "grad_norm": 2.8094515613190687, + "learning_rate": 7.704134739182942e-07, + "loss": 0.9781, + "step": 7092 + }, + { + "epoch": 0.7459739440230323, + "grad_norm": 3.123876981837614, + "learning_rate": 7.698107839461111e-07, + "loss": 0.9893, + "step": 7093 + }, + { + "epoch": 0.7460791144648796, + "grad_norm": 3.5591005303250935, + "learning_rate": 7.692082869034662e-07, + "loss": 0.995, + "step": 7094 + }, + { + "epoch": 0.746184284906727, + "grad_norm": 2.346896318567527, + "learning_rate": 7.686059828575423e-07, + "loss": 0.9834, + "step": 7095 + }, + { + "epoch": 0.7462894553485743, + "grad_norm": 2.6908060947387304, + "learning_rate": 7.680038718755023e-07, + "loss": 0.9847, + "step": 7096 + }, + { + "epoch": 0.7463946257904216, + "grad_norm": 2.4966779519732865, + "learning_rate": 7.674019540244843e-07, + "loss": 0.9635, + "step": 7097 + }, + { + "epoch": 0.7464997962322689, + "grad_norm": 2.173016936111415, + "learning_rate": 
7.668002293716084e-07, + "loss": 1.0019, + "step": 7098 + }, + { + "epoch": 0.7466049666741162, + "grad_norm": 2.4007122420388285, + "learning_rate": 7.661986979839703e-07, + "loss": 0.996, + "step": 7099 + }, + { + "epoch": 0.7467101371159636, + "grad_norm": 2.305372137742398, + "learning_rate": 7.655973599286459e-07, + "loss": 1.0022, + "step": 7100 + }, + { + "epoch": 0.7468153075578109, + "grad_norm": 2.7387496498746966, + "learning_rate": 7.649962152726903e-07, + "loss": 1.0304, + "step": 7101 + }, + { + "epoch": 0.7469204779996582, + "grad_norm": 3.7124819307244246, + "learning_rate": 7.643952640831334e-07, + "loss": 0.9823, + "step": 7102 + }, + { + "epoch": 0.7470256484415055, + "grad_norm": 2.3773703882173183, + "learning_rate": 7.637945064269883e-07, + "loss": 1.0051, + "step": 7103 + }, + { + "epoch": 0.7471308188833529, + "grad_norm": 2.3915425949886466, + "learning_rate": 7.631939423712414e-07, + "loss": 0.9314, + "step": 7104 + }, + { + "epoch": 0.7472359893252002, + "grad_norm": 2.7826299688394003, + "learning_rate": 7.625935719828633e-07, + "loss": 0.9793, + "step": 7105 + }, + { + "epoch": 0.7473411597670475, + "grad_norm": 2.1770658600493884, + "learning_rate": 7.619933953287989e-07, + "loss": 0.989, + "step": 7106 + }, + { + "epoch": 0.7474463302088948, + "grad_norm": 2.877859070061451, + "learning_rate": 7.613934124759712e-07, + "loss": 1.0117, + "step": 7107 + }, + { + "epoch": 0.7475515006507422, + "grad_norm": 2.1132339489401253, + "learning_rate": 7.607936234912841e-07, + "loss": 1.0075, + "step": 7108 + }, + { + "epoch": 0.7476566710925894, + "grad_norm": 2.04198700411287, + "learning_rate": 7.601940284416187e-07, + "loss": 0.9941, + "step": 7109 + }, + { + "epoch": 0.7477618415344367, + "grad_norm": 3.077969718435496, + "learning_rate": 7.595946273938348e-07, + "loss": 0.9855, + "step": 7110 + }, + { + "epoch": 0.747867011976284, + "grad_norm": 1.851022408153244, + "learning_rate": 7.589954204147696e-07, + "loss": 0.9606, + "step": 7111 + }, + { + "epoch": 0.7479721824181313, + "grad_norm": 2.181580709946769, + "learning_rate": 7.583964075712402e-07, + "loss": 1.0158, + "step": 7112 + }, + { + "epoch": 0.7480773528599787, + "grad_norm": 2.5644893957012824, + "learning_rate": 7.577975889300395e-07, + "loss": 1.0342, + "step": 7113 + }, + { + "epoch": 0.748182523301826, + "grad_norm": 2.117573151952007, + "learning_rate": 7.57198964557942e-07, + "loss": 0.9742, + "step": 7114 + }, + { + "epoch": 0.7482876937436733, + "grad_norm": 2.645246462063244, + "learning_rate": 7.566005345216993e-07, + "loss": 1.0054, + "step": 7115 + }, + { + "epoch": 0.7483928641855206, + "grad_norm": 2.997740803186767, + "learning_rate": 7.560022988880392e-07, + "loss": 1.0511, + "step": 7116 + }, + { + "epoch": 0.748498034627368, + "grad_norm": 2.925719317892139, + "learning_rate": 7.554042577236706e-07, + "loss": 1.0065, + "step": 7117 + }, + { + "epoch": 0.7486032050692153, + "grad_norm": 1.9177386221309367, + "learning_rate": 7.548064110952799e-07, + "loss": 0.9757, + "step": 7118 + }, + { + "epoch": 0.7487083755110626, + "grad_norm": 2.088839253519746, + "learning_rate": 7.542087590695321e-07, + "loss": 0.9794, + "step": 7119 + }, + { + "epoch": 0.7488135459529099, + "grad_norm": 2.5378066740840493, + "learning_rate": 7.536113017130686e-07, + "loss": 0.9974, + "step": 7120 + }, + { + "epoch": 0.7489187163947573, + "grad_norm": 2.3323792119173956, + "learning_rate": 7.530140390925125e-07, + "loss": 0.9962, + "step": 7121 + }, + { + "epoch": 0.7490238868366046, + "grad_norm": 
3.319157282262614, + "learning_rate": 7.524169712744612e-07, + "loss": 1.0356, + "step": 7122 + }, + { + "epoch": 0.7491290572784519, + "grad_norm": 3.4503166532322234, + "learning_rate": 7.518200983254931e-07, + "loss": 0.9966, + "step": 7123 + }, + { + "epoch": 0.7492342277202992, + "grad_norm": 2.23731079204491, + "learning_rate": 7.512234203121655e-07, + "loss": 1.0404, + "step": 7124 + }, + { + "epoch": 0.7493393981621466, + "grad_norm": 2.889672775952245, + "learning_rate": 7.506269373010106e-07, + "loss": 0.9698, + "step": 7125 + }, + { + "epoch": 0.7494445686039939, + "grad_norm": 2.3947512239243682, + "learning_rate": 7.500306493585424e-07, + "loss": 0.9817, + "step": 7126 + }, + { + "epoch": 0.7495497390458412, + "grad_norm": 2.1520151953932376, + "learning_rate": 7.494345565512504e-07, + "loss": 0.973, + "step": 7127 + }, + { + "epoch": 0.7496549094876885, + "grad_norm": 3.2505704587799995, + "learning_rate": 7.488386589456043e-07, + "loss": 1.011, + "step": 7128 + }, + { + "epoch": 0.7497600799295357, + "grad_norm": 2.6688857111662054, + "learning_rate": 7.482429566080518e-07, + "loss": 0.99, + "step": 7129 + }, + { + "epoch": 0.7498652503713831, + "grad_norm": 2.2536394568527727, + "learning_rate": 7.47647449605017e-07, + "loss": 0.9788, + "step": 7130 + }, + { + "epoch": 0.7499704208132304, + "grad_norm": 2.3527982867557764, + "learning_rate": 7.470521380029044e-07, + "loss": 1.0023, + "step": 7131 + }, + { + "epoch": 0.7500755912550777, + "grad_norm": 2.2238317421368268, + "learning_rate": 7.464570218680958e-07, + "loss": 0.9577, + "step": 7132 + }, + { + "epoch": 0.750180761696925, + "grad_norm": 2.500971462662424, + "learning_rate": 7.45862101266952e-07, + "loss": 0.9753, + "step": 7133 + }, + { + "epoch": 0.7502859321387724, + "grad_norm": 2.404293358312979, + "learning_rate": 7.452673762658096e-07, + "loss": 0.9654, + "step": 7134 + }, + { + "epoch": 0.7503911025806197, + "grad_norm": 2.0933883750611204, + "learning_rate": 7.446728469309872e-07, + "loss": 0.9753, + "step": 7135 + }, + { + "epoch": 0.750496273022467, + "grad_norm": 2.2681195797049076, + "learning_rate": 7.440785133287773e-07, + "loss": 0.9794, + "step": 7136 + }, + { + "epoch": 0.7506014434643143, + "grad_norm": 2.422935286133933, + "learning_rate": 7.434843755254534e-07, + "loss": 1.0004, + "step": 7137 + }, + { + "epoch": 0.7507066139061617, + "grad_norm": 3.881491057142409, + "learning_rate": 7.428904335872675e-07, + "loss": 0.9864, + "step": 7138 + }, + { + "epoch": 0.750811784348009, + "grad_norm": 2.966377670151308, + "learning_rate": 7.422966875804475e-07, + "loss": 1.0794, + "step": 7139 + }, + { + "epoch": 0.7509169547898563, + "grad_norm": 2.859330892149561, + "learning_rate": 7.417031375712009e-07, + "loss": 1.0196, + "step": 7140 + }, + { + "epoch": 0.7510221252317036, + "grad_norm": 2.8843383108973084, + "learning_rate": 7.411097836257141e-07, + "loss": 0.9402, + "step": 7141 + }, + { + "epoch": 0.751127295673551, + "grad_norm": 2.1645139983935056, + "learning_rate": 7.405166258101495e-07, + "loss": 0.998, + "step": 7142 + }, + { + "epoch": 0.7512324661153983, + "grad_norm": 3.3119860219502337, + "learning_rate": 7.399236641906498e-07, + "loss": 0.9658, + "step": 7143 + }, + { + "epoch": 0.7513376365572456, + "grad_norm": 2.0595918641126327, + "learning_rate": 7.393308988333337e-07, + "loss": 0.9997, + "step": 7144 + }, + { + "epoch": 0.7514428069990929, + "grad_norm": 3.1286563898373543, + "learning_rate": 7.387383298042994e-07, + "loss": 0.9831, + "step": 7145 + }, + { + "epoch": 
0.7515479774409403, + "grad_norm": 2.8018989951253417, + "learning_rate": 7.381459571696237e-07, + "loss": 0.9272, + "step": 7146 + }, + { + "epoch": 0.7516531478827876, + "grad_norm": 2.7502028796110176, + "learning_rate": 7.375537809953609e-07, + "loss": 0.9867, + "step": 7147 + }, + { + "epoch": 0.7517583183246349, + "grad_norm": 2.678162761432206, + "learning_rate": 7.369618013475419e-07, + "loss": 1.0405, + "step": 7148 + }, + { + "epoch": 0.7518634887664822, + "grad_norm": 2.3470597444469794, + "learning_rate": 7.363700182921784e-07, + "loss": 1.0009, + "step": 7149 + }, + { + "epoch": 0.7519686592083294, + "grad_norm": 2.5934942008660897, + "learning_rate": 7.357784318952579e-07, + "loss": 1.0031, + "step": 7150 + }, + { + "epoch": 0.7520738296501768, + "grad_norm": 3.16545323400702, + "learning_rate": 7.35187042222747e-07, + "loss": 1.0078, + "step": 7151 + }, + { + "epoch": 0.7521790000920241, + "grad_norm": 2.409118323522591, + "learning_rate": 7.345958493405911e-07, + "loss": 0.9592, + "step": 7152 + }, + { + "epoch": 0.7522841705338714, + "grad_norm": 2.8526257467266967, + "learning_rate": 7.340048533147112e-07, + "loss": 0.9807, + "step": 7153 + }, + { + "epoch": 0.7523893409757187, + "grad_norm": 2.4382304350488035, + "learning_rate": 7.334140542110093e-07, + "loss": 0.9787, + "step": 7154 + }, + { + "epoch": 0.7524945114175661, + "grad_norm": 1.893802244461524, + "learning_rate": 7.328234520953634e-07, + "loss": 1.0078, + "step": 7155 + }, + { + "epoch": 0.7525996818594134, + "grad_norm": 2.3356044875747672, + "learning_rate": 7.322330470336314e-07, + "loss": 0.98, + "step": 7156 + }, + { + "epoch": 0.7527048523012607, + "grad_norm": 2.6357389878275543, + "learning_rate": 7.316428390916471e-07, + "loss": 0.976, + "step": 7157 + }, + { + "epoch": 0.752810022743108, + "grad_norm": 2.0816090851410043, + "learning_rate": 7.310528283352225e-07, + "loss": 0.9998, + "step": 7158 + }, + { + "epoch": 0.7529151931849554, + "grad_norm": 2.432089883393319, + "learning_rate": 7.304630148301495e-07, + "loss": 1.0173, + "step": 7159 + }, + { + "epoch": 0.7530203636268027, + "grad_norm": 3.2576438811204267, + "learning_rate": 7.298733986421963e-07, + "loss": 0.9316, + "step": 7160 + }, + { + "epoch": 0.75312553406865, + "grad_norm": 2.5192616299402535, + "learning_rate": 7.292839798371107e-07, + "loss": 0.9752, + "step": 7161 + }, + { + "epoch": 0.7532307045104973, + "grad_norm": 2.0662627309135466, + "learning_rate": 7.286947584806162e-07, + "loss": 1.0003, + "step": 7162 + }, + { + "epoch": 0.7533358749523447, + "grad_norm": 2.3318286805896697, + "learning_rate": 7.28105734638416e-07, + "loss": 0.9491, + "step": 7163 + }, + { + "epoch": 0.753441045394192, + "grad_norm": 1.7589980888158547, + "learning_rate": 7.275169083761915e-07, + "loss": 1.0146, + "step": 7164 + }, + { + "epoch": 0.7535462158360393, + "grad_norm": 2.3273013030064167, + "learning_rate": 7.269282797596003e-07, + "loss": 0.9642, + "step": 7165 + }, + { + "epoch": 0.7536513862778866, + "grad_norm": 2.912414660527191, + "learning_rate": 7.263398488542805e-07, + "loss": 1.0203, + "step": 7166 + }, + { + "epoch": 0.753756556719734, + "grad_norm": 2.82234921943586, + "learning_rate": 7.257516157258448e-07, + "loss": 0.9563, + "step": 7167 + }, + { + "epoch": 0.7538617271615813, + "grad_norm": 2.5257753971825565, + "learning_rate": 7.251635804398871e-07, + "loss": 0.9375, + "step": 7168 + }, + { + "epoch": 0.7539668976034286, + "grad_norm": 2.2636444621395486, + "learning_rate": 7.245757430619774e-07, + "loss": 0.9048, + 
"step": 7169 + }, + { + "epoch": 0.7540720680452758, + "grad_norm": 1.7968514845268797, + "learning_rate": 7.239881036576652e-07, + "loss": 1.0071, + "step": 7170 + }, + { + "epoch": 0.7541772384871231, + "grad_norm": 2.5407406591367714, + "learning_rate": 7.23400662292475e-07, + "loss": 0.9604, + "step": 7171 + }, + { + "epoch": 0.7542824089289705, + "grad_norm": 2.8060825489630976, + "learning_rate": 7.228134190319131e-07, + "loss": 1.016, + "step": 7172 + }, + { + "epoch": 0.7543875793708178, + "grad_norm": 2.7592153012337985, + "learning_rate": 7.222263739414595e-07, + "loss": 0.9704, + "step": 7173 + }, + { + "epoch": 0.7544927498126651, + "grad_norm": 1.982882504910971, + "learning_rate": 7.216395270865759e-07, + "loss": 0.9705, + "step": 7174 + }, + { + "epoch": 0.7545979202545124, + "grad_norm": 2.3971758450955254, + "learning_rate": 7.210528785327001e-07, + "loss": 0.9987, + "step": 7175 + }, + { + "epoch": 0.7547030906963598, + "grad_norm": 2.0404726366158235, + "learning_rate": 7.204664283452472e-07, + "loss": 0.9884, + "step": 7176 + }, + { + "epoch": 0.7548082611382071, + "grad_norm": 2.169832032755285, + "learning_rate": 7.198801765896115e-07, + "loss": 0.9897, + "step": 7177 + }, + { + "epoch": 0.7549134315800544, + "grad_norm": 3.109999050865016, + "learning_rate": 7.192941233311651e-07, + "loss": 1.0375, + "step": 7178 + }, + { + "epoch": 0.7550186020219017, + "grad_norm": 2.4498043657390474, + "learning_rate": 7.187082686352564e-07, + "loss": 0.9986, + "step": 7179 + }, + { + "epoch": 0.7551237724637491, + "grad_norm": 2.045678192773114, + "learning_rate": 7.181226125672142e-07, + "loss": 0.9928, + "step": 7180 + }, + { + "epoch": 0.7552289429055964, + "grad_norm": 2.506873399635673, + "learning_rate": 7.175371551923418e-07, + "loss": 0.9825, + "step": 7181 + }, + { + "epoch": 0.7553341133474437, + "grad_norm": 2.679384773475905, + "learning_rate": 7.169518965759231e-07, + "loss": 1.0099, + "step": 7182 + }, + { + "epoch": 0.755439283789291, + "grad_norm": 2.6893589546797467, + "learning_rate": 7.163668367832194e-07, + "loss": 0.9387, + "step": 7183 + }, + { + "epoch": 0.7555444542311384, + "grad_norm": 2.9496315141406906, + "learning_rate": 7.157819758794699e-07, + "loss": 0.9685, + "step": 7184 + }, + { + "epoch": 0.7556496246729857, + "grad_norm": 2.8192968825983615, + "learning_rate": 7.151973139298895e-07, + "loss": 0.9811, + "step": 7185 + }, + { + "epoch": 0.755754795114833, + "grad_norm": 2.823604895336445, + "learning_rate": 7.146128509996736e-07, + "loss": 1.0068, + "step": 7186 + }, + { + "epoch": 0.7558599655566803, + "grad_norm": 2.6359789895073855, + "learning_rate": 7.140285871539948e-07, + "loss": 0.9866, + "step": 7187 + }, + { + "epoch": 0.7559651359985277, + "grad_norm": 2.1868328887950144, + "learning_rate": 7.134445224580017e-07, + "loss": 0.9832, + "step": 7188 + }, + { + "epoch": 0.756070306440375, + "grad_norm": 2.2297442915008046, + "learning_rate": 7.128606569768237e-07, + "loss": 0.9415, + "step": 7189 + }, + { + "epoch": 0.7561754768822222, + "grad_norm": 2.157623716865805, + "learning_rate": 7.122769907755644e-07, + "loss": 1.0098, + "step": 7190 + }, + { + "epoch": 0.7562806473240695, + "grad_norm": 2.393849877655623, + "learning_rate": 7.116935239193085e-07, + "loss": 0.9886, + "step": 7191 + }, + { + "epoch": 0.7563858177659168, + "grad_norm": 2.5597033186090457, + "learning_rate": 7.111102564731173e-07, + "loss": 0.9664, + "step": 7192 + }, + { + "epoch": 0.7564909882077642, + "grad_norm": 3.176157747900133, + "learning_rate": 
7.105271885020281e-07, + "loss": 1.0258, + "step": 7193 + }, + { + "epoch": 0.7565961586496115, + "grad_norm": 2.9605515801527513, + "learning_rate": 7.099443200710595e-07, + "loss": 0.9994, + "step": 7194 + }, + { + "epoch": 0.7567013290914588, + "grad_norm": 2.0428204214114, + "learning_rate": 7.093616512452042e-07, + "loss": 0.9946, + "step": 7195 + }, + { + "epoch": 0.7568064995333061, + "grad_norm": 2.5922763139539353, + "learning_rate": 7.087791820894349e-07, + "loss": 0.9879, + "step": 7196 + }, + { + "epoch": 0.7569116699751535, + "grad_norm": 2.2625784519484355, + "learning_rate": 7.081969126687014e-07, + "loss": 0.9551, + "step": 7197 + }, + { + "epoch": 0.7570168404170008, + "grad_norm": 2.7498123014483973, + "learning_rate": 7.076148430479321e-07, + "loss": 0.9919, + "step": 7198 + }, + { + "epoch": 0.7571220108588481, + "grad_norm": 2.2860922030180015, + "learning_rate": 7.070329732920308e-07, + "loss": 0.9593, + "step": 7199 + }, + { + "epoch": 0.7572271813006954, + "grad_norm": 2.1018686765589782, + "learning_rate": 7.064513034658812e-07, + "loss": 0.9519, + "step": 7200 + }, + { + "epoch": 0.7573323517425428, + "grad_norm": 2.0969982207355655, + "learning_rate": 7.05869833634345e-07, + "loss": 0.964, + "step": 7201 + }, + { + "epoch": 0.7574375221843901, + "grad_norm": 2.2200649084471045, + "learning_rate": 7.052885638622586e-07, + "loss": 0.949, + "step": 7202 + }, + { + "epoch": 0.7575426926262374, + "grad_norm": 2.5288138123773556, + "learning_rate": 7.047074942144399e-07, + "loss": 1.0022, + "step": 7203 + }, + { + "epoch": 0.7576478630680847, + "grad_norm": 2.914980411654761, + "learning_rate": 7.041266247556814e-07, + "loss": 0.9717, + "step": 7204 + }, + { + "epoch": 0.7577530335099321, + "grad_norm": 1.7833952010134386, + "learning_rate": 7.035459555507549e-07, + "loss": 0.9748, + "step": 7205 + }, + { + "epoch": 0.7578582039517794, + "grad_norm": 2.49777934486207, + "learning_rate": 7.029654866644098e-07, + "loss": 0.9616, + "step": 7206 + }, + { + "epoch": 0.7579633743936267, + "grad_norm": 2.748297431592962, + "learning_rate": 7.023852181613735e-07, + "loss": 0.99, + "step": 7207 + }, + { + "epoch": 0.758068544835474, + "grad_norm": 2.6942600142402537, + "learning_rate": 7.018051501063497e-07, + "loss": 1.026, + "step": 7208 + }, + { + "epoch": 0.7581737152773214, + "grad_norm": 3.1894240209205713, + "learning_rate": 7.01225282564019e-07, + "loss": 1.0089, + "step": 7209 + }, + { + "epoch": 0.7582788857191687, + "grad_norm": 1.8391440626240132, + "learning_rate": 7.006456155990444e-07, + "loss": 0.978, + "step": 7210 + }, + { + "epoch": 0.7583840561610159, + "grad_norm": 2.4573368871580317, + "learning_rate": 7.000661492760605e-07, + "loss": 0.9747, + "step": 7211 + }, + { + "epoch": 0.7584892266028632, + "grad_norm": 2.0727123712660758, + "learning_rate": 6.994868836596841e-07, + "loss": 1.0151, + "step": 7212 + }, + { + "epoch": 0.7585943970447105, + "grad_norm": 1.8852340383663915, + "learning_rate": 6.989078188145065e-07, + "loss": 0.9984, + "step": 7213 + }, + { + "epoch": 0.7586995674865579, + "grad_norm": 1.9223909811199098, + "learning_rate": 6.983289548050984e-07, + "loss": 1.0037, + "step": 7214 + }, + { + "epoch": 0.7588047379284052, + "grad_norm": 2.520783501726942, + "learning_rate": 6.977502916960083e-07, + "loss": 0.9742, + "step": 7215 + }, + { + "epoch": 0.7589099083702525, + "grad_norm": 2.838885326430099, + "learning_rate": 6.971718295517604e-07, + "loss": 1.0104, + "step": 7216 + }, + { + "epoch": 0.7590150788120998, + "grad_norm": 
2.9156056508616257, + "learning_rate": 6.965935684368591e-07, + "loss": 1.0002, + "step": 7217 + }, + { + "epoch": 0.7591202492539472, + "grad_norm": 2.3167176341509514, + "learning_rate": 6.960155084157835e-07, + "loss": 0.9942, + "step": 7218 + }, + { + "epoch": 0.7592254196957945, + "grad_norm": 2.154619542983804, + "learning_rate": 6.954376495529927e-07, + "loss": 0.9635, + "step": 7219 + }, + { + "epoch": 0.7593305901376418, + "grad_norm": 2.708144396111361, + "learning_rate": 6.94859991912922e-07, + "loss": 0.9492, + "step": 7220 + }, + { + "epoch": 0.7594357605794891, + "grad_norm": 2.869899495540543, + "learning_rate": 6.942825355599861e-07, + "loss": 0.9659, + "step": 7221 + }, + { + "epoch": 0.7595409310213365, + "grad_norm": 2.7681751925235063, + "learning_rate": 6.93705280558574e-07, + "loss": 0.9965, + "step": 7222 + }, + { + "epoch": 0.7596461014631838, + "grad_norm": 2.454449329821474, + "learning_rate": 6.931282269730547e-07, + "loss": 0.9723, + "step": 7223 + }, + { + "epoch": 0.7597512719050311, + "grad_norm": 2.2182925652533085, + "learning_rate": 6.925513748677754e-07, + "loss": 0.9866, + "step": 7224 + }, + { + "epoch": 0.7598564423468784, + "grad_norm": 2.3307398988105255, + "learning_rate": 6.919747243070576e-07, + "loss": 0.9802, + "step": 7225 + }, + { + "epoch": 0.7599616127887258, + "grad_norm": 2.9729325369975705, + "learning_rate": 6.91398275355204e-07, + "loss": 0.9946, + "step": 7226 + }, + { + "epoch": 0.7600667832305731, + "grad_norm": 2.7311855204096185, + "learning_rate": 6.908220280764918e-07, + "loss": 0.9488, + "step": 7227 + }, + { + "epoch": 0.7601719536724204, + "grad_norm": 2.68897996658099, + "learning_rate": 6.902459825351776e-07, + "loss": 0.9728, + "step": 7228 + }, + { + "epoch": 0.7602771241142677, + "grad_norm": 2.3536132874792934, + "learning_rate": 6.896701387954957e-07, + "loss": 0.9418, + "step": 7229 + }, + { + "epoch": 0.7603822945561151, + "grad_norm": 2.604781814575496, + "learning_rate": 6.890944969216556e-07, + "loss": 0.9645, + "step": 7230 + }, + { + "epoch": 0.7604874649979623, + "grad_norm": 2.703365663091066, + "learning_rate": 6.885190569778477e-07, + "loss": 1.0083, + "step": 7231 + }, + { + "epoch": 0.7605926354398096, + "grad_norm": 2.995617326069344, + "learning_rate": 6.879438190282354e-07, + "loss": 0.9152, + "step": 7232 + }, + { + "epoch": 0.7606978058816569, + "grad_norm": 2.4869310918420573, + "learning_rate": 6.873687831369655e-07, + "loss": 1.0181, + "step": 7233 + }, + { + "epoch": 0.7608029763235042, + "grad_norm": 3.3720251417185545, + "learning_rate": 6.867939493681563e-07, + "loss": 0.9915, + "step": 7234 + }, + { + "epoch": 0.7609081467653516, + "grad_norm": 2.751154189351907, + "learning_rate": 6.862193177859081e-07, + "loss": 1.0399, + "step": 7235 + }, + { + "epoch": 0.7610133172071989, + "grad_norm": 2.9074664793566565, + "learning_rate": 6.856448884542951e-07, + "loss": 0.9934, + "step": 7236 + }, + { + "epoch": 0.7611184876490462, + "grad_norm": 1.9259825570766822, + "learning_rate": 6.850706614373715e-07, + "loss": 1.0186, + "step": 7237 + }, + { + "epoch": 0.7612236580908935, + "grad_norm": 2.2646689754710816, + "learning_rate": 6.844966367991688e-07, + "loss": 0.9753, + "step": 7238 + }, + { + "epoch": 0.7613288285327409, + "grad_norm": 1.9865318190535692, + "learning_rate": 6.839228146036936e-07, + "loss": 0.9906, + "step": 7239 + }, + { + "epoch": 0.7614339989745882, + "grad_norm": 2.101934374400583, + "learning_rate": 6.833491949149329e-07, + "loss": 0.9754, + "step": 7240 + }, + { + "epoch": 
0.7615391694164355, + "grad_norm": 2.9805412783118554, + "learning_rate": 6.82775777796848e-07, + "loss": 0.9668, + "step": 7241 + }, + { + "epoch": 0.7616443398582828, + "grad_norm": 2.1434579040583226, + "learning_rate": 6.82202563313382e-07, + "loss": 1.009, + "step": 7242 + }, + { + "epoch": 0.7617495103001302, + "grad_norm": 2.5155188921882763, + "learning_rate": 6.816295515284513e-07, + "loss": 0.9747, + "step": 7243 + }, + { + "epoch": 0.7618546807419775, + "grad_norm": 2.351626885137821, + "learning_rate": 6.810567425059506e-07, + "loss": 0.9613, + "step": 7244 + }, + { + "epoch": 0.7619598511838248, + "grad_norm": 2.499487860831982, + "learning_rate": 6.804841363097536e-07, + "loss": 0.998, + "step": 7245 + }, + { + "epoch": 0.7620650216256721, + "grad_norm": 3.5082989671199454, + "learning_rate": 6.799117330037086e-07, + "loss": 1.0091, + "step": 7246 + }, + { + "epoch": 0.7621701920675195, + "grad_norm": 2.5497617555706222, + "learning_rate": 6.793395326516458e-07, + "loss": 1.0019, + "step": 7247 + }, + { + "epoch": 0.7622753625093668, + "grad_norm": 2.9850232501231315, + "learning_rate": 6.787675353173675e-07, + "loss": 0.9852, + "step": 7248 + }, + { + "epoch": 0.7623805329512141, + "grad_norm": 2.369868434119237, + "learning_rate": 6.781957410646581e-07, + "loss": 0.9482, + "step": 7249 + }, + { + "epoch": 0.7624857033930614, + "grad_norm": 2.2851427983587325, + "learning_rate": 6.776241499572747e-07, + "loss": 0.997, + "step": 7250 + }, + { + "epoch": 0.7625908738349086, + "grad_norm": 2.566339810963556, + "learning_rate": 6.770527620589554e-07, + "loss": 0.9698, + "step": 7251 + }, + { + "epoch": 0.762696044276756, + "grad_norm": 2.5226398298922685, + "learning_rate": 6.764815774334149e-07, + "loss": 1.0029, + "step": 7252 + }, + { + "epoch": 0.7628012147186033, + "grad_norm": 2.0900487583661564, + "learning_rate": 6.759105961443435e-07, + "loss": 0.9886, + "step": 7253 + }, + { + "epoch": 0.7629063851604506, + "grad_norm": 2.8035948742537626, + "learning_rate": 6.753398182554116e-07, + "loss": 0.954, + "step": 7254 + }, + { + "epoch": 0.7630115556022979, + "grad_norm": 2.2764400788498347, + "learning_rate": 6.747692438302624e-07, + "loss": 1.0132, + "step": 7255 + }, + { + "epoch": 0.7631167260441453, + "grad_norm": 3.4744512830475545, + "learning_rate": 6.741988729325235e-07, + "loss": 1.0112, + "step": 7256 + }, + { + "epoch": 0.7632218964859926, + "grad_norm": 3.0967876034792012, + "learning_rate": 6.736287056257925e-07, + "loss": 0.9576, + "step": 7257 + }, + { + "epoch": 0.7633270669278399, + "grad_norm": 2.346841697317024, + "learning_rate": 6.730587419736492e-07, + "loss": 0.9708, + "step": 7258 + }, + { + "epoch": 0.7634322373696872, + "grad_norm": 2.201886938588659, + "learning_rate": 6.724889820396488e-07, + "loss": 0.993, + "step": 7259 + }, + { + "epoch": 0.7635374078115346, + "grad_norm": 2.8480079520191044, + "learning_rate": 6.719194258873216e-07, + "loss": 0.9501, + "step": 7260 + }, + { + "epoch": 0.7636425782533819, + "grad_norm": 2.560782072345373, + "learning_rate": 6.713500735801811e-07, + "loss": 0.8903, + "step": 7261 + }, + { + "epoch": 0.7637477486952292, + "grad_norm": 2.159045835231064, + "learning_rate": 6.707809251817121e-07, + "loss": 0.9807, + "step": 7262 + }, + { + "epoch": 0.7638529191370765, + "grad_norm": 1.9003145255033675, + "learning_rate": 6.702119807553806e-07, + "loss": 0.9616, + "step": 7263 + }, + { + "epoch": 0.7639580895789239, + "grad_norm": 2.7151279520190976, + "learning_rate": 6.696432403646267e-07, + "loss": 0.9819, + 
"step": 7264 + }, + { + "epoch": 0.7640632600207712, + "grad_norm": 2.657753683243195, + "learning_rate": 6.690747040728702e-07, + "loss": 0.9873, + "step": 7265 + }, + { + "epoch": 0.7641684304626185, + "grad_norm": 1.8721650852607126, + "learning_rate": 6.685063719435081e-07, + "loss": 0.9638, + "step": 7266 + }, + { + "epoch": 0.7642736009044658, + "grad_norm": 2.385971269188057, + "learning_rate": 6.679382440399121e-07, + "loss": 0.9701, + "step": 7267 + }, + { + "epoch": 0.7643787713463132, + "grad_norm": 1.6808229698056358, + "learning_rate": 6.673703204254348e-07, + "loss": 0.9355, + "step": 7268 + }, + { + "epoch": 0.7644839417881605, + "grad_norm": 2.741069590738767, + "learning_rate": 6.668026011634019e-07, + "loss": 0.9926, + "step": 7269 + }, + { + "epoch": 0.7645891122300078, + "grad_norm": 2.117414364110104, + "learning_rate": 6.662350863171207e-07, + "loss": 0.9928, + "step": 7270 + }, + { + "epoch": 0.7646942826718551, + "grad_norm": 3.3484101267491737, + "learning_rate": 6.656677759498722e-07, + "loss": 0.9656, + "step": 7271 + }, + { + "epoch": 0.7647994531137023, + "grad_norm": 1.9706890021853318, + "learning_rate": 6.651006701249168e-07, + "loss": 1.0416, + "step": 7272 + }, + { + "epoch": 0.7649046235555497, + "grad_norm": 3.0413485520399663, + "learning_rate": 6.645337689054901e-07, + "loss": 0.9993, + "step": 7273 + }, + { + "epoch": 0.765009793997397, + "grad_norm": 2.0490262655425626, + "learning_rate": 6.639670723548066e-07, + "loss": 1.0095, + "step": 7274 + }, + { + "epoch": 0.7651149644392443, + "grad_norm": 2.8013420666362183, + "learning_rate": 6.63400580536058e-07, + "loss": 0.9992, + "step": 7275 + }, + { + "epoch": 0.7652201348810916, + "grad_norm": 2.047516643390938, + "learning_rate": 6.628342935124113e-07, + "loss": 1.0431, + "step": 7276 + }, + { + "epoch": 0.765325305322939, + "grad_norm": 1.8990163407770566, + "learning_rate": 6.622682113470131e-07, + "loss": 0.9764, + "step": 7277 + }, + { + "epoch": 0.7654304757647863, + "grad_norm": 2.450404087097505, + "learning_rate": 6.617023341029835e-07, + "loss": 0.9911, + "step": 7278 + }, + { + "epoch": 0.7655356462066336, + "grad_norm": 2.3743125938259735, + "learning_rate": 6.611366618434259e-07, + "loss": 0.9901, + "step": 7279 + }, + { + "epoch": 0.7656408166484809, + "grad_norm": 2.6744820836183436, + "learning_rate": 6.605711946314153e-07, + "loss": 0.9975, + "step": 7280 + }, + { + "epoch": 0.7657459870903283, + "grad_norm": 2.2318926191610093, + "learning_rate": 6.600059325300049e-07, + "loss": 0.9983, + "step": 7281 + }, + { + "epoch": 0.7658511575321756, + "grad_norm": 2.6271251940737734, + "learning_rate": 6.594408756022272e-07, + "loss": 0.9516, + "step": 7282 + }, + { + "epoch": 0.7659563279740229, + "grad_norm": 2.686237334138938, + "learning_rate": 6.588760239110887e-07, + "loss": 1.0418, + "step": 7283 + }, + { + "epoch": 0.7660614984158702, + "grad_norm": 2.7534033695073665, + "learning_rate": 6.583113775195771e-07, + "loss": 1.0173, + "step": 7284 + }, + { + "epoch": 0.7661666688577176, + "grad_norm": 2.2831574538057744, + "learning_rate": 6.577469364906527e-07, + "loss": 0.9971, + "step": 7285 + }, + { + "epoch": 0.7662718392995649, + "grad_norm": 2.7528580601431263, + "learning_rate": 6.571827008872572e-07, + "loss": 0.9858, + "step": 7286 + }, + { + "epoch": 0.7663770097414122, + "grad_norm": 2.305246939209575, + "learning_rate": 6.566186707723049e-07, + "loss": 0.9331, + "step": 7287 + }, + { + "epoch": 0.7664821801832595, + "grad_norm": 2.1919285177867747, + "learning_rate": 
6.560548462086911e-07, + "loss": 0.9929, + "step": 7288 + }, + { + "epoch": 0.7665873506251069, + "grad_norm": 2.617341622706207, + "learning_rate": 6.554912272592867e-07, + "loss": 0.9793, + "step": 7289 + }, + { + "epoch": 0.7666925210669542, + "grad_norm": 3.2075296541728506, + "learning_rate": 6.549278139869383e-07, + "loss": 0.963, + "step": 7290 + }, + { + "epoch": 0.7667976915088015, + "grad_norm": 2.4692337859546227, + "learning_rate": 6.543646064544725e-07, + "loss": 0.9813, + "step": 7291 + }, + { + "epoch": 0.7669028619506487, + "grad_norm": 2.161864541098391, + "learning_rate": 6.53801604724689e-07, + "loss": 0.9615, + "step": 7292 + }, + { + "epoch": 0.767008032392496, + "grad_norm": 2.2773528909260428, + "learning_rate": 6.5323880886037e-07, + "loss": 0.9771, + "step": 7293 + }, + { + "epoch": 0.7671132028343434, + "grad_norm": 2.9832947081070955, + "learning_rate": 6.526762189242692e-07, + "loss": 1.0007, + "step": 7294 + }, + { + "epoch": 0.7672183732761907, + "grad_norm": 2.715135683153078, + "learning_rate": 6.521138349791209e-07, + "loss": 0.9939, + "step": 7295 + }, + { + "epoch": 0.767323543718038, + "grad_norm": 2.470749049910447, + "learning_rate": 6.515516570876351e-07, + "loss": 0.9985, + "step": 7296 + }, + { + "epoch": 0.7674287141598853, + "grad_norm": 2.2143297282737655, + "learning_rate": 6.509896853124972e-07, + "loss": 0.9607, + "step": 7297 + }, + { + "epoch": 0.7675338846017327, + "grad_norm": 2.887958915857869, + "learning_rate": 6.504279197163746e-07, + "loss": 0.9331, + "step": 7298 + }, + { + "epoch": 0.76763905504358, + "grad_norm": 3.027736095235375, + "learning_rate": 6.498663603619062e-07, + "loss": 1.005, + "step": 7299 + }, + { + "epoch": 0.7677442254854273, + "grad_norm": 2.6040148356675403, + "learning_rate": 6.493050073117115e-07, + "loss": 1.0007, + "step": 7300 + }, + { + "epoch": 0.7678493959272746, + "grad_norm": 2.4651062503117127, + "learning_rate": 6.487438606283847e-07, + "loss": 0.9931, + "step": 7301 + }, + { + "epoch": 0.767954566369122, + "grad_norm": 2.1995832711178593, + "learning_rate": 6.481829203744985e-07, + "loss": 0.9499, + "step": 7302 + }, + { + "epoch": 0.7680597368109693, + "grad_norm": 2.7534359404754913, + "learning_rate": 6.476221866126029e-07, + "loss": 0.9385, + "step": 7303 + }, + { + "epoch": 0.7681649072528166, + "grad_norm": 2.0684676604556387, + "learning_rate": 6.470616594052223e-07, + "loss": 1.0037, + "step": 7304 + }, + { + "epoch": 0.7682700776946639, + "grad_norm": 2.474552308274405, + "learning_rate": 6.465013388148616e-07, + "loss": 0.9923, + "step": 7305 + }, + { + "epoch": 0.7683752481365113, + "grad_norm": 1.7973959275988587, + "learning_rate": 6.459412249039987e-07, + "loss": 0.9635, + "step": 7306 + }, + { + "epoch": 0.7684804185783586, + "grad_norm": 2.12766206020307, + "learning_rate": 6.453813177350934e-07, + "loss": 0.9721, + "step": 7307 + }, + { + "epoch": 0.7685855890202059, + "grad_norm": 2.371873219751724, + "learning_rate": 6.448216173705777e-07, + "loss": 0.9784, + "step": 7308 + }, + { + "epoch": 0.7686907594620532, + "grad_norm": 2.0694569481455987, + "learning_rate": 6.442621238728639e-07, + "loss": 0.9855, + "step": 7309 + }, + { + "epoch": 0.7687959299039006, + "grad_norm": 3.206943381862623, + "learning_rate": 6.437028373043386e-07, + "loss": 1.0048, + "step": 7310 + }, + { + "epoch": 0.7689011003457479, + "grad_norm": 3.365367181152907, + "learning_rate": 6.431437577273669e-07, + "loss": 0.9555, + "step": 7311 + }, + { + "epoch": 0.7690062707875951, + "grad_norm": 
2.8202002768816925, + "learning_rate": 6.425848852042918e-07, + "loss": 1.0238, + "step": 7312 + }, + { + "epoch": 0.7691114412294424, + "grad_norm": 2.47701840199492, + "learning_rate": 6.420262197974297e-07, + "loss": 0.991, + "step": 7313 + }, + { + "epoch": 0.7692166116712897, + "grad_norm": 2.3044685577607513, + "learning_rate": 6.414677615690784e-07, + "loss": 0.9681, + "step": 7314 + }, + { + "epoch": 0.7693217821131371, + "grad_norm": 3.4475573948156413, + "learning_rate": 6.409095105815086e-07, + "loss": 0.9781, + "step": 7315 + }, + { + "epoch": 0.7694269525549844, + "grad_norm": 1.8908622557377632, + "learning_rate": 6.403514668969702e-07, + "loss": 0.976, + "step": 7316 + }, + { + "epoch": 0.7695321229968317, + "grad_norm": 2.1550644787890785, + "learning_rate": 6.397936305776903e-07, + "loss": 0.9988, + "step": 7317 + }, + { + "epoch": 0.769637293438679, + "grad_norm": 3.0769836395456025, + "learning_rate": 6.392360016858703e-07, + "loss": 0.9766, + "step": 7318 + }, + { + "epoch": 0.7697424638805264, + "grad_norm": 2.5329631219295625, + "learning_rate": 6.38678580283692e-07, + "loss": 1.0409, + "step": 7319 + }, + { + "epoch": 0.7698476343223737, + "grad_norm": 3.1252877879621637, + "learning_rate": 6.381213664333096e-07, + "loss": 0.9786, + "step": 7320 + }, + { + "epoch": 0.769952804764221, + "grad_norm": 2.477880542696044, + "learning_rate": 6.375643601968598e-07, + "loss": 0.985, + "step": 7321 + }, + { + "epoch": 0.7700579752060683, + "grad_norm": 2.8094356003054846, + "learning_rate": 6.370075616364512e-07, + "loss": 1.0039, + "step": 7322 + }, + { + "epoch": 0.7701631456479157, + "grad_norm": 2.8079925876475476, + "learning_rate": 6.364509708141725e-07, + "loss": 1.0002, + "step": 7323 + }, + { + "epoch": 0.770268316089763, + "grad_norm": 2.3156139981672035, + "learning_rate": 6.358945877920861e-07, + "loss": 0.9701, + "step": 7324 + }, + { + "epoch": 0.7703734865316103, + "grad_norm": 2.3629511594914994, + "learning_rate": 6.353384126322343e-07, + "loss": 1.0154, + "step": 7325 + }, + { + "epoch": 0.7704786569734576, + "grad_norm": 2.898189649288246, + "learning_rate": 6.347824453966354e-07, + "loss": 0.9892, + "step": 7326 + }, + { + "epoch": 0.770583827415305, + "grad_norm": 2.122895447793783, + "learning_rate": 6.342266861472823e-07, + "loss": 0.9902, + "step": 7327 + }, + { + "epoch": 0.7706889978571523, + "grad_norm": 2.7711699801067486, + "learning_rate": 6.336711349461486e-07, + "loss": 0.9745, + "step": 7328 + }, + { + "epoch": 0.7707941682989996, + "grad_norm": 2.5654011499268354, + "learning_rate": 6.331157918551801e-07, + "loss": 1.033, + "step": 7329 + }, + { + "epoch": 0.7708993387408469, + "grad_norm": 1.7248857187294304, + "learning_rate": 6.325606569363044e-07, + "loss": 0.9897, + "step": 7330 + }, + { + "epoch": 0.7710045091826943, + "grad_norm": 2.540837264737888, + "learning_rate": 6.320057302514223e-07, + "loss": 0.9716, + "step": 7331 + }, + { + "epoch": 0.7711096796245416, + "grad_norm": 2.1994727634563467, + "learning_rate": 6.314510118624121e-07, + "loss": 1.0289, + "step": 7332 + }, + { + "epoch": 0.7712148500663888, + "grad_norm": 3.2728653021918515, + "learning_rate": 6.308965018311289e-07, + "loss": 0.9727, + "step": 7333 + }, + { + "epoch": 0.7713200205082361, + "grad_norm": 2.323117451826077, + "learning_rate": 6.303422002194057e-07, + "loss": 1.0584, + "step": 7334 + }, + { + "epoch": 0.7714251909500834, + "grad_norm": 2.991321743338717, + "learning_rate": 6.297881070890519e-07, + "loss": 0.9433, + "step": 7335 + }, + { + "epoch": 
0.7715303613919308, + "grad_norm": 1.953839287290166, + "learning_rate": 6.292342225018517e-07, + "loss": 1.0006, + "step": 7336 + }, + { + "epoch": 0.7716355318337781, + "grad_norm": 2.0866972087128044, + "learning_rate": 6.286805465195691e-07, + "loss": 0.964, + "step": 7337 + }, + { + "epoch": 0.7717407022756254, + "grad_norm": 2.647572811142748, + "learning_rate": 6.281270792039418e-07, + "loss": 1.0017, + "step": 7338 + }, + { + "epoch": 0.7718458727174727, + "grad_norm": 2.3592122810731553, + "learning_rate": 6.275738206166862e-07, + "loss": 0.9775, + "step": 7339 + }, + { + "epoch": 0.7719510431593201, + "grad_norm": 1.9493966901790556, + "learning_rate": 6.270207708194959e-07, + "loss": 1.0169, + "step": 7340 + }, + { + "epoch": 0.7720562136011674, + "grad_norm": 2.4294207839002384, + "learning_rate": 6.264679298740389e-07, + "loss": 0.9461, + "step": 7341 + }, + { + "epoch": 0.7721613840430147, + "grad_norm": 2.1711248655387347, + "learning_rate": 6.259152978419625e-07, + "loss": 0.9646, + "step": 7342 + }, + { + "epoch": 0.772266554484862, + "grad_norm": 2.526321431390834, + "learning_rate": 6.253628747848872e-07, + "loss": 0.9981, + "step": 7343 + }, + { + "epoch": 0.7723717249267094, + "grad_norm": 2.4845350540111806, + "learning_rate": 6.248106607644155e-07, + "loss": 0.9788, + "step": 7344 + }, + { + "epoch": 0.7724768953685567, + "grad_norm": 2.3629873655101923, + "learning_rate": 6.242586558421216e-07, + "loss": 0.9314, + "step": 7345 + }, + { + "epoch": 0.772582065810404, + "grad_norm": 3.0934894712871963, + "learning_rate": 6.237068600795593e-07, + "loss": 0.9113, + "step": 7346 + }, + { + "epoch": 0.7726872362522513, + "grad_norm": 2.022489507710269, + "learning_rate": 6.23155273538257e-07, + "loss": 1.0347, + "step": 7347 + }, + { + "epoch": 0.7727924066940987, + "grad_norm": 1.9206629909055462, + "learning_rate": 6.226038962797218e-07, + "loss": 0.9649, + "step": 7348 + }, + { + "epoch": 0.772897577135946, + "grad_norm": 2.4285553532799167, + "learning_rate": 6.220527283654368e-07, + "loss": 0.9841, + "step": 7349 + }, + { + "epoch": 0.7730027475777933, + "grad_norm": 2.2432647116261912, + "learning_rate": 6.215017698568604e-07, + "loss": 0.9608, + "step": 7350 + }, + { + "epoch": 0.7731079180196406, + "grad_norm": 2.2176480767044664, + "learning_rate": 6.209510208154299e-07, + "loss": 0.9774, + "step": 7351 + }, + { + "epoch": 0.773213088461488, + "grad_norm": 2.250001090894873, + "learning_rate": 6.204004813025569e-07, + "loss": 1.0104, + "step": 7352 + }, + { + "epoch": 0.7733182589033352, + "grad_norm": 1.5454534677927962, + "learning_rate": 6.198501513796315e-07, + "loss": 0.9279, + "step": 7353 + }, + { + "epoch": 0.7734234293451825, + "grad_norm": 2.839261743707263, + "learning_rate": 6.193000311080203e-07, + "loss": 0.967, + "step": 7354 + }, + { + "epoch": 0.7735285997870298, + "grad_norm": 2.6135585390560903, + "learning_rate": 6.187501205490648e-07, + "loss": 0.9666, + "step": 7355 + }, + { + "epoch": 0.7736337702288771, + "grad_norm": 2.441585421023636, + "learning_rate": 6.18200419764085e-07, + "loss": 0.9487, + "step": 7356 + }, + { + "epoch": 0.7737389406707245, + "grad_norm": 2.154135654667443, + "learning_rate": 6.176509288143768e-07, + "loss": 0.9512, + "step": 7357 + }, + { + "epoch": 0.7738441111125718, + "grad_norm": 2.0470908979748317, + "learning_rate": 6.17101647761213e-07, + "loss": 1.009, + "step": 7358 + }, + { + "epoch": 0.7739492815544191, + "grad_norm": 2.30034891783103, + "learning_rate": 6.16552576665842e-07, + "loss": 0.9784, + "step": 
7359 + }, + { + "epoch": 0.7740544519962664, + "grad_norm": 2.796522111094815, + "learning_rate": 6.160037155894902e-07, + "loss": 0.9799, + "step": 7360 + }, + { + "epoch": 0.7741596224381138, + "grad_norm": 2.635375796645631, + "learning_rate": 6.154550645933591e-07, + "loss": 0.9638, + "step": 7361 + }, + { + "epoch": 0.7742647928799611, + "grad_norm": 2.5343887605480924, + "learning_rate": 6.149066237386278e-07, + "loss": 1.0155, + "step": 7362 + }, + { + "epoch": 0.7743699633218084, + "grad_norm": 2.3060590549923874, + "learning_rate": 6.143583930864527e-07, + "loss": 0.9765, + "step": 7363 + }, + { + "epoch": 0.7744751337636557, + "grad_norm": 2.196968553332674, + "learning_rate": 6.138103726979641e-07, + "loss": 0.9767, + "step": 7364 + }, + { + "epoch": 0.7745803042055031, + "grad_norm": 1.8053420148777697, + "learning_rate": 6.132625626342723e-07, + "loss": 1.0004, + "step": 7365 + }, + { + "epoch": 0.7746854746473504, + "grad_norm": 1.974966755215082, + "learning_rate": 6.127149629564605e-07, + "loss": 1.0041, + "step": 7366 + }, + { + "epoch": 0.7747906450891977, + "grad_norm": 2.4074095725370777, + "learning_rate": 6.121675737255913e-07, + "loss": 0.9991, + "step": 7367 + }, + { + "epoch": 0.774895815531045, + "grad_norm": 2.771485579995371, + "learning_rate": 6.116203950027036e-07, + "loss": 0.9738, + "step": 7368 + }, + { + "epoch": 0.7750009859728924, + "grad_norm": 2.5495785766223857, + "learning_rate": 6.110734268488106e-07, + "loss": 1.0109, + "step": 7369 + }, + { + "epoch": 0.7751061564147397, + "grad_norm": 2.665289353085339, + "learning_rate": 6.105266693249043e-07, + "loss": 0.9922, + "step": 7370 + }, + { + "epoch": 0.775211326856587, + "grad_norm": 3.2766595461201926, + "learning_rate": 6.099801224919522e-07, + "loss": 1.0238, + "step": 7371 + }, + { + "epoch": 0.7753164972984343, + "grad_norm": 2.810829062797779, + "learning_rate": 6.094337864108993e-07, + "loss": 0.9732, + "step": 7372 + }, + { + "epoch": 0.7754216677402815, + "grad_norm": 2.2147571794146486, + "learning_rate": 6.08887661142665e-07, + "loss": 0.9908, + "step": 7373 + }, + { + "epoch": 0.7755268381821289, + "grad_norm": 2.1162563329579482, + "learning_rate": 6.083417467481479e-07, + "loss": 0.9771, + "step": 7374 + }, + { + "epoch": 0.7756320086239762, + "grad_norm": 3.0165229052813265, + "learning_rate": 6.077960432882202e-07, + "loss": 0.9558, + "step": 7375 + }, + { + "epoch": 0.7757371790658235, + "grad_norm": 2.780788058142792, + "learning_rate": 6.072505508237328e-07, + "loss": 0.9393, + "step": 7376 + }, + { + "epoch": 0.7758423495076708, + "grad_norm": 2.746344323741756, + "learning_rate": 6.067052694155132e-07, + "loss": 0.9889, + "step": 7377 + }, + { + "epoch": 0.7759475199495182, + "grad_norm": 2.8304408738719853, + "learning_rate": 6.061601991243629e-07, + "loss": 0.976, + "step": 7378 + }, + { + "epoch": 0.7760526903913655, + "grad_norm": 2.7760091448267428, + "learning_rate": 6.056153400110623e-07, + "loss": 0.9813, + "step": 7379 + }, + { + "epoch": 0.7761578608332128, + "grad_norm": 2.5521937340727234, + "learning_rate": 6.050706921363672e-07, + "loss": 0.9967, + "step": 7380 + }, + { + "epoch": 0.7762630312750601, + "grad_norm": 2.179859514373383, + "learning_rate": 6.045262555610113e-07, + "loss": 0.9829, + "step": 7381 + }, + { + "epoch": 0.7763682017169075, + "grad_norm": 2.7632582697479187, + "learning_rate": 6.039820303457022e-07, + "loss": 1.0111, + "step": 7382 + }, + { + "epoch": 0.7764733721587548, + "grad_norm": 2.822582215401159, + "learning_rate": 
6.034380165511247e-07, + "loss": 0.996, + "step": 7383 + }, + { + "epoch": 0.7765785426006021, + "grad_norm": 2.8025819156968517, + "learning_rate": 6.028942142379416e-07, + "loss": 0.9985, + "step": 7384 + }, + { + "epoch": 0.7766837130424494, + "grad_norm": 2.311116198164196, + "learning_rate": 6.023506234667908e-07, + "loss": 0.9911, + "step": 7385 + }, + { + "epoch": 0.7767888834842968, + "grad_norm": 2.7484844931535504, + "learning_rate": 6.018072442982875e-07, + "loss": 1.0146, + "step": 7386 + }, + { + "epoch": 0.7768940539261441, + "grad_norm": 2.3912901717058896, + "learning_rate": 6.012640767930217e-07, + "loss": 0.9586, + "step": 7387 + }, + { + "epoch": 0.7769992243679914, + "grad_norm": 2.7928691824138614, + "learning_rate": 6.00721121011561e-07, + "loss": 0.9506, + "step": 7388 + }, + { + "epoch": 0.7771043948098387, + "grad_norm": 2.0128303006970305, + "learning_rate": 6.001783770144504e-07, + "loss": 1.0171, + "step": 7389 + }, + { + "epoch": 0.777209565251686, + "grad_norm": 2.240404598690575, + "learning_rate": 5.99635844862208e-07, + "loss": 0.9801, + "step": 7390 + }, + { + "epoch": 0.7773147356935334, + "grad_norm": 2.001832281945611, + "learning_rate": 5.990935246153326e-07, + "loss": 0.9841, + "step": 7391 + }, + { + "epoch": 0.7774199061353807, + "grad_norm": 2.653987884349642, + "learning_rate": 5.985514163342948e-07, + "loss": 0.9892, + "step": 7392 + }, + { + "epoch": 0.777525076577228, + "grad_norm": 2.5723150950012834, + "learning_rate": 5.980095200795452e-07, + "loss": 0.9935, + "step": 7393 + }, + { + "epoch": 0.7776302470190752, + "grad_norm": 2.0286439737482875, + "learning_rate": 5.974678359115094e-07, + "loss": 0.9336, + "step": 7394 + }, + { + "epoch": 0.7777354174609226, + "grad_norm": 2.672262654260771, + "learning_rate": 5.9692636389059e-07, + "loss": 0.9862, + "step": 7395 + }, + { + "epoch": 0.7778405879027699, + "grad_norm": 2.9017249311317173, + "learning_rate": 5.963851040771639e-07, + "loss": 0.9836, + "step": 7396 + }, + { + "epoch": 0.7779457583446172, + "grad_norm": 3.0804120297128468, + "learning_rate": 5.958440565315871e-07, + "loss": 1.0163, + "step": 7397 + }, + { + "epoch": 0.7780509287864645, + "grad_norm": 2.6802032536538176, + "learning_rate": 5.953032213141894e-07, + "loss": 0.9765, + "step": 7398 + }, + { + "epoch": 0.7781560992283119, + "grad_norm": 2.051038288544476, + "learning_rate": 5.947625984852787e-07, + "loss": 0.9461, + "step": 7399 + }, + { + "epoch": 0.7782612696701592, + "grad_norm": 2.1836813883193362, + "learning_rate": 5.942221881051394e-07, + "loss": 0.9772, + "step": 7400 + }, + { + "epoch": 0.7783664401120065, + "grad_norm": 2.6840419851820956, + "learning_rate": 5.936819902340299e-07, + "loss": 1.0152, + "step": 7401 + }, + { + "epoch": 0.7784716105538538, + "grad_norm": 2.431122357081175, + "learning_rate": 5.931420049321873e-07, + "loss": 0.9785, + "step": 7402 + }, + { + "epoch": 0.7785767809957012, + "grad_norm": 2.439690169026877, + "learning_rate": 5.926022322598249e-07, + "loss": 0.9506, + "step": 7403 + }, + { + "epoch": 0.7786819514375485, + "grad_norm": 3.004232477345118, + "learning_rate": 5.920626722771303e-07, + "loss": 0.9238, + "step": 7404 + }, + { + "epoch": 0.7787871218793958, + "grad_norm": 2.7134097989656207, + "learning_rate": 5.915233250442695e-07, + "loss": 0.9726, + "step": 7405 + }, + { + "epoch": 0.7788922923212431, + "grad_norm": 3.4146505334812702, + "learning_rate": 5.909841906213828e-07, + "loss": 1.0107, + "step": 7406 + }, + { + "epoch": 0.7789974627630905, + "grad_norm": 
1.9095501012682825, + "learning_rate": 5.904452690685888e-07, + "loss": 1.0127, + "step": 7407 + }, + { + "epoch": 0.7791026332049378, + "grad_norm": 2.5125920673209254, + "learning_rate": 5.899065604459814e-07, + "loss": 0.9826, + "step": 7408 + }, + { + "epoch": 0.7792078036467851, + "grad_norm": 2.028561314466869, + "learning_rate": 5.893680648136311e-07, + "loss": 0.9689, + "step": 7409 + }, + { + "epoch": 0.7793129740886324, + "grad_norm": 2.20112168268948, + "learning_rate": 5.888297822315831e-07, + "loss": 0.9506, + "step": 7410 + }, + { + "epoch": 0.7794181445304798, + "grad_norm": 1.8066767860491197, + "learning_rate": 5.882917127598608e-07, + "loss": 0.9993, + "step": 7411 + }, + { + "epoch": 0.7795233149723271, + "grad_norm": 2.975379972499003, + "learning_rate": 5.877538564584642e-07, + "loss": 0.9631, + "step": 7412 + }, + { + "epoch": 0.7796284854141744, + "grad_norm": 2.533926267530022, + "learning_rate": 5.872162133873666e-07, + "loss": 0.9686, + "step": 7413 + }, + { + "epoch": 0.7797336558560216, + "grad_norm": 2.5144691976081175, + "learning_rate": 5.866787836065211e-07, + "loss": 1.0199, + "step": 7414 + }, + { + "epoch": 0.7798388262978689, + "grad_norm": 1.9435003292682747, + "learning_rate": 5.861415671758536e-07, + "loss": 0.977, + "step": 7415 + }, + { + "epoch": 0.7799439967397163, + "grad_norm": 2.516714132211873, + "learning_rate": 5.856045641552685e-07, + "loss": 1.0082, + "step": 7416 + }, + { + "epoch": 0.7800491671815636, + "grad_norm": 2.283896919126085, + "learning_rate": 5.850677746046471e-07, + "loss": 0.9556, + "step": 7417 + }, + { + "epoch": 0.7801543376234109, + "grad_norm": 2.396317543406843, + "learning_rate": 5.845311985838437e-07, + "loss": 1.0118, + "step": 7418 + }, + { + "epoch": 0.7802595080652582, + "grad_norm": 1.8411662172460828, + "learning_rate": 5.83994836152692e-07, + "loss": 1.0051, + "step": 7419 + }, + { + "epoch": 0.7803646785071056, + "grad_norm": 2.1120279732981038, + "learning_rate": 5.83458687371e-07, + "loss": 0.9711, + "step": 7420 + }, + { + "epoch": 0.7804698489489529, + "grad_norm": 2.5159122284997815, + "learning_rate": 5.82922752298552e-07, + "loss": 1.0153, + "step": 7421 + }, + { + "epoch": 0.7805750193908002, + "grad_norm": 3.0017293005335217, + "learning_rate": 5.823870309951096e-07, + "loss": 0.9498, + "step": 7422 + }, + { + "epoch": 0.7806801898326475, + "grad_norm": 2.7311817018256264, + "learning_rate": 5.818515235204109e-07, + "loss": 1.0429, + "step": 7423 + }, + { + "epoch": 0.7807853602744949, + "grad_norm": 2.1998485383151873, + "learning_rate": 5.813162299341669e-07, + "loss": 0.971, + "step": 7424 + }, + { + "epoch": 0.7808905307163422, + "grad_norm": 2.364551392600312, + "learning_rate": 5.807811502960683e-07, + "loss": 0.9979, + "step": 7425 + }, + { + "epoch": 0.7809957011581895, + "grad_norm": 2.574638685273489, + "learning_rate": 5.802462846657811e-07, + "loss": 1.0097, + "step": 7426 + }, + { + "epoch": 0.7811008716000368, + "grad_norm": 2.1856284743115335, + "learning_rate": 5.797116331029456e-07, + "loss": 0.9878, + "step": 7427 + }, + { + "epoch": 0.7812060420418842, + "grad_norm": 2.1441801091958776, + "learning_rate": 5.79177195667181e-07, + "loss": 1.005, + "step": 7428 + }, + { + "epoch": 0.7813112124837315, + "grad_norm": 2.3913829746681805, + "learning_rate": 5.786429724180801e-07, + "loss": 0.9599, + "step": 7429 + }, + { + "epoch": 0.7814163829255788, + "grad_norm": 2.4163311751474006, + "learning_rate": 5.781089634152131e-07, + "loss": 0.9963, + "step": 7430 + }, + { + "epoch": 
0.7815215533674261, + "grad_norm": 3.0068250817206343, + "learning_rate": 5.775751687181266e-07, + "loss": 0.9787, + "step": 7431 + }, + { + "epoch": 0.7816267238092734, + "grad_norm": 2.896952363328142, + "learning_rate": 5.770415883863434e-07, + "loss": 1.0102, + "step": 7432 + }, + { + "epoch": 0.7817318942511208, + "grad_norm": 2.342426916784213, + "learning_rate": 5.765082224793611e-07, + "loss": 0.9588, + "step": 7433 + }, + { + "epoch": 0.781837064692968, + "grad_norm": 2.322954163018337, + "learning_rate": 5.75975071056653e-07, + "loss": 0.9995, + "step": 7434 + }, + { + "epoch": 0.7819422351348153, + "grad_norm": 2.7124852787883476, + "learning_rate": 5.75442134177672e-07, + "loss": 0.9848, + "step": 7435 + }, + { + "epoch": 0.7820474055766626, + "grad_norm": 2.262898311145278, + "learning_rate": 5.749094119018431e-07, + "loss": 1.0258, + "step": 7436 + }, + { + "epoch": 0.78215257601851, + "grad_norm": 2.4401944041421793, + "learning_rate": 5.743769042885697e-07, + "loss": 0.9959, + "step": 7437 + }, + { + "epoch": 0.7822577464603573, + "grad_norm": 3.501822098234867, + "learning_rate": 5.738446113972298e-07, + "loss": 1.0054, + "step": 7438 + }, + { + "epoch": 0.7823629169022046, + "grad_norm": 2.4007429912831073, + "learning_rate": 5.733125332871786e-07, + "loss": 0.9699, + "step": 7439 + }, + { + "epoch": 0.7824680873440519, + "grad_norm": 1.5286404234678088, + "learning_rate": 5.727806700177477e-07, + "loss": 0.9517, + "step": 7440 + }, + { + "epoch": 0.7825732577858993, + "grad_norm": 1.9106204956021087, + "learning_rate": 5.722490216482424e-07, + "loss": 0.9594, + "step": 7441 + }, + { + "epoch": 0.7826784282277466, + "grad_norm": 2.392573929678364, + "learning_rate": 5.717175882379473e-07, + "loss": 0.9796, + "step": 7442 + }, + { + "epoch": 0.7827835986695939, + "grad_norm": 2.4076467512658484, + "learning_rate": 5.711863698461198e-07, + "loss": 0.9668, + "step": 7443 + }, + { + "epoch": 0.7828887691114412, + "grad_norm": 2.9227877491691956, + "learning_rate": 5.706553665319955e-07, + "loss": 1.0166, + "step": 7444 + }, + { + "epoch": 0.7829939395532886, + "grad_norm": 1.7837281364826976, + "learning_rate": 5.701245783547856e-07, + "loss": 0.9527, + "step": 7445 + }, + { + "epoch": 0.7830991099951359, + "grad_norm": 3.8675285298144995, + "learning_rate": 5.695940053736779e-07, + "loss": 0.9783, + "step": 7446 + }, + { + "epoch": 0.7832042804369832, + "grad_norm": 2.6749508025236657, + "learning_rate": 5.690636476478337e-07, + "loss": 0.9893, + "step": 7447 + }, + { + "epoch": 0.7833094508788305, + "grad_norm": 2.344089841049213, + "learning_rate": 5.685335052363927e-07, + "loss": 0.9719, + "step": 7448 + }, + { + "epoch": 0.7834146213206779, + "grad_norm": 2.6937529790986607, + "learning_rate": 5.680035781984708e-07, + "loss": 0.9855, + "step": 7449 + }, + { + "epoch": 0.7835197917625252, + "grad_norm": 2.309732894716535, + "learning_rate": 5.674738665931575e-07, + "loss": 0.9556, + "step": 7450 + }, + { + "epoch": 0.7836249622043725, + "grad_norm": 2.380015885121929, + "learning_rate": 5.669443704795214e-07, + "loss": 1.007, + "step": 7451 + }, + { + "epoch": 0.7837301326462198, + "grad_norm": 2.175460160357494, + "learning_rate": 5.664150899166035e-07, + "loss": 0.9653, + "step": 7452 + }, + { + "epoch": 0.7838353030880671, + "grad_norm": 2.493955497046477, + "learning_rate": 5.658860249634237e-07, + "loss": 0.9901, + "step": 7453 + }, + { + "epoch": 0.7839404735299145, + "grad_norm": 3.0014222312856185, + "learning_rate": 5.653571756789777e-07, + "loss": 0.9816, + 
"step": 7454 + }, + { + "epoch": 0.7840456439717617, + "grad_norm": 1.879178780187409, + "learning_rate": 5.648285421222347e-07, + "loss": 0.9732, + "step": 7455 + }, + { + "epoch": 0.784150814413609, + "grad_norm": 2.4428944524881455, + "learning_rate": 5.643001243521429e-07, + "loss": 0.9955, + "step": 7456 + }, + { + "epoch": 0.7842559848554563, + "grad_norm": 2.4832015579932363, + "learning_rate": 5.637719224276228e-07, + "loss": 0.982, + "step": 7457 + }, + { + "epoch": 0.7843611552973037, + "grad_norm": 2.7676659547524247, + "learning_rate": 5.632439364075759e-07, + "loss": 1.0221, + "step": 7458 + }, + { + "epoch": 0.784466325739151, + "grad_norm": 2.900996870688416, + "learning_rate": 5.627161663508745e-07, + "loss": 0.9675, + "step": 7459 + }, + { + "epoch": 0.7845714961809983, + "grad_norm": 2.6087512206617434, + "learning_rate": 5.621886123163708e-07, + "loss": 1.0026, + "step": 7460 + }, + { + "epoch": 0.7846766666228456, + "grad_norm": 2.075036647944145, + "learning_rate": 5.616612743628896e-07, + "loss": 0.9838, + "step": 7461 + }, + { + "epoch": 0.784781837064693, + "grad_norm": 2.300780190568228, + "learning_rate": 5.611341525492337e-07, + "loss": 0.9434, + "step": 7462 + }, + { + "epoch": 0.7848870075065403, + "grad_norm": 1.8042294075183316, + "learning_rate": 5.606072469341823e-07, + "loss": 0.9684, + "step": 7463 + }, + { + "epoch": 0.7849921779483876, + "grad_norm": 2.5965873366492382, + "learning_rate": 5.600805575764878e-07, + "loss": 1.0404, + "step": 7464 + }, + { + "epoch": 0.7850973483902349, + "grad_norm": 2.5263383523718552, + "learning_rate": 5.595540845348815e-07, + "loss": 0.96, + "step": 7465 + }, + { + "epoch": 0.7852025188320823, + "grad_norm": 2.5277784570322916, + "learning_rate": 5.590278278680683e-07, + "loss": 1.0546, + "step": 7466 + }, + { + "epoch": 0.7853076892739296, + "grad_norm": 2.7356237220044166, + "learning_rate": 5.585017876347301e-07, + "loss": 0.9886, + "step": 7467 + }, + { + "epoch": 0.7854128597157769, + "grad_norm": 2.8495412546660472, + "learning_rate": 5.579759638935254e-07, + "loss": 1.0331, + "step": 7468 + }, + { + "epoch": 0.7855180301576242, + "grad_norm": 2.053798417338012, + "learning_rate": 5.574503567030861e-07, + "loss": 0.9651, + "step": 7469 + }, + { + "epoch": 0.7856232005994715, + "grad_norm": 3.106394812966373, + "learning_rate": 5.569249661220233e-07, + "loss": 0.9802, + "step": 7470 + }, + { + "epoch": 0.7857283710413189, + "grad_norm": 2.8187679442298954, + "learning_rate": 5.563997922089193e-07, + "loss": 1.0118, + "step": 7471 + }, + { + "epoch": 0.7858335414831662, + "grad_norm": 2.5263959039316113, + "learning_rate": 5.558748350223386e-07, + "loss": 0.9958, + "step": 7472 + }, + { + "epoch": 0.7859387119250135, + "grad_norm": 2.6283622778437747, + "learning_rate": 5.553500946208154e-07, + "loss": 0.9912, + "step": 7473 + }, + { + "epoch": 0.7860438823668608, + "grad_norm": 1.9388197155123486, + "learning_rate": 5.548255710628639e-07, + "loss": 1.0043, + "step": 7474 + }, + { + "epoch": 0.7861490528087081, + "grad_norm": 3.725787391804805, + "learning_rate": 5.543012644069714e-07, + "loss": 0.9957, + "step": 7475 + }, + { + "epoch": 0.7862542232505554, + "grad_norm": 2.8279644242846778, + "learning_rate": 5.537771747116024e-07, + "loss": 1.0032, + "step": 7476 + }, + { + "epoch": 0.7863593936924027, + "grad_norm": 2.05494665096958, + "learning_rate": 5.53253302035198e-07, + "loss": 0.9707, + "step": 7477 + }, + { + "epoch": 0.78646456413425, + "grad_norm": 2.2246914275505922, + "learning_rate": 
5.527296464361725e-07, + "loss": 0.9912, + "step": 7478 + }, + { + "epoch": 0.7865697345760974, + "grad_norm": 2.390526330629458, + "learning_rate": 5.522062079729192e-07, + "loss": 1.0125, + "step": 7479 + }, + { + "epoch": 0.7866749050179447, + "grad_norm": 2.3621903521012877, + "learning_rate": 5.516829867038034e-07, + "loss": 0.9723, + "step": 7480 + }, + { + "epoch": 0.786780075459792, + "grad_norm": 2.432774028158043, + "learning_rate": 5.511599826871708e-07, + "loss": 0.9861, + "step": 7481 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 3.122179435897935, + "learning_rate": 5.506371959813386e-07, + "loss": 0.9478, + "step": 7482 + }, + { + "epoch": 0.7869904163434867, + "grad_norm": 2.2616352167488705, + "learning_rate": 5.501146266446031e-07, + "loss": 0.9887, + "step": 7483 + }, + { + "epoch": 0.787095586785334, + "grad_norm": 1.7783208579319671, + "learning_rate": 5.495922747352336e-07, + "loss": 0.9767, + "step": 7484 + }, + { + "epoch": 0.7872007572271813, + "grad_norm": 2.7785258155561667, + "learning_rate": 5.490701403114757e-07, + "loss": 1.0049, + "step": 7485 + }, + { + "epoch": 0.7873059276690286, + "grad_norm": 2.063969890861771, + "learning_rate": 5.485482234315537e-07, + "loss": 0.994, + "step": 7486 + }, + { + "epoch": 0.787411098110876, + "grad_norm": 1.9172938948485974, + "learning_rate": 5.480265241536636e-07, + "loss": 0.981, + "step": 7487 + }, + { + "epoch": 0.7875162685527233, + "grad_norm": 2.3568236845146515, + "learning_rate": 5.475050425359805e-07, + "loss": 0.9963, + "step": 7488 + }, + { + "epoch": 0.7876214389945706, + "grad_norm": 2.365770278271118, + "learning_rate": 5.469837786366514e-07, + "loss": 0.9908, + "step": 7489 + }, + { + "epoch": 0.7877266094364179, + "grad_norm": 2.0233878116931843, + "learning_rate": 5.464627325138031e-07, + "loss": 0.955, + "step": 7490 + }, + { + "epoch": 0.7878317798782652, + "grad_norm": 2.8116469349236874, + "learning_rate": 5.45941904225536e-07, + "loss": 0.9791, + "step": 7491 + }, + { + "epoch": 0.7879369503201126, + "grad_norm": 2.389189229680465, + "learning_rate": 5.454212938299256e-07, + "loss": 0.9532, + "step": 7492 + }, + { + "epoch": 0.7880421207619599, + "grad_norm": 3.2244042201754755, + "learning_rate": 5.449009013850253e-07, + "loss": 1.0245, + "step": 7493 + }, + { + "epoch": 0.7881472912038072, + "grad_norm": 2.350726006073612, + "learning_rate": 5.443807269488613e-07, + "loss": 0.9812, + "step": 7494 + }, + { + "epoch": 0.7882524616456544, + "grad_norm": 2.33734989566254, + "learning_rate": 5.438607705794391e-07, + "loss": 0.977, + "step": 7495 + }, + { + "epoch": 0.7883576320875018, + "grad_norm": 3.1617106808359474, + "learning_rate": 5.43341032334736e-07, + "loss": 1.0034, + "step": 7496 + }, + { + "epoch": 0.7884628025293491, + "grad_norm": 2.062664190865852, + "learning_rate": 5.428215122727084e-07, + "loss": 0.9338, + "step": 7497 + }, + { + "epoch": 0.7885679729711964, + "grad_norm": 2.541915976209122, + "learning_rate": 5.423022104512854e-07, + "loss": 1.01, + "step": 7498 + }, + { + "epoch": 0.7886731434130437, + "grad_norm": 2.6573006964220665, + "learning_rate": 5.417831269283741e-07, + "loss": 0.9934, + "step": 7499 + }, + { + "epoch": 0.7887783138548911, + "grad_norm": 3.667198982465556, + "learning_rate": 5.412642617618565e-07, + "loss": 1.0216, + "step": 7500 + }, + { + "epoch": 0.7888834842967384, + "grad_norm": 2.450330102335447, + "learning_rate": 5.407456150095891e-07, + "loss": 0.9705, + "step": 7501 + }, + { + "epoch": 0.7889886547385857, + "grad_norm": 
2.6639634437563466, + "learning_rate": 5.402271867294062e-07, + "loss": 1.0375, + "step": 7502 + }, + { + "epoch": 0.789093825180433, + "grad_norm": 2.215568756488248, + "learning_rate": 5.39708976979115e-07, + "loss": 0.9998, + "step": 7503 + }, + { + "epoch": 0.7891989956222804, + "grad_norm": 2.7328182673914556, + "learning_rate": 5.39190985816502e-07, + "loss": 1.0026, + "step": 7504 + }, + { + "epoch": 0.7893041660641277, + "grad_norm": 2.3163502581110684, + "learning_rate": 5.386732132993264e-07, + "loss": 0.9868, + "step": 7505 + }, + { + "epoch": 0.789409336505975, + "grad_norm": 2.0522924122717083, + "learning_rate": 5.381556594853226e-07, + "loss": 0.9908, + "step": 7506 + }, + { + "epoch": 0.7895145069478223, + "grad_norm": 2.6138338131436214, + "learning_rate": 5.376383244322039e-07, + "loss": 0.9801, + "step": 7507 + }, + { + "epoch": 0.7896196773896696, + "grad_norm": 1.6691242712576417, + "learning_rate": 5.371212081976548e-07, + "loss": 0.9766, + "step": 7508 + }, + { + "epoch": 0.789724847831517, + "grad_norm": 2.8741546215777496, + "learning_rate": 5.366043108393407e-07, + "loss": 0.9829, + "step": 7509 + }, + { + "epoch": 0.7898300182733643, + "grad_norm": 2.2165275941322284, + "learning_rate": 5.360876324148972e-07, + "loss": 0.9629, + "step": 7510 + }, + { + "epoch": 0.7899351887152116, + "grad_norm": 1.605507267413963, + "learning_rate": 5.355711729819396e-07, + "loss": 0.9566, + "step": 7511 + }, + { + "epoch": 0.790040359157059, + "grad_norm": 2.2184467134191035, + "learning_rate": 5.350549325980558e-07, + "loss": 0.9624, + "step": 7512 + }, + { + "epoch": 0.7901455295989063, + "grad_norm": 2.840349144152613, + "learning_rate": 5.34538911320811e-07, + "loss": 0.9807, + "step": 7513 + }, + { + "epoch": 0.7902507000407536, + "grad_norm": 2.6440663804972333, + "learning_rate": 5.340231092077469e-07, + "loss": 0.998, + "step": 7514 + }, + { + "epoch": 0.7903558704826009, + "grad_norm": 2.9746470518481423, + "learning_rate": 5.335075263163774e-07, + "loss": 0.9896, + "step": 7515 + }, + { + "epoch": 0.7904610409244481, + "grad_norm": 2.217261654951149, + "learning_rate": 5.329921627041959e-07, + "loss": 0.9776, + "step": 7516 + }, + { + "epoch": 0.7905662113662955, + "grad_norm": 2.275129661652413, + "learning_rate": 5.324770184286668e-07, + "loss": 0.9754, + "step": 7517 + }, + { + "epoch": 0.7906713818081428, + "grad_norm": 2.125756351189819, + "learning_rate": 5.319620935472361e-07, + "loss": 0.9576, + "step": 7518 + }, + { + "epoch": 0.7907765522499901, + "grad_norm": 1.8544116319858905, + "learning_rate": 5.314473881173193e-07, + "loss": 1.0268, + "step": 7519 + }, + { + "epoch": 0.7908817226918374, + "grad_norm": 2.7586623648851947, + "learning_rate": 5.309329021963116e-07, + "loss": 0.9541, + "step": 7520 + }, + { + "epoch": 0.7909868931336848, + "grad_norm": 2.0077432117735503, + "learning_rate": 5.30418635841582e-07, + "loss": 1.0106, + "step": 7521 + }, + { + "epoch": 0.7910920635755321, + "grad_norm": 2.211322527248184, + "learning_rate": 5.299045891104729e-07, + "loss": 0.9729, + "step": 7522 + }, + { + "epoch": 0.7911972340173794, + "grad_norm": 3.561907418437129, + "learning_rate": 5.293907620603081e-07, + "loss": 1.0103, + "step": 7523 + }, + { + "epoch": 0.7913024044592267, + "grad_norm": 2.919228767233691, + "learning_rate": 5.288771547483807e-07, + "loss": 1.0016, + "step": 7524 + }, + { + "epoch": 0.791407574901074, + "grad_norm": 2.4514739629485764, + "learning_rate": 5.283637672319633e-07, + "loss": 1.0114, + "step": 7525 + }, + { + "epoch": 
0.7915127453429214, + "grad_norm": 2.248230346805148, + "learning_rate": 5.278505995683014e-07, + "loss": 0.9975, + "step": 7526 + }, + { + "epoch": 0.7916179157847687, + "grad_norm": 2.299965654128683, + "learning_rate": 5.273376518146181e-07, + "loss": 0.9858, + "step": 7527 + }, + { + "epoch": 0.791723086226616, + "grad_norm": 2.631945993060661, + "learning_rate": 5.268249240281112e-07, + "loss": 1.053, + "step": 7528 + }, + { + "epoch": 0.7918282566684633, + "grad_norm": 2.965446830432594, + "learning_rate": 5.263124162659531e-07, + "loss": 1.0042, + "step": 7529 + }, + { + "epoch": 0.7919334271103107, + "grad_norm": 2.8523687341385866, + "learning_rate": 5.258001285852931e-07, + "loss": 0.998, + "step": 7530 + }, + { + "epoch": 0.792038597552158, + "grad_norm": 2.8763350701028076, + "learning_rate": 5.252880610432537e-07, + "loss": 0.9963, + "step": 7531 + }, + { + "epoch": 0.7921437679940053, + "grad_norm": 2.7401215806024384, + "learning_rate": 5.247762136969367e-07, + "loss": 0.9984, + "step": 7532 + }, + { + "epoch": 0.7922489384358526, + "grad_norm": 2.381855054501238, + "learning_rate": 5.242645866034154e-07, + "loss": 0.9807, + "step": 7533 + }, + { + "epoch": 0.7923541088777, + "grad_norm": 2.2923915086079303, + "learning_rate": 5.237531798197415e-07, + "loss": 0.9697, + "step": 7534 + }, + { + "epoch": 0.7924592793195473, + "grad_norm": 2.5095749130378695, + "learning_rate": 5.232419934029395e-07, + "loss": 0.9681, + "step": 7535 + }, + { + "epoch": 0.7925644497613945, + "grad_norm": 2.3453279983832793, + "learning_rate": 5.227310274100112e-07, + "loss": 1.0093, + "step": 7536 + }, + { + "epoch": 0.7926696202032418, + "grad_norm": 2.140545124978573, + "learning_rate": 5.222202818979338e-07, + "loss": 0.9813, + "step": 7537 + }, + { + "epoch": 0.7927747906450892, + "grad_norm": 2.4063998911718634, + "learning_rate": 5.217097569236581e-07, + "loss": 0.9952, + "step": 7538 + }, + { + "epoch": 0.7928799610869365, + "grad_norm": 3.1685790964036076, + "learning_rate": 5.211994525441133e-07, + "loss": 0.9982, + "step": 7539 + }, + { + "epoch": 0.7929851315287838, + "grad_norm": 2.789219170600396, + "learning_rate": 5.20689368816201e-07, + "loss": 1.0258, + "step": 7540 + }, + { + "epoch": 0.7930903019706311, + "grad_norm": 2.2210430744775196, + "learning_rate": 5.201795057967995e-07, + "loss": 1.0254, + "step": 7541 + }, + { + "epoch": 0.7931954724124785, + "grad_norm": 2.7466488855873314, + "learning_rate": 5.196698635427636e-07, + "loss": 0.9965, + "step": 7542 + }, + { + "epoch": 0.7933006428543258, + "grad_norm": 2.225180678588441, + "learning_rate": 5.191604421109211e-07, + "loss": 0.986, + "step": 7543 + }, + { + "epoch": 0.7934058132961731, + "grad_norm": 2.553086727079915, + "learning_rate": 5.186512415580778e-07, + "loss": 0.971, + "step": 7544 + }, + { + "epoch": 0.7935109837380204, + "grad_norm": 2.382433854163622, + "learning_rate": 5.181422619410109e-07, + "loss": 0.9843, + "step": 7545 + }, + { + "epoch": 0.7936161541798677, + "grad_norm": 2.3343492155076127, + "learning_rate": 5.176335033164793e-07, + "loss": 1.0111, + "step": 7546 + }, + { + "epoch": 0.7937213246217151, + "grad_norm": 2.256309064903577, + "learning_rate": 5.171249657412106e-07, + "loss": 0.9924, + "step": 7547 + }, + { + "epoch": 0.7938264950635624, + "grad_norm": 2.8424568000686192, + "learning_rate": 5.166166492719124e-07, + "loss": 0.9672, + "step": 7548 + }, + { + "epoch": 0.7939316655054097, + "grad_norm": 2.563574424214626, + "learning_rate": 5.161085539652649e-07, + "loss": 0.9811, + "step": 
7549 + }, + { + "epoch": 0.794036835947257, + "grad_norm": 1.8740387198796324, + "learning_rate": 5.156006798779248e-07, + "loss": 1.0023, + "step": 7550 + }, + { + "epoch": 0.7941420063891044, + "grad_norm": 2.7294780666990612, + "learning_rate": 5.15093027066525e-07, + "loss": 0.9843, + "step": 7551 + }, + { + "epoch": 0.7942471768309517, + "grad_norm": 2.4042826855611787, + "learning_rate": 5.145855955876713e-07, + "loss": 1.0067, + "step": 7552 + }, + { + "epoch": 0.794352347272799, + "grad_norm": 2.6315789434241643, + "learning_rate": 5.140783854979475e-07, + "loss": 0.9392, + "step": 7553 + }, + { + "epoch": 0.7944575177146463, + "grad_norm": 2.3247912098399364, + "learning_rate": 5.135713968539096e-07, + "loss": 1.0035, + "step": 7554 + }, + { + "epoch": 0.7945626881564937, + "grad_norm": 2.311734008689543, + "learning_rate": 5.130646297120936e-07, + "loss": 1.0073, + "step": 7555 + }, + { + "epoch": 0.7946678585983409, + "grad_norm": 2.771031149106594, + "learning_rate": 5.125580841290063e-07, + "loss": 0.9839, + "step": 7556 + }, + { + "epoch": 0.7947730290401882, + "grad_norm": 2.5613460622634263, + "learning_rate": 5.12051760161131e-07, + "loss": 0.964, + "step": 7557 + }, + { + "epoch": 0.7948781994820355, + "grad_norm": 2.9699017410914097, + "learning_rate": 5.115456578649272e-07, + "loss": 0.9706, + "step": 7558 + }, + { + "epoch": 0.7949833699238829, + "grad_norm": 2.8383038959737035, + "learning_rate": 5.110397772968295e-07, + "loss": 1.0041, + "step": 7559 + }, + { + "epoch": 0.7950885403657302, + "grad_norm": 2.4690199798689925, + "learning_rate": 5.105341185132481e-07, + "loss": 1.0097, + "step": 7560 + }, + { + "epoch": 0.7951937108075775, + "grad_norm": 2.253149181816096, + "learning_rate": 5.100286815705665e-07, + "loss": 0.973, + "step": 7561 + }, + { + "epoch": 0.7952988812494248, + "grad_norm": 2.6862303113933934, + "learning_rate": 5.095234665251464e-07, + "loss": 0.9876, + "step": 7562 + }, + { + "epoch": 0.7954040516912722, + "grad_norm": 2.7435662802894734, + "learning_rate": 5.090184734333214e-07, + "loss": 0.9361, + "step": 7563 + }, + { + "epoch": 0.7955092221331195, + "grad_norm": 2.149989633339049, + "learning_rate": 5.085137023514034e-07, + "loss": 1.0072, + "step": 7564 + }, + { + "epoch": 0.7956143925749668, + "grad_norm": 2.5740579082453494, + "learning_rate": 5.080091533356784e-07, + "loss": 0.9908, + "step": 7565 + }, + { + "epoch": 0.7957195630168141, + "grad_norm": 2.3471609710728822, + "learning_rate": 5.075048264424065e-07, + "loss": 1.0258, + "step": 7566 + }, + { + "epoch": 0.7958247334586614, + "grad_norm": 2.406035264757779, + "learning_rate": 5.070007217278253e-07, + "loss": 0.9846, + "step": 7567 + }, + { + "epoch": 0.7959299039005088, + "grad_norm": 2.117858329168068, + "learning_rate": 5.064968392481448e-07, + "loss": 0.9419, + "step": 7568 + }, + { + "epoch": 0.7960350743423561, + "grad_norm": 2.193661332028775, + "learning_rate": 5.059931790595538e-07, + "loss": 0.9726, + "step": 7569 + }, + { + "epoch": 0.7961402447842034, + "grad_norm": 3.4342436943341283, + "learning_rate": 5.054897412182128e-07, + "loss": 0.9513, + "step": 7570 + }, + { + "epoch": 0.7962454152260507, + "grad_norm": 2.6107851702888367, + "learning_rate": 5.049865257802599e-07, + "loss": 1.0154, + "step": 7571 + }, + { + "epoch": 0.7963505856678981, + "grad_norm": 2.593692528488154, + "learning_rate": 5.044835328018069e-07, + "loss": 0.9833, + "step": 7572 + }, + { + "epoch": 0.7964557561097454, + "grad_norm": 2.374728480707458, + "learning_rate": 
5.039807623389418e-07, + "loss": 0.9846, + "step": 7573 + }, + { + "epoch": 0.7965609265515927, + "grad_norm": 3.1091749055796667, + "learning_rate": 5.034782144477276e-07, + "loss": 1.0286, + "step": 7574 + }, + { + "epoch": 0.79666609699344, + "grad_norm": 2.3409299896756965, + "learning_rate": 5.029758891842015e-07, + "loss": 1.0111, + "step": 7575 + }, + { + "epoch": 0.7967712674352874, + "grad_norm": 2.659765654383437, + "learning_rate": 5.02473786604378e-07, + "loss": 0.9769, + "step": 7576 + }, + { + "epoch": 0.7968764378771346, + "grad_norm": 2.6745888226219923, + "learning_rate": 5.019719067642437e-07, + "loss": 0.9902, + "step": 7577 + }, + { + "epoch": 0.7969816083189819, + "grad_norm": 2.987835654743405, + "learning_rate": 5.014702497197632e-07, + "loss": 1.028, + "step": 7578 + }, + { + "epoch": 0.7970867787608292, + "grad_norm": 2.401381290760075, + "learning_rate": 5.009688155268755e-07, + "loss": 0.9726, + "step": 7579 + }, + { + "epoch": 0.7971919492026766, + "grad_norm": 2.120903536373421, + "learning_rate": 5.004676042414936e-07, + "loss": 0.9738, + "step": 7580 + }, + { + "epoch": 0.7972971196445239, + "grad_norm": 2.2322743664947065, + "learning_rate": 4.999666159195063e-07, + "loss": 0.9493, + "step": 7581 + }, + { + "epoch": 0.7974022900863712, + "grad_norm": 3.0099693723923004, + "learning_rate": 4.994658506167782e-07, + "loss": 1.0182, + "step": 7582 + }, + { + "epoch": 0.7975074605282185, + "grad_norm": 1.656585929583154, + "learning_rate": 4.989653083891493e-07, + "loss": 0.9718, + "step": 7583 + }, + { + "epoch": 0.7976126309700659, + "grad_norm": 2.5313550286466056, + "learning_rate": 4.984649892924323e-07, + "loss": 0.9698, + "step": 7584 + }, + { + "epoch": 0.7977178014119132, + "grad_norm": 3.027950770126478, + "learning_rate": 4.979648933824183e-07, + "loss": 1.0244, + "step": 7585 + }, + { + "epoch": 0.7978229718537605, + "grad_norm": 3.251073308685086, + "learning_rate": 4.974650207148701e-07, + "loss": 0.9823, + "step": 7586 + }, + { + "epoch": 0.7979281422956078, + "grad_norm": 2.2190156660054363, + "learning_rate": 4.969653713455286e-07, + "loss": 0.979, + "step": 7587 + }, + { + "epoch": 0.7980333127374551, + "grad_norm": 2.1132623922709732, + "learning_rate": 4.964659453301088e-07, + "loss": 0.9428, + "step": 7588 + }, + { + "epoch": 0.7981384831793025, + "grad_norm": 2.2951743727023306, + "learning_rate": 4.959667427242995e-07, + "loss": 0.9855, + "step": 7589 + }, + { + "epoch": 0.7982436536211498, + "grad_norm": 2.2665071186589705, + "learning_rate": 4.954677635837668e-07, + "loss": 0.9737, + "step": 7590 + }, + { + "epoch": 0.7983488240629971, + "grad_norm": 2.327325568834645, + "learning_rate": 4.949690079641498e-07, + "loss": 0.9589, + "step": 7591 + }, + { + "epoch": 0.7984539945048444, + "grad_norm": 2.8604374797701353, + "learning_rate": 4.944704759210637e-07, + "loss": 0.9786, + "step": 7592 + }, + { + "epoch": 0.7985591649466918, + "grad_norm": 2.2699713114872684, + "learning_rate": 4.939721675101001e-07, + "loss": 0.9744, + "step": 7593 + }, + { + "epoch": 0.7986643353885391, + "grad_norm": 2.1867243451965, + "learning_rate": 4.934740827868221e-07, + "loss": 0.9416, + "step": 7594 + }, + { + "epoch": 0.7987695058303864, + "grad_norm": 2.032577080022669, + "learning_rate": 4.929762218067713e-07, + "loss": 0.9277, + "step": 7595 + }, + { + "epoch": 0.7988746762722337, + "grad_norm": 2.469836920824304, + "learning_rate": 4.924785846254629e-07, + "loss": 1.0052, + "step": 7596 + }, + { + "epoch": 0.798979846714081, + "grad_norm": 
2.2285999066069544, + "learning_rate": 4.91981171298388e-07, + "loss": 1.0035, + "step": 7597 + }, + { + "epoch": 0.7990850171559283, + "grad_norm": 2.7998486974210626, + "learning_rate": 4.914839818810105e-07, + "loss": 0.9873, + "step": 7598 + }, + { + "epoch": 0.7991901875977756, + "grad_norm": 2.2023794298511308, + "learning_rate": 4.909870164287725e-07, + "loss": 0.9966, + "step": 7599 + }, + { + "epoch": 0.7992953580396229, + "grad_norm": 2.5707030773678423, + "learning_rate": 4.904902749970883e-07, + "loss": 0.9916, + "step": 7600 + }, + { + "epoch": 0.7994005284814703, + "grad_norm": 2.3180485019623442, + "learning_rate": 4.899937576413486e-07, + "loss": 0.961, + "step": 7601 + }, + { + "epoch": 0.7995056989233176, + "grad_norm": 2.3791731807882335, + "learning_rate": 4.894974644169198e-07, + "loss": 0.9891, + "step": 7602 + }, + { + "epoch": 0.7996108693651649, + "grad_norm": 2.479486951922807, + "learning_rate": 4.890013953791411e-07, + "loss": 0.9645, + "step": 7603 + }, + { + "epoch": 0.7997160398070122, + "grad_norm": 2.633404751177737, + "learning_rate": 4.885055505833291e-07, + "loss": 0.9565, + "step": 7604 + }, + { + "epoch": 0.7998212102488595, + "grad_norm": 2.684117417959632, + "learning_rate": 4.880099300847741e-07, + "loss": 0.9838, + "step": 7605 + }, + { + "epoch": 0.7999263806907069, + "grad_norm": 2.8746409642803243, + "learning_rate": 4.875145339387418e-07, + "loss": 1.036, + "step": 7606 + }, + { + "epoch": 0.8000315511325542, + "grad_norm": 2.0807055982824276, + "learning_rate": 4.870193622004729e-07, + "loss": 0.9486, + "step": 7607 + }, + { + "epoch": 0.8001367215744015, + "grad_norm": 2.471988925752685, + "learning_rate": 4.865244149251815e-07, + "loss": 0.9727, + "step": 7608 + }, + { + "epoch": 0.8002418920162488, + "grad_norm": 1.9412511005330855, + "learning_rate": 4.860296921680593e-07, + "loss": 0.9654, + "step": 7609 + }, + { + "epoch": 0.8003470624580962, + "grad_norm": 1.9790419108013764, + "learning_rate": 4.855351939842717e-07, + "loss": 0.9808, + "step": 7610 + }, + { + "epoch": 0.8004522328999435, + "grad_norm": 2.4043654013707254, + "learning_rate": 4.850409204289591e-07, + "loss": 0.9727, + "step": 7611 + }, + { + "epoch": 0.8005574033417908, + "grad_norm": 2.7262779181762666, + "learning_rate": 4.845468715572363e-07, + "loss": 0.9834, + "step": 7612 + }, + { + "epoch": 0.8006625737836381, + "grad_norm": 2.2391899912644715, + "learning_rate": 4.840530474241944e-07, + "loss": 0.9767, + "step": 7613 + }, + { + "epoch": 0.8007677442254855, + "grad_norm": 2.4174326455832977, + "learning_rate": 4.835594480848974e-07, + "loss": 0.9769, + "step": 7614 + }, + { + "epoch": 0.8008729146673328, + "grad_norm": 2.6250830234736307, + "learning_rate": 4.830660735943865e-07, + "loss": 0.9464, + "step": 7615 + }, + { + "epoch": 0.8009780851091801, + "grad_norm": 2.34149423966112, + "learning_rate": 4.825729240076768e-07, + "loss": 1.0037, + "step": 7616 + }, + { + "epoch": 0.8010832555510273, + "grad_norm": 1.8300856800483398, + "learning_rate": 4.820799993797576e-07, + "loss": 0.9735, + "step": 7617 + }, + { + "epoch": 0.8011884259928747, + "grad_norm": 1.764340560176855, + "learning_rate": 4.815872997655941e-07, + "loss": 0.9249, + "step": 7618 + }, + { + "epoch": 0.801293596434722, + "grad_norm": 2.320272163563018, + "learning_rate": 4.810948252201261e-07, + "loss": 0.9664, + "step": 7619 + }, + { + "epoch": 0.8013987668765693, + "grad_norm": 3.0912542656654733, + "learning_rate": 4.806025757982694e-07, + "loss": 0.9396, + "step": 7620 + }, + { + "epoch": 
0.8015039373184166, + "grad_norm": 2.2930284148085316, + "learning_rate": 4.80110551554912e-07, + "loss": 0.9745, + "step": 7621 + }, + { + "epoch": 0.801609107760264, + "grad_norm": 2.3498868797024106, + "learning_rate": 4.796187525449198e-07, + "loss": 1.0354, + "step": 7622 + }, + { + "epoch": 0.8017142782021113, + "grad_norm": 2.2536909100182534, + "learning_rate": 4.791271788231309e-07, + "loss": 1.011, + "step": 7623 + }, + { + "epoch": 0.8018194486439586, + "grad_norm": 2.5584982606240603, + "learning_rate": 4.786358304443603e-07, + "loss": 1.0019, + "step": 7624 + }, + { + "epoch": 0.8019246190858059, + "grad_norm": 2.356806815511279, + "learning_rate": 4.781447074633977e-07, + "loss": 0.9796, + "step": 7625 + }, + { + "epoch": 0.8020297895276532, + "grad_norm": 2.654243031206716, + "learning_rate": 4.776538099350058e-07, + "loss": 0.966, + "step": 7626 + }, + { + "epoch": 0.8021349599695006, + "grad_norm": 2.7182415623648843, + "learning_rate": 4.771631379139244e-07, + "loss": 0.9779, + "step": 7627 + }, + { + "epoch": 0.8022401304113479, + "grad_norm": 2.5630762496199044, + "learning_rate": 4.7667269145486777e-07, + "loss": 0.9753, + "step": 7628 + }, + { + "epoch": 0.8023453008531952, + "grad_norm": 2.485238287825221, + "learning_rate": 4.7618247061252286e-07, + "loss": 0.9542, + "step": 7629 + }, + { + "epoch": 0.8024504712950425, + "grad_norm": 3.0101852738819375, + "learning_rate": 4.7569247544155486e-07, + "loss": 1.0098, + "step": 7630 + }, + { + "epoch": 0.8025556417368899, + "grad_norm": 2.4862972563714694, + "learning_rate": 4.7520270599660053e-07, + "loss": 0.907, + "step": 7631 + }, + { + "epoch": 0.8026608121787372, + "grad_norm": 3.043564384568565, + "learning_rate": 4.747131623322737e-07, + "loss": 1.0138, + "step": 7632 + }, + { + "epoch": 0.8027659826205845, + "grad_norm": 2.1240214301887583, + "learning_rate": 4.7422384450316244e-07, + "loss": 1.0008, + "step": 7633 + }, + { + "epoch": 0.8028711530624318, + "grad_norm": 2.6774258034215537, + "learning_rate": 4.7373475256382973e-07, + "loss": 0.9516, + "step": 7634 + }, + { + "epoch": 0.8029763235042792, + "grad_norm": 2.5417333491458285, + "learning_rate": 4.7324588656881194e-07, + "loss": 1.0021, + "step": 7635 + }, + { + "epoch": 0.8030814939461265, + "grad_norm": 2.8488514697489786, + "learning_rate": 4.7275724657262293e-07, + "loss": 0.9775, + "step": 7636 + }, + { + "epoch": 0.8031866643879738, + "grad_norm": 2.3577591104181437, + "learning_rate": 4.7226883262974865e-07, + "loss": 1.0096, + "step": 7637 + }, + { + "epoch": 0.803291834829821, + "grad_norm": 2.6074240401396542, + "learning_rate": 4.717806447946513e-07, + "loss": 0.9777, + "step": 7638 + }, + { + "epoch": 0.8033970052716684, + "grad_norm": 3.3809444206445995, + "learning_rate": 4.712926831217685e-07, + "loss": 0.9335, + "step": 7639 + }, + { + "epoch": 0.8035021757135157, + "grad_norm": 2.0108425607516263, + "learning_rate": 4.708049476655105e-07, + "loss": 0.9809, + "step": 7640 + }, + { + "epoch": 0.803607346155363, + "grad_norm": 2.6521203726038514, + "learning_rate": 4.703174384802639e-07, + "loss": 1.0248, + "step": 7641 + }, + { + "epoch": 0.8037125165972103, + "grad_norm": 2.848291027442207, + "learning_rate": 4.69830155620391e-07, + "loss": 0.9835, + "step": 7642 + }, + { + "epoch": 0.8038176870390576, + "grad_norm": 2.475573839743065, + "learning_rate": 4.693430991402259e-07, + "loss": 0.9945, + "step": 7643 + }, + { + "epoch": 0.803922857480905, + "grad_norm": 2.345453581854106, + "learning_rate": 4.6885626909408037e-07, + "loss": 
0.9622, + "step": 7644 + }, + { + "epoch": 0.8040280279227523, + "grad_norm": 2.7478312834618808, + "learning_rate": 4.683696655362391e-07, + "loss": 0.9866, + "step": 7645 + }, + { + "epoch": 0.8041331983645996, + "grad_norm": 1.9800930016394065, + "learning_rate": 4.678832885209622e-07, + "loss": 1.0, + "step": 7646 + }, + { + "epoch": 0.804238368806447, + "grad_norm": 1.6413329542669401, + "learning_rate": 4.6739713810248443e-07, + "loss": 0.9856, + "step": 7647 + }, + { + "epoch": 0.8043435392482943, + "grad_norm": 2.414821706263618, + "learning_rate": 4.6691121433501657e-07, + "loss": 0.9904, + "step": 7648 + }, + { + "epoch": 0.8044487096901416, + "grad_norm": 2.4848547284531506, + "learning_rate": 4.664255172727411e-07, + "loss": 0.9819, + "step": 7649 + }, + { + "epoch": 0.8045538801319889, + "grad_norm": 1.9804328938918532, + "learning_rate": 4.6594004696981774e-07, + "loss": 0.9987, + "step": 7650 + }, + { + "epoch": 0.8046590505738362, + "grad_norm": 2.5014058610541983, + "learning_rate": 4.65454803480381e-07, + "loss": 0.9909, + "step": 7651 + }, + { + "epoch": 0.8047642210156836, + "grad_norm": 2.202903476738561, + "learning_rate": 4.649697868585379e-07, + "loss": 0.9658, + "step": 7652 + }, + { + "epoch": 0.8048693914575309, + "grad_norm": 2.380362604444181, + "learning_rate": 4.644849971583729e-07, + "loss": 1.0007, + "step": 7653 + }, + { + "epoch": 0.8049745618993782, + "grad_norm": 6.351090296949297, + "learning_rate": 4.6400043443394246e-07, + "loss": 1.0105, + "step": 7654 + }, + { + "epoch": 0.8050797323412255, + "grad_norm": 2.8797515711713615, + "learning_rate": 4.6351609873927976e-07, + "loss": 1.03, + "step": 7655 + }, + { + "epoch": 0.8051849027830729, + "grad_norm": 3.2146357197391846, + "learning_rate": 4.6303199012839225e-07, + "loss": 1.0333, + "step": 7656 + }, + { + "epoch": 0.8052900732249202, + "grad_norm": 2.079836576250059, + "learning_rate": 4.6254810865526187e-07, + "loss": 0.9962, + "step": 7657 + }, + { + "epoch": 0.8053952436667674, + "grad_norm": 1.649972670577275, + "learning_rate": 4.6206445437384504e-07, + "loss": 0.9932, + "step": 7658 + }, + { + "epoch": 0.8055004141086147, + "grad_norm": 1.9272657097173842, + "learning_rate": 4.615810273380722e-07, + "loss": 0.9981, + "step": 7659 + }, + { + "epoch": 0.805605584550462, + "grad_norm": 2.8001761793669577, + "learning_rate": 4.610978276018496e-07, + "loss": 0.9994, + "step": 7660 + }, + { + "epoch": 0.8057107549923094, + "grad_norm": 2.0720005349560995, + "learning_rate": 4.6061485521905815e-07, + "loss": 0.9708, + "step": 7661 + }, + { + "epoch": 0.8058159254341567, + "grad_norm": 1.9105799009272426, + "learning_rate": 4.6013211024355353e-07, + "loss": 1.0271, + "step": 7662 + }, + { + "epoch": 0.805921095876004, + "grad_norm": 1.9282586380528615, + "learning_rate": 4.5964959272916415e-07, + "loss": 1.0037, + "step": 7663 + }, + { + "epoch": 0.8060262663178513, + "grad_norm": 1.9696455842256768, + "learning_rate": 4.59167302729695e-07, + "loss": 1.0094, + "step": 7664 + }, + { + "epoch": 0.8061314367596987, + "grad_norm": 1.9870486254495214, + "learning_rate": 4.5868524029892616e-07, + "loss": 0.9653, + "step": 7665 + }, + { + "epoch": 0.806236607201546, + "grad_norm": 2.631029480979719, + "learning_rate": 4.582034054906098e-07, + "loss": 1.0113, + "step": 7666 + }, + { + "epoch": 0.8063417776433933, + "grad_norm": 1.9761375577657758, + "learning_rate": 4.5772179835847584e-07, + "loss": 0.9523, + "step": 7667 + }, + { + "epoch": 0.8064469480852406, + "grad_norm": 2.7997469746623267, + 
"learning_rate": 4.5724041895622546e-07, + "loss": 0.9985, + "step": 7668 + }, + { + "epoch": 0.806552118527088, + "grad_norm": 2.581890446856423, + "learning_rate": 4.567592673375371e-07, + "loss": 0.9612, + "step": 7669 + }, + { + "epoch": 0.8066572889689353, + "grad_norm": 2.1149257021470946, + "learning_rate": 4.56278343556063e-07, + "loss": 0.9749, + "step": 7670 + }, + { + "epoch": 0.8067624594107826, + "grad_norm": 2.547543034627653, + "learning_rate": 4.5579764766543023e-07, + "loss": 1.0053, + "step": 7671 + }, + { + "epoch": 0.8068676298526299, + "grad_norm": 2.369288975718212, + "learning_rate": 4.5531717971923904e-07, + "loss": 0.9468, + "step": 7672 + }, + { + "epoch": 0.8069728002944773, + "grad_norm": 1.9398001873861406, + "learning_rate": 4.5483693977106614e-07, + "loss": 0.9719, + "step": 7673 + }, + { + "epoch": 0.8070779707363246, + "grad_norm": 1.7047845543366074, + "learning_rate": 4.543569278744625e-07, + "loss": 0.9695, + "step": 7674 + }, + { + "epoch": 0.8071831411781719, + "grad_norm": 2.311255391335769, + "learning_rate": 4.5387714408295174e-07, + "loss": 1.021, + "step": 7675 + }, + { + "epoch": 0.8072883116200192, + "grad_norm": 2.5665068402737297, + "learning_rate": 4.5339758845003497e-07, + "loss": 0.9434, + "step": 7676 + }, + { + "epoch": 0.8073934820618666, + "grad_norm": 1.987419007348485, + "learning_rate": 4.529182610291849e-07, + "loss": 1.0189, + "step": 7677 + }, + { + "epoch": 0.8074986525037138, + "grad_norm": 2.7148221725671258, + "learning_rate": 4.5243916187385104e-07, + "loss": 1.0317, + "step": 7678 + }, + { + "epoch": 0.8076038229455611, + "grad_norm": 1.9771244563101253, + "learning_rate": 4.5196029103745736e-07, + "loss": 0.9574, + "step": 7679 + }, + { + "epoch": 0.8077089933874084, + "grad_norm": 2.5643391230827577, + "learning_rate": 4.514816485734003e-07, + "loss": 0.9949, + "step": 7680 + }, + { + "epoch": 0.8078141638292557, + "grad_norm": 2.6365658218139947, + "learning_rate": 4.5100323453505344e-07, + "loss": 0.991, + "step": 7681 + }, + { + "epoch": 0.8079193342711031, + "grad_norm": 2.0213099184862746, + "learning_rate": 4.5052504897576177e-07, + "loss": 1.0065, + "step": 7682 + }, + { + "epoch": 0.8080245047129504, + "grad_norm": 2.430948178647297, + "learning_rate": 4.500470919488495e-07, + "loss": 0.9754, + "step": 7683 + }, + { + "epoch": 0.8081296751547977, + "grad_norm": 2.432965011287217, + "learning_rate": 4.495693635076101e-07, + "loss": 0.9688, + "step": 7684 + }, + { + "epoch": 0.808234845596645, + "grad_norm": 1.9832928166525534, + "learning_rate": 4.49091863705316e-07, + "loss": 0.967, + "step": 7685 + }, + { + "epoch": 0.8083400160384924, + "grad_norm": 2.718332781399688, + "learning_rate": 4.4861459259521013e-07, + "loss": 0.9604, + "step": 7686 + }, + { + "epoch": 0.8084451864803397, + "grad_norm": 2.5869970801279707, + "learning_rate": 4.4813755023051323e-07, + "loss": 1.0265, + "step": 7687 + }, + { + "epoch": 0.808550356922187, + "grad_norm": 2.249567127749935, + "learning_rate": 4.476607366644192e-07, + "loss": 1.0034, + "step": 7688 + }, + { + "epoch": 0.8086555273640343, + "grad_norm": 1.8946857360784728, + "learning_rate": 4.4718415195009577e-07, + "loss": 1.009, + "step": 7689 + }, + { + "epoch": 0.8087606978058817, + "grad_norm": 2.775009327722139, + "learning_rate": 4.4670779614068683e-07, + "loss": 0.9728, + "step": 7690 + }, + { + "epoch": 0.808865868247729, + "grad_norm": 2.3078323532076626, + "learning_rate": 4.4623166928930846e-07, + "loss": 0.9838, + "step": 7691 + }, + { + "epoch": 
0.8089710386895763, + "grad_norm": 2.454669259509621, + "learning_rate": 4.457557714490532e-07, + "loss": 1.0128, + "step": 7692 + }, + { + "epoch": 0.8090762091314236, + "grad_norm": 2.256343596048476, + "learning_rate": 4.4528010267298803e-07, + "loss": 1.0084, + "step": 7693 + }, + { + "epoch": 0.809181379573271, + "grad_norm": 2.0591184791276738, + "learning_rate": 4.4480466301415253e-07, + "loss": 0.9828, + "step": 7694 + }, + { + "epoch": 0.8092865500151183, + "grad_norm": 2.8039610013628145, + "learning_rate": 4.4432945252556285e-07, + "loss": 1.0063, + "step": 7695 + }, + { + "epoch": 0.8093917204569656, + "grad_norm": 1.8867274438204686, + "learning_rate": 4.4385447126020695e-07, + "loss": 0.979, + "step": 7696 + }, + { + "epoch": 0.8094968908988129, + "grad_norm": 2.0342847166171265, + "learning_rate": 4.433797192710515e-07, + "loss": 0.959, + "step": 7697 + }, + { + "epoch": 0.8096020613406603, + "grad_norm": 2.143958088883331, + "learning_rate": 4.4290519661103313e-07, + "loss": 0.9682, + "step": 7698 + }, + { + "epoch": 0.8097072317825075, + "grad_norm": 2.307795459628221, + "learning_rate": 4.424309033330662e-07, + "loss": 0.9781, + "step": 7699 + }, + { + "epoch": 0.8098124022243548, + "grad_norm": 2.6737394534107755, + "learning_rate": 4.4195683949003645e-07, + "loss": 0.9879, + "step": 7700 + }, + { + "epoch": 0.8099175726662021, + "grad_norm": 2.468361380665413, + "learning_rate": 4.414830051348068e-07, + "loss": 0.9823, + "step": 7701 + }, + { + "epoch": 0.8100227431080494, + "grad_norm": 2.464813507317941, + "learning_rate": 4.410094003202134e-07, + "loss": 0.9437, + "step": 7702 + }, + { + "epoch": 0.8101279135498968, + "grad_norm": 2.439270737619183, + "learning_rate": 4.4053602509906647e-07, + "loss": 0.9409, + "step": 7703 + }, + { + "epoch": 0.8102330839917441, + "grad_norm": 2.2253177582591266, + "learning_rate": 4.4006287952415156e-07, + "loss": 1.0017, + "step": 7704 + }, + { + "epoch": 0.8103382544335914, + "grad_norm": 1.8670735694432252, + "learning_rate": 4.395899636482265e-07, + "loss": 0.9638, + "step": 7705 + }, + { + "epoch": 0.8104434248754387, + "grad_norm": 2.537791466303488, + "learning_rate": 4.391172775240277e-07, + "loss": 0.9782, + "step": 7706 + }, + { + "epoch": 0.8105485953172861, + "grad_norm": 2.2360740386296953, + "learning_rate": 4.386448212042613e-07, + "loss": 0.9666, + "step": 7707 + }, + { + "epoch": 0.8106537657591334, + "grad_norm": 1.9347825652393957, + "learning_rate": 4.38172594741611e-07, + "loss": 1.0115, + "step": 7708 + }, + { + "epoch": 0.8107589362009807, + "grad_norm": 2.644131866061192, + "learning_rate": 4.3770059818873335e-07, + "loss": 0.9877, + "step": 7709 + }, + { + "epoch": 0.810864106642828, + "grad_norm": 2.7175911960332906, + "learning_rate": 4.372288315982584e-07, + "loss": 0.9678, + "step": 7710 + }, + { + "epoch": 0.8109692770846754, + "grad_norm": 2.336165314669402, + "learning_rate": 4.3675729502279385e-07, + "loss": 1.0202, + "step": 7711 + }, + { + "epoch": 0.8110744475265227, + "grad_norm": 2.5282003663888952, + "learning_rate": 4.3628598851491816e-07, + "loss": 0.9948, + "step": 7712 + }, + { + "epoch": 0.81117961796837, + "grad_norm": 2.6721797231956863, + "learning_rate": 4.358149121271871e-07, + "loss": 1.0091, + "step": 7713 + }, + { + "epoch": 0.8112847884102173, + "grad_norm": 2.155263569819944, + "learning_rate": 4.3534406591212775e-07, + "loss": 0.9918, + "step": 7714 + }, + { + "epoch": 0.8113899588520647, + "grad_norm": 2.878491927577863, + "learning_rate": 4.3487344992224375e-07, + "loss": 
0.9162, + "step": 7715 + }, + { + "epoch": 0.811495129293912, + "grad_norm": 2.6975923680738703, + "learning_rate": 4.344030642100133e-07, + "loss": 1.0168, + "step": 7716 + }, + { + "epoch": 0.8116002997357593, + "grad_norm": 2.24948546017479, + "learning_rate": 4.3393290882788654e-07, + "loss": 0.979, + "step": 7717 + }, + { + "epoch": 0.8117054701776066, + "grad_norm": 2.2782812764649836, + "learning_rate": 4.3346298382829085e-07, + "loss": 0.9802, + "step": 7718 + }, + { + "epoch": 0.8118106406194538, + "grad_norm": 2.369431861546599, + "learning_rate": 4.329932892636249e-07, + "loss": 0.9852, + "step": 7719 + }, + { + "epoch": 0.8119158110613012, + "grad_norm": 2.819628728763317, + "learning_rate": 4.325238251862651e-07, + "loss": 1.007, + "step": 7720 + }, + { + "epoch": 0.8120209815031485, + "grad_norm": 2.480164594518704, + "learning_rate": 4.3205459164855914e-07, + "loss": 0.9727, + "step": 7721 + }, + { + "epoch": 0.8121261519449958, + "grad_norm": 2.3940648557433524, + "learning_rate": 4.315855887028314e-07, + "loss": 0.9746, + "step": 7722 + }, + { + "epoch": 0.8122313223868431, + "grad_norm": 3.393286161834988, + "learning_rate": 4.3111681640137763e-07, + "loss": 0.9979, + "step": 7723 + }, + { + "epoch": 0.8123364928286905, + "grad_norm": 2.154801256417696, + "learning_rate": 4.306482747964705e-07, + "loss": 1.0169, + "step": 7724 + }, + { + "epoch": 0.8124416632705378, + "grad_norm": 2.6404030152506657, + "learning_rate": 4.3017996394035674e-07, + "loss": 0.9906, + "step": 7725 + }, + { + "epoch": 0.8125468337123851, + "grad_norm": 2.921067892801514, + "learning_rate": 4.2971188388525515e-07, + "loss": 1.0098, + "step": 7726 + }, + { + "epoch": 0.8126520041542324, + "grad_norm": 2.151845511245677, + "learning_rate": 4.292440346833618e-07, + "loss": 0.9851, + "step": 7727 + }, + { + "epoch": 0.8127571745960798, + "grad_norm": 2.1131586999724665, + "learning_rate": 4.287764163868441e-07, + "loss": 0.9912, + "step": 7728 + }, + { + "epoch": 0.8128623450379271, + "grad_norm": 2.540790376974774, + "learning_rate": 4.283090290478459e-07, + "loss": 0.9808, + "step": 7729 + }, + { + "epoch": 0.8129675154797744, + "grad_norm": 2.5175938968858476, + "learning_rate": 4.278418727184849e-07, + "loss": 0.9565, + "step": 7730 + }, + { + "epoch": 0.8130726859216217, + "grad_norm": 3.2981499359448265, + "learning_rate": 4.273749474508515e-07, + "loss": 0.9686, + "step": 7731 + }, + { + "epoch": 0.8131778563634691, + "grad_norm": 2.9415949773488514, + "learning_rate": 4.2690825329701313e-07, + "loss": 0.9655, + "step": 7732 + }, + { + "epoch": 0.8132830268053164, + "grad_norm": 2.7939627395608104, + "learning_rate": 4.2644179030900734e-07, + "loss": 0.9893, + "step": 7733 + }, + { + "epoch": 0.8133881972471637, + "grad_norm": 2.668181131671682, + "learning_rate": 4.2597555853885117e-07, + "loss": 1.0038, + "step": 7734 + }, + { + "epoch": 0.813493367689011, + "grad_norm": 2.065827171733609, + "learning_rate": 4.2550955803853125e-07, + "loss": 0.9932, + "step": 7735 + }, + { + "epoch": 0.8135985381308584, + "grad_norm": 2.4982865299615513, + "learning_rate": 4.250437888600115e-07, + "loss": 0.9941, + "step": 7736 + }, + { + "epoch": 0.8137037085727057, + "grad_norm": 2.9611235171473536, + "learning_rate": 4.2457825105522736e-07, + "loss": 0.9584, + "step": 7737 + }, + { + "epoch": 0.813808879014553, + "grad_norm": 1.856983927521474, + "learning_rate": 4.241129446760905e-07, + "loss": 0.9412, + "step": 7738 + }, + { + "epoch": 0.8139140494564002, + "grad_norm": 2.298057826980122, + 
"learning_rate": 4.2364786977448723e-07, + "loss": 0.977, + "step": 7739 + }, + { + "epoch": 0.8140192198982475, + "grad_norm": 2.857504158227023, + "learning_rate": 4.231830264022757e-07, + "loss": 0.9809, + "step": 7740 + }, + { + "epoch": 0.8141243903400949, + "grad_norm": 1.9125168283750302, + "learning_rate": 4.227184146112903e-07, + "loss": 0.9547, + "step": 7741 + }, + { + "epoch": 0.8142295607819422, + "grad_norm": 2.3292724206858115, + "learning_rate": 4.222540344533374e-07, + "loss": 0.9752, + "step": 7742 + }, + { + "epoch": 0.8143347312237895, + "grad_norm": 2.8514614953686768, + "learning_rate": 4.2178988598020164e-07, + "loss": 0.9877, + "step": 7743 + }, + { + "epoch": 0.8144399016656368, + "grad_norm": 2.3444264375718555, + "learning_rate": 4.2132596924363666e-07, + "loss": 0.9912, + "step": 7744 + }, + { + "epoch": 0.8145450721074842, + "grad_norm": 2.51703936814917, + "learning_rate": 4.208622842953747e-07, + "loss": 0.9761, + "step": 7745 + }, + { + "epoch": 0.8146502425493315, + "grad_norm": 2.401729234637294, + "learning_rate": 4.2039883118711935e-07, + "loss": 0.9613, + "step": 7746 + }, + { + "epoch": 0.8147554129911788, + "grad_norm": 2.10443613373023, + "learning_rate": 4.1993560997054787e-07, + "loss": 0.9799, + "step": 7747 + }, + { + "epoch": 0.8148605834330261, + "grad_norm": 2.7838525850839613, + "learning_rate": 4.194726206973157e-07, + "loss": 0.958, + "step": 7748 + }, + { + "epoch": 0.8149657538748735, + "grad_norm": 2.471550286019982, + "learning_rate": 4.1900986341904787e-07, + "loss": 0.957, + "step": 7749 + }, + { + "epoch": 0.8150709243167208, + "grad_norm": 2.6262073326290087, + "learning_rate": 4.1854733818734646e-07, + "loss": 0.9636, + "step": 7750 + }, + { + "epoch": 0.8151760947585681, + "grad_norm": 1.7860526138744137, + "learning_rate": 4.180850450537857e-07, + "loss": 0.9505, + "step": 7751 + }, + { + "epoch": 0.8152812652004154, + "grad_norm": 2.3407264097231866, + "learning_rate": 4.1762298406991503e-07, + "loss": 0.9565, + "step": 7752 + }, + { + "epoch": 0.8153864356422628, + "grad_norm": 2.150623638303471, + "learning_rate": 4.1716115528725896e-07, + "loss": 0.9697, + "step": 7753 + }, + { + "epoch": 0.8154916060841101, + "grad_norm": 3.2525410184474173, + "learning_rate": 4.166995587573133e-07, + "loss": 1.0229, + "step": 7754 + }, + { + "epoch": 0.8155967765259574, + "grad_norm": 2.1671249003463577, + "learning_rate": 4.1623819453155133e-07, + "loss": 0.9757, + "step": 7755 + }, + { + "epoch": 0.8157019469678047, + "grad_norm": 2.097060159238962, + "learning_rate": 4.157770626614166e-07, + "loss": 0.9595, + "step": 7756 + }, + { + "epoch": 0.8158071174096521, + "grad_norm": 2.5745070957454947, + "learning_rate": 4.153161631983313e-07, + "loss": 0.9547, + "step": 7757 + }, + { + "epoch": 0.8159122878514994, + "grad_norm": 2.6795685146216854, + "learning_rate": 4.1485549619368765e-07, + "loss": 0.9458, + "step": 7758 + }, + { + "epoch": 0.8160174582933467, + "grad_norm": 2.462379782717814, + "learning_rate": 4.1439506169885484e-07, + "loss": 0.9906, + "step": 7759 + }, + { + "epoch": 0.8161226287351939, + "grad_norm": 2.4584203736359918, + "learning_rate": 4.139348597651735e-07, + "loss": 0.9678, + "step": 7760 + }, + { + "epoch": 0.8162277991770412, + "grad_norm": 2.373737093242701, + "learning_rate": 4.134748904439603e-07, + "loss": 0.9874, + "step": 7761 + }, + { + "epoch": 0.8163329696188886, + "grad_norm": 2.682893399793693, + "learning_rate": 4.1301515378650654e-07, + "loss": 0.9744, + "step": 7762 + }, + { + "epoch": 
0.8164381400607359, + "grad_norm": 2.6158066777998443, + "learning_rate": 4.1255564984407445e-07, + "loss": 0.9914, + "step": 7763 + }, + { + "epoch": 0.8165433105025832, + "grad_norm": 2.320463099049456, + "learning_rate": 4.120963786679039e-07, + "loss": 0.9913, + "step": 7764 + }, + { + "epoch": 0.8166484809444305, + "grad_norm": 2.584830907970578, + "learning_rate": 4.116373403092058e-07, + "loss": 0.9571, + "step": 7765 + }, + { + "epoch": 0.8167536513862779, + "grad_norm": 2.3221349508899425, + "learning_rate": 4.111785348191674e-07, + "loss": 0.9629, + "step": 7766 + }, + { + "epoch": 0.8168588218281252, + "grad_norm": 2.3555780902934207, + "learning_rate": 4.1071996224894937e-07, + "loss": 0.9926, + "step": 7767 + }, + { + "epoch": 0.8169639922699725, + "grad_norm": 2.4447457069951763, + "learning_rate": 4.1026162264968526e-07, + "loss": 0.9714, + "step": 7768 + }, + { + "epoch": 0.8170691627118198, + "grad_norm": 2.5930949291002143, + "learning_rate": 4.098035160724842e-07, + "loss": 0.9894, + "step": 7769 + }, + { + "epoch": 0.8171743331536672, + "grad_norm": 2.4978191323631975, + "learning_rate": 4.093456425684275e-07, + "loss": 0.9675, + "step": 7770 + }, + { + "epoch": 0.8172795035955145, + "grad_norm": 1.763876023991041, + "learning_rate": 4.0888800218857326e-07, + "loss": 0.9674, + "step": 7771 + }, + { + "epoch": 0.8173846740373618, + "grad_norm": 2.562886748389969, + "learning_rate": 4.0843059498395063e-07, + "loss": 0.9959, + "step": 7772 + }, + { + "epoch": 0.8174898444792091, + "grad_norm": 3.542366051507965, + "learning_rate": 4.0797342100556526e-07, + "loss": 0.9939, + "step": 7773 + }, + { + "epoch": 0.8175950149210565, + "grad_norm": 2.978184338970861, + "learning_rate": 4.075164803043941e-07, + "loss": 1.0054, + "step": 7774 + }, + { + "epoch": 0.8177001853629038, + "grad_norm": 2.0685694903463263, + "learning_rate": 4.070597729313905e-07, + "loss": 0.9819, + "step": 7775 + }, + { + "epoch": 0.8178053558047511, + "grad_norm": 2.4513268899881755, + "learning_rate": 4.066032989374813e-07, + "loss": 0.9578, + "step": 7776 + }, + { + "epoch": 0.8179105262465984, + "grad_norm": 2.789567940068473, + "learning_rate": 4.061470583735655e-07, + "loss": 0.9826, + "step": 7777 + }, + { + "epoch": 0.8180156966884458, + "grad_norm": 2.2071024352563495, + "learning_rate": 4.056910512905193e-07, + "loss": 0.9408, + "step": 7778 + }, + { + "epoch": 0.8181208671302931, + "grad_norm": 2.462265054582973, + "learning_rate": 4.052352777391885e-07, + "loss": 0.9993, + "step": 7779 + }, + { + "epoch": 0.8182260375721403, + "grad_norm": 2.1507474468996537, + "learning_rate": 4.0477973777039854e-07, + "loss": 0.9428, + "step": 7780 + }, + { + "epoch": 0.8183312080139876, + "grad_norm": 1.9943322945072193, + "learning_rate": 4.0432443143494373e-07, + "loss": 0.9267, + "step": 7781 + }, + { + "epoch": 0.818436378455835, + "grad_norm": 2.4651273882986757, + "learning_rate": 4.03869358783594e-07, + "loss": 0.9777, + "step": 7782 + }, + { + "epoch": 0.8185415488976823, + "grad_norm": 2.3629386718998173, + "learning_rate": 4.0341451986709494e-07, + "loss": 1.0024, + "step": 7783 + }, + { + "epoch": 0.8186467193395296, + "grad_norm": 1.98682520235926, + "learning_rate": 4.029599147361624e-07, + "loss": 1.0093, + "step": 7784 + }, + { + "epoch": 0.8187518897813769, + "grad_norm": 2.6681064116513826, + "learning_rate": 4.0250554344149094e-07, + "loss": 0.998, + "step": 7785 + }, + { + "epoch": 0.8188570602232242, + "grad_norm": 2.0258952155927568, + "learning_rate": 4.020514060337447e-07, + "loss": 
0.9847, + "step": 7786 + }, + { + "epoch": 0.8189622306650716, + "grad_norm": 2.2616741774571074, + "learning_rate": 4.015975025635646e-07, + "loss": 0.9739, + "step": 7787 + }, + { + "epoch": 0.8190674011069189, + "grad_norm": 2.946241928430217, + "learning_rate": 4.011438330815634e-07, + "loss": 0.9951, + "step": 7788 + }, + { + "epoch": 0.8191725715487662, + "grad_norm": 2.6286525561939342, + "learning_rate": 4.0069039763832946e-07, + "loss": 0.9628, + "step": 7789 + }, + { + "epoch": 0.8192777419906135, + "grad_norm": 2.3544960061523943, + "learning_rate": 4.0023719628442454e-07, + "loss": 0.9692, + "step": 7790 + }, + { + "epoch": 0.8193829124324609, + "grad_norm": 2.447301762694223, + "learning_rate": 3.997842290703835e-07, + "loss": 0.988, + "step": 7791 + }, + { + "epoch": 0.8194880828743082, + "grad_norm": 2.0937992451075864, + "learning_rate": 3.9933149604671624e-07, + "loss": 0.9527, + "step": 7792 + }, + { + "epoch": 0.8195932533161555, + "grad_norm": 2.5017080295162653, + "learning_rate": 3.9887899726390455e-07, + "loss": 0.9585, + "step": 7793 + }, + { + "epoch": 0.8196984237580028, + "grad_norm": 2.1071679855154173, + "learning_rate": 3.9842673277240813e-07, + "loss": 0.9637, + "step": 7794 + }, + { + "epoch": 0.8198035941998502, + "grad_norm": 2.6444030746408793, + "learning_rate": 3.97974702622656e-07, + "loss": 0.986, + "step": 7795 + }, + { + "epoch": 0.8199087646416975, + "grad_norm": 2.2415822506715366, + "learning_rate": 3.975229068650541e-07, + "loss": 0.9998, + "step": 7796 + }, + { + "epoch": 0.8200139350835448, + "grad_norm": 2.8744646198567714, + "learning_rate": 3.970713455499803e-07, + "loss": 0.9492, + "step": 7797 + }, + { + "epoch": 0.8201191055253921, + "grad_norm": 2.0835911020378837, + "learning_rate": 3.9662001872778746e-07, + "loss": 0.9694, + "step": 7798 + }, + { + "epoch": 0.8202242759672395, + "grad_norm": 2.0735661240104832, + "learning_rate": 3.961689264488025e-07, + "loss": 0.9942, + "step": 7799 + }, + { + "epoch": 0.8203294464090867, + "grad_norm": 2.132869834888772, + "learning_rate": 3.9571806876332527e-07, + "loss": 0.9882, + "step": 7800 + }, + { + "epoch": 0.820434616850934, + "grad_norm": 3.1197931626041218, + "learning_rate": 3.9526744572163016e-07, + "loss": 0.9942, + "step": 7801 + }, + { + "epoch": 0.8205397872927813, + "grad_norm": 2.4680929826487352, + "learning_rate": 3.9481705737396457e-07, + "loss": 1.01, + "step": 7802 + }, + { + "epoch": 0.8206449577346286, + "grad_norm": 3.1575237316281837, + "learning_rate": 3.943669037705508e-07, + "loss": 1.0134, + "step": 7803 + }, + { + "epoch": 0.820750128176476, + "grad_norm": 2.5835775583665943, + "learning_rate": 3.939169849615851e-07, + "loss": 1.0241, + "step": 7804 + }, + { + "epoch": 0.8208552986183233, + "grad_norm": 2.231742487138655, + "learning_rate": 3.9346730099723533e-07, + "loss": 0.9715, + "step": 7805 + }, + { + "epoch": 0.8209604690601706, + "grad_norm": 2.6521036190376783, + "learning_rate": 3.930178519276462e-07, + "loss": 1.0162, + "step": 7806 + }, + { + "epoch": 0.8210656395020179, + "grad_norm": 2.867026937723473, + "learning_rate": 3.925686378029331e-07, + "loss": 0.9818, + "step": 7807 + }, + { + "epoch": 0.8211708099438653, + "grad_norm": 2.263917199281129, + "learning_rate": 3.921196586731893e-07, + "loss": 0.9731, + "step": 7808 + }, + { + "epoch": 0.8212759803857126, + "grad_norm": 2.3946757029431684, + "learning_rate": 3.916709145884773e-07, + "loss": 0.9887, + "step": 7809 + }, + { + "epoch": 0.8213811508275599, + "grad_norm": 2.280792937113479, + 
"learning_rate": 3.912224055988373e-07, + "loss": 0.9711, + "step": 7810 + }, + { + "epoch": 0.8214863212694072, + "grad_norm": 2.3686260861759547, + "learning_rate": 3.907741317542799e-07, + "loss": 0.9954, + "step": 7811 + }, + { + "epoch": 0.8215914917112546, + "grad_norm": 2.6674673001111167, + "learning_rate": 3.903260931047917e-07, + "loss": 0.9966, + "step": 7812 + }, + { + "epoch": 0.8216966621531019, + "grad_norm": 2.7598548320790517, + "learning_rate": 3.898782897003334e-07, + "loss": 0.9814, + "step": 7813 + }, + { + "epoch": 0.8218018325949492, + "grad_norm": 2.464756135941767, + "learning_rate": 3.894307215908372e-07, + "loss": 1.0448, + "step": 7814 + }, + { + "epoch": 0.8219070030367965, + "grad_norm": 1.6827932013253648, + "learning_rate": 3.889833888262115e-07, + "loss": 0.9651, + "step": 7815 + }, + { + "epoch": 0.8220121734786439, + "grad_norm": 2.3519562279418014, + "learning_rate": 3.8853629145633646e-07, + "loss": 1.0091, + "step": 7816 + }, + { + "epoch": 0.8221173439204912, + "grad_norm": 1.9766869386337151, + "learning_rate": 3.880894295310672e-07, + "loss": 0.9603, + "step": 7817 + }, + { + "epoch": 0.8222225143623385, + "grad_norm": 2.3412319875825585, + "learning_rate": 3.87642803100233e-07, + "loss": 0.987, + "step": 7818 + }, + { + "epoch": 0.8223276848041858, + "grad_norm": 2.2093606730393156, + "learning_rate": 3.8719641221363514e-07, + "loss": 0.9679, + "step": 7819 + }, + { + "epoch": 0.8224328552460332, + "grad_norm": 3.16030150590197, + "learning_rate": 3.8675025692105015e-07, + "loss": 0.9895, + "step": 7820 + }, + { + "epoch": 0.8225380256878804, + "grad_norm": 2.3349525577022585, + "learning_rate": 3.863043372722278e-07, + "loss": 1.0192, + "step": 7821 + }, + { + "epoch": 0.8226431961297277, + "grad_norm": 2.6371286764996618, + "learning_rate": 3.858586533168923e-07, + "loss": 0.96, + "step": 7822 + }, + { + "epoch": 0.822748366571575, + "grad_norm": 2.30573441792884, + "learning_rate": 3.8541320510473963e-07, + "loss": 0.9927, + "step": 7823 + }, + { + "epoch": 0.8228535370134223, + "grad_norm": 2.873334351698845, + "learning_rate": 3.849679926854416e-07, + "loss": 0.9672, + "step": 7824 + }, + { + "epoch": 0.8229587074552697, + "grad_norm": 2.4655038088603978, + "learning_rate": 3.845230161086422e-07, + "loss": 1.0114, + "step": 7825 + }, + { + "epoch": 0.823063877897117, + "grad_norm": 2.277321669840928, + "learning_rate": 3.840782754239597e-07, + "loss": 0.9895, + "step": 7826 + }, + { + "epoch": 0.8231690483389643, + "grad_norm": 2.4885441680443154, + "learning_rate": 3.8363377068098756e-07, + "loss": 1.0021, + "step": 7827 + }, + { + "epoch": 0.8232742187808116, + "grad_norm": 2.581402189491797, + "learning_rate": 3.831895019292897e-07, + "loss": 0.9841, + "step": 7828 + }, + { + "epoch": 0.823379389222659, + "grad_norm": 2.6225815215802952, + "learning_rate": 3.8274546921840653e-07, + "loss": 0.98, + "step": 7829 + }, + { + "epoch": 0.8234845596645063, + "grad_norm": 2.672257972613925, + "learning_rate": 3.823016725978507e-07, + "loss": 0.917, + "step": 7830 + }, + { + "epoch": 0.8235897301063536, + "grad_norm": 2.6941147355501207, + "learning_rate": 3.8185811211710997e-07, + "loss": 0.9917, + "step": 7831 + }, + { + "epoch": 0.8236949005482009, + "grad_norm": 2.5181042464817778, + "learning_rate": 3.814147878256441e-07, + "loss": 0.9903, + "step": 7832 + }, + { + "epoch": 0.8238000709900483, + "grad_norm": 2.982863876704575, + "learning_rate": 3.809716997728863e-07, + "loss": 0.9598, + "step": 7833 + }, + { + "epoch": 0.8239052414318956, + 
"grad_norm": 2.4829472094226075, + "learning_rate": 3.805288480082453e-07, + "loss": 1.0105, + "step": 7834 + }, + { + "epoch": 0.8240104118737429, + "grad_norm": 2.262790895769715, + "learning_rate": 3.800862325811025e-07, + "loss": 0.9802, + "step": 7835 + }, + { + "epoch": 0.8241155823155902, + "grad_norm": 2.3779766898281287, + "learning_rate": 3.796438535408134e-07, + "loss": 0.9596, + "step": 7836 + }, + { + "epoch": 0.8242207527574376, + "grad_norm": 2.5486165785150594, + "learning_rate": 3.792017109367052e-07, + "loss": 0.9497, + "step": 7837 + }, + { + "epoch": 0.8243259231992849, + "grad_norm": 2.2377493045016306, + "learning_rate": 3.787598048180821e-07, + "loss": 0.9521, + "step": 7838 + }, + { + "epoch": 0.8244310936411322, + "grad_norm": 2.560942533796768, + "learning_rate": 3.7831813523421856e-07, + "loss": 0.9648, + "step": 7839 + }, + { + "epoch": 0.8245362640829795, + "grad_norm": 3.155199120538444, + "learning_rate": 3.7787670223436464e-07, + "loss": 1.0093, + "step": 7840 + }, + { + "epoch": 0.8246414345248267, + "grad_norm": 2.5558594549066047, + "learning_rate": 3.7743550586774425e-07, + "loss": 0.9772, + "step": 7841 + }, + { + "epoch": 0.8247466049666741, + "grad_norm": 3.286085851170872, + "learning_rate": 3.7699454618355306e-07, + "loss": 1.0098, + "step": 7842 + }, + { + "epoch": 0.8248517754085214, + "grad_norm": 3.071919558180755, + "learning_rate": 3.765538232309621e-07, + "loss": 0.978, + "step": 7843 + }, + { + "epoch": 0.8249569458503687, + "grad_norm": 1.9611542629163186, + "learning_rate": 3.7611333705911526e-07, + "loss": 0.9318, + "step": 7844 + }, + { + "epoch": 0.825062116292216, + "grad_norm": 2.668955460154382, + "learning_rate": 3.756730877171308e-07, + "loss": 0.9825, + "step": 7845 + }, + { + "epoch": 0.8251672867340634, + "grad_norm": 2.097041197652859, + "learning_rate": 3.752330752540989e-07, + "loss": 0.951, + "step": 7846 + }, + { + "epoch": 0.8252724571759107, + "grad_norm": 2.4381455759333552, + "learning_rate": 3.747932997190853e-07, + "loss": 0.9731, + "step": 7847 + }, + { + "epoch": 0.825377627617758, + "grad_norm": 1.9389944794723009, + "learning_rate": 3.7435376116112753e-07, + "loss": 0.9945, + "step": 7848 + }, + { + "epoch": 0.8254827980596053, + "grad_norm": 2.6544698505058437, + "learning_rate": 3.7391445962923824e-07, + "loss": 0.9866, + "step": 7849 + }, + { + "epoch": 0.8255879685014527, + "grad_norm": 2.205561586287741, + "learning_rate": 3.734753951724029e-07, + "loss": 0.9731, + "step": 7850 + }, + { + "epoch": 0.8256931389433, + "grad_norm": 2.4950986903703165, + "learning_rate": 3.7303656783958e-07, + "loss": 0.9696, + "step": 7851 + }, + { + "epoch": 0.8257983093851473, + "grad_norm": 2.824630018964161, + "learning_rate": 3.7259797767970255e-07, + "loss": 1.0108, + "step": 7852 + }, + { + "epoch": 0.8259034798269946, + "grad_norm": 1.5885466979887306, + "learning_rate": 3.7215962474167754e-07, + "loss": 0.9861, + "step": 7853 + }, + { + "epoch": 0.826008650268842, + "grad_norm": 2.407273379719963, + "learning_rate": 3.717215090743834e-07, + "loss": 0.9186, + "step": 7854 + }, + { + "epoch": 0.8261138207106893, + "grad_norm": 2.108109969570331, + "learning_rate": 3.7128363072667455e-07, + "loss": 0.9826, + "step": 7855 + }, + { + "epoch": 0.8262189911525366, + "grad_norm": 2.125810712110948, + "learning_rate": 3.708459897473768e-07, + "loss": 0.9589, + "step": 7856 + }, + { + "epoch": 0.8263241615943839, + "grad_norm": 2.276739627965222, + "learning_rate": 3.70408586185291e-07, + "loss": 0.9668, + "step": 7857 + }, + { + 
"epoch": 0.8264293320362313, + "grad_norm": 1.9368576617583315, + "learning_rate": 3.69971420089191e-07, + "loss": 1.0004, + "step": 7858 + }, + { + "epoch": 0.8265345024780786, + "grad_norm": 2.122279799459209, + "learning_rate": 3.69534491507825e-07, + "loss": 0.9629, + "step": 7859 + }, + { + "epoch": 0.8266396729199259, + "grad_norm": 3.094014832454307, + "learning_rate": 3.6909780048991264e-07, + "loss": 0.9908, + "step": 7860 + }, + { + "epoch": 0.8267448433617731, + "grad_norm": 2.759129120062527, + "learning_rate": 3.6866134708414964e-07, + "loss": 0.98, + "step": 7861 + }, + { + "epoch": 0.8268500138036204, + "grad_norm": 2.276400624904361, + "learning_rate": 3.682251313392024e-07, + "loss": 1.0001, + "step": 7862 + }, + { + "epoch": 0.8269551842454678, + "grad_norm": 1.7834147878732247, + "learning_rate": 3.677891533037134e-07, + "loss": 0.9976, + "step": 7863 + }, + { + "epoch": 0.8270603546873151, + "grad_norm": 2.167298852018403, + "learning_rate": 3.6735341302629816e-07, + "loss": 0.991, + "step": 7864 + }, + { + "epoch": 0.8271655251291624, + "grad_norm": 2.480960772681218, + "learning_rate": 3.6691791055554344e-07, + "loss": 0.9629, + "step": 7865 + }, + { + "epoch": 0.8272706955710097, + "grad_norm": 2.448579622721506, + "learning_rate": 3.664826459400123e-07, + "loss": 0.9664, + "step": 7866 + }, + { + "epoch": 0.8273758660128571, + "grad_norm": 2.2157167037720105, + "learning_rate": 3.660476192282403e-07, + "loss": 0.9307, + "step": 7867 + }, + { + "epoch": 0.8274810364547044, + "grad_norm": 2.4536996567992246, + "learning_rate": 3.656128304687354e-07, + "loss": 0.9936, + "step": 7868 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 3.368456460878063, + "learning_rate": 3.6517827970998095e-07, + "loss": 0.9781, + "step": 7869 + }, + { + "epoch": 0.827691377338399, + "grad_norm": 1.71414901522867, + "learning_rate": 3.647439670004316e-07, + "loss": 0.9807, + "step": 7870 + }, + { + "epoch": 0.8277965477802464, + "grad_norm": 2.97285183356878, + "learning_rate": 3.6430989238851717e-07, + "loss": 0.9555, + "step": 7871 + }, + { + "epoch": 0.8279017182220937, + "grad_norm": 2.96702239531014, + "learning_rate": 3.6387605592264036e-07, + "loss": 0.9843, + "step": 7872 + }, + { + "epoch": 0.828006888663941, + "grad_norm": 2.166596883395088, + "learning_rate": 3.63442457651178e-07, + "loss": 0.9585, + "step": 7873 + }, + { + "epoch": 0.8281120591057883, + "grad_norm": 2.5899405106290123, + "learning_rate": 3.6300909762247813e-07, + "loss": 0.9518, + "step": 7874 + }, + { + "epoch": 0.8282172295476357, + "grad_norm": 2.2108133293110157, + "learning_rate": 3.625759758848649e-07, + "loss": 0.9702, + "step": 7875 + }, + { + "epoch": 0.828322399989483, + "grad_norm": 1.7475000460222763, + "learning_rate": 3.621430924866348e-07, + "loss": 0.9557, + "step": 7876 + }, + { + "epoch": 0.8284275704313303, + "grad_norm": 2.9751242328091165, + "learning_rate": 3.617104474760569e-07, + "loss": 0.9627, + "step": 7877 + }, + { + "epoch": 0.8285327408731776, + "grad_norm": 2.2824598602628066, + "learning_rate": 3.612780409013755e-07, + "loss": 0.9557, + "step": 7878 + }, + { + "epoch": 0.828637911315025, + "grad_norm": 2.560107410245565, + "learning_rate": 3.6084587281080605e-07, + "loss": 0.9479, + "step": 7879 + }, + { + "epoch": 0.8287430817568723, + "grad_norm": 2.6404332069080954, + "learning_rate": 3.604139432525394e-07, + "loss": 1.0084, + "step": 7880 + }, + { + "epoch": 0.8288482521987196, + "grad_norm": 3.5468570151691994, + "learning_rate": 3.59982252274739e-07, + "loss": 
0.9756, + "step": 7881 + }, + { + "epoch": 0.8289534226405668, + "grad_norm": 3.1714421863078326, + "learning_rate": 3.595507999255421e-07, + "loss": 0.9722, + "step": 7882 + }, + { + "epoch": 0.8290585930824141, + "grad_norm": 1.7670191648623403, + "learning_rate": 3.591195862530586e-07, + "loss": 0.9664, + "step": 7883 + }, + { + "epoch": 0.8291637635242615, + "grad_norm": 2.8058619365312394, + "learning_rate": 3.586886113053717e-07, + "loss": 1.0019, + "step": 7884 + }, + { + "epoch": 0.8292689339661088, + "grad_norm": 2.274264569970648, + "learning_rate": 3.5825787513053905e-07, + "loss": 1.001, + "step": 7885 + }, + { + "epoch": 0.8293741044079561, + "grad_norm": 2.1691351608255953, + "learning_rate": 3.5782737777659086e-07, + "loss": 0.9753, + "step": 7886 + }, + { + "epoch": 0.8294792748498034, + "grad_norm": 3.0588797917567683, + "learning_rate": 3.573971192915315e-07, + "loss": 0.9733, + "step": 7887 + }, + { + "epoch": 0.8295844452916508, + "grad_norm": 2.2148340995812124, + "learning_rate": 3.569670997233371e-07, + "loss": 0.9621, + "step": 7888 + }, + { + "epoch": 0.8296896157334981, + "grad_norm": 3.2864299842355895, + "learning_rate": 3.5653731911995893e-07, + "loss": 1.0221, + "step": 7889 + }, + { + "epoch": 0.8297947861753454, + "grad_norm": 2.4206938864595036, + "learning_rate": 3.5610777752932117e-07, + "loss": 0.9424, + "step": 7890 + }, + { + "epoch": 0.8298999566171927, + "grad_norm": 2.524736186816006, + "learning_rate": 3.5567847499932004e-07, + "loss": 1.0228, + "step": 7891 + }, + { + "epoch": 0.8300051270590401, + "grad_norm": 2.8928862446277224, + "learning_rate": 3.552494115778271e-07, + "loss": 0.9781, + "step": 7892 + }, + { + "epoch": 0.8301102975008874, + "grad_norm": 1.951995142147348, + "learning_rate": 3.5482058731268503e-07, + "loss": 0.9185, + "step": 7893 + }, + { + "epoch": 0.8302154679427347, + "grad_norm": 1.5618142840999194, + "learning_rate": 3.5439200225171193e-07, + "loss": 0.8677, + "step": 7894 + }, + { + "epoch": 0.830320638384582, + "grad_norm": 2.567017305726929, + "learning_rate": 3.539636564426985e-07, + "loss": 1.0362, + "step": 7895 + }, + { + "epoch": 0.8304258088264294, + "grad_norm": 2.357574361477999, + "learning_rate": 3.535355499334087e-07, + "loss": 1.0022, + "step": 7896 + }, + { + "epoch": 0.8305309792682767, + "grad_norm": 2.3855821109988904, + "learning_rate": 3.5310768277157875e-07, + "loss": 0.9882, + "step": 7897 + }, + { + "epoch": 0.830636149710124, + "grad_norm": 2.4659785940655357, + "learning_rate": 3.5268005500492004e-07, + "loss": 0.9991, + "step": 7898 + }, + { + "epoch": 0.8307413201519713, + "grad_norm": 2.8955739385288086, + "learning_rate": 3.522526666811166e-07, + "loss": 0.9721, + "step": 7899 + }, + { + "epoch": 0.8308464905938187, + "grad_norm": 2.2199143818637275, + "learning_rate": 3.518255178478247e-07, + "loss": 1.028, + "step": 7900 + }, + { + "epoch": 0.830951661035666, + "grad_norm": 2.198553962750598, + "learning_rate": 3.513986085526755e-07, + "loss": 0.949, + "step": 7901 + }, + { + "epoch": 0.8310568314775132, + "grad_norm": 2.6375318358953828, + "learning_rate": 3.50971938843272e-07, + "loss": 1.0045, + "step": 7902 + }, + { + "epoch": 0.8311620019193605, + "grad_norm": 2.2539176960571585, + "learning_rate": 3.5054550876719134e-07, + "loss": 0.9797, + "step": 7903 + }, + { + "epoch": 0.8312671723612078, + "grad_norm": 2.621497832230979, + "learning_rate": 3.5011931837198476e-07, + "loss": 0.9706, + "step": 7904 + }, + { + "epoch": 0.8313723428030552, + "grad_norm": 2.408630576835979, + 
"learning_rate": 3.496933677051745e-07, + "loss": 1.0101, + "step": 7905 + }, + { + "epoch": 0.8314775132449025, + "grad_norm": 2.5560399097177635, + "learning_rate": 3.4926765681425835e-07, + "loss": 1.0304, + "step": 7906 + }, + { + "epoch": 0.8315826836867498, + "grad_norm": 2.6823556900283565, + "learning_rate": 3.488421857467053e-07, + "loss": 0.9543, + "step": 7907 + }, + { + "epoch": 0.8316878541285971, + "grad_norm": 1.7262174828332262, + "learning_rate": 3.484169545499594e-07, + "loss": 1.0048, + "step": 7908 + }, + { + "epoch": 0.8317930245704445, + "grad_norm": 2.243625820617893, + "learning_rate": 3.479919632714374e-07, + "loss": 0.9714, + "step": 7909 + }, + { + "epoch": 0.8318981950122918, + "grad_norm": 2.2906586964812985, + "learning_rate": 3.475672119585291e-07, + "loss": 0.9659, + "step": 7910 + }, + { + "epoch": 0.8320033654541391, + "grad_norm": 2.6417368186284964, + "learning_rate": 3.4714270065859673e-07, + "loss": 0.9472, + "step": 7911 + }, + { + "epoch": 0.8321085358959864, + "grad_norm": 3.0260483510274017, + "learning_rate": 3.4671842941897764e-07, + "loss": 1.0046, + "step": 7912 + }, + { + "epoch": 0.8322137063378338, + "grad_norm": 2.59174736426387, + "learning_rate": 3.4629439828698125e-07, + "loss": 1.0117, + "step": 7913 + }, + { + "epoch": 0.8323188767796811, + "grad_norm": 2.855621058386072, + "learning_rate": 3.4587060730988943e-07, + "loss": 0.9702, + "step": 7914 + }, + { + "epoch": 0.8324240472215284, + "grad_norm": 2.3861443308093295, + "learning_rate": 3.4544705653495936e-07, + "loss": 0.9683, + "step": 7915 + }, + { + "epoch": 0.8325292176633757, + "grad_norm": 2.956455393722912, + "learning_rate": 3.4502374600941915e-07, + "loss": 0.9796, + "step": 7916 + }, + { + "epoch": 0.8326343881052231, + "grad_norm": 3.3272393954310275, + "learning_rate": 3.446006757804715e-07, + "loss": 0.9626, + "step": 7917 + }, + { + "epoch": 0.8327395585470704, + "grad_norm": 3.117936160153246, + "learning_rate": 3.44177845895293e-07, + "loss": 1.0031, + "step": 7918 + }, + { + "epoch": 0.8328447289889177, + "grad_norm": 2.3984852683425406, + "learning_rate": 3.4375525640103103e-07, + "loss": 0.9611, + "step": 7919 + }, + { + "epoch": 0.832949899430765, + "grad_norm": 2.548645604067526, + "learning_rate": 3.4333290734480885e-07, + "loss": 0.9952, + "step": 7920 + }, + { + "epoch": 0.8330550698726124, + "grad_norm": 2.5500483837731815, + "learning_rate": 3.4291079877371984e-07, + "loss": 0.9917, + "step": 7921 + }, + { + "epoch": 0.8331602403144596, + "grad_norm": 2.514529585621814, + "learning_rate": 3.4248893073483504e-07, + "loss": 1.0036, + "step": 7922 + }, + { + "epoch": 0.8332654107563069, + "grad_norm": 2.025207298660727, + "learning_rate": 3.4206730327519365e-07, + "loss": 0.978, + "step": 7923 + }, + { + "epoch": 0.8333705811981542, + "grad_norm": 2.5083140341213355, + "learning_rate": 3.4164591644181233e-07, + "loss": 0.9394, + "step": 7924 + }, + { + "epoch": 0.8334757516400015, + "grad_norm": 3.419460327667554, + "learning_rate": 3.412247702816771e-07, + "loss": 0.9656, + "step": 7925 + }, + { + "epoch": 0.8335809220818489, + "grad_norm": 2.5984729823260144, + "learning_rate": 3.408038648417503e-07, + "loss": 0.9887, + "step": 7926 + }, + { + "epoch": 0.8336860925236962, + "grad_norm": 2.3818846798670767, + "learning_rate": 3.403832001689661e-07, + "loss": 0.9715, + "step": 7927 + }, + { + "epoch": 0.8337912629655435, + "grad_norm": 2.210634104658908, + "learning_rate": 3.399627763102309e-07, + "loss": 0.978, + "step": 7928 + }, + { + "epoch": 
0.8338964334073908, + "grad_norm": 2.5371862459046146, + "learning_rate": 3.3954259331242664e-07, + "loss": 0.9618, + "step": 7929 + }, + { + "epoch": 0.8340016038492382, + "grad_norm": 2.480566157647304, + "learning_rate": 3.391226512224055e-07, + "loss": 0.9725, + "step": 7930 + }, + { + "epoch": 0.8341067742910855, + "grad_norm": 2.1229236675007845, + "learning_rate": 3.387029500869954e-07, + "loss": 0.9619, + "step": 7931 + }, + { + "epoch": 0.8342119447329328, + "grad_norm": 1.9698108801024228, + "learning_rate": 3.3828348995299575e-07, + "loss": 0.9741, + "step": 7932 + }, + { + "epoch": 0.8343171151747801, + "grad_norm": 1.9418318681400226, + "learning_rate": 3.3786427086718006e-07, + "loss": 0.9577, + "step": 7933 + }, + { + "epoch": 0.8344222856166275, + "grad_norm": 3.3957679090439017, + "learning_rate": 3.3744529287629446e-07, + "loss": 0.9471, + "step": 7934 + }, + { + "epoch": 0.8345274560584748, + "grad_norm": 2.4124644360616436, + "learning_rate": 3.3702655602705693e-07, + "loss": 0.983, + "step": 7935 + }, + { + "epoch": 0.8346326265003221, + "grad_norm": 2.7101113991825825, + "learning_rate": 3.36608060366162e-07, + "loss": 0.9811, + "step": 7936 + }, + { + "epoch": 0.8347377969421694, + "grad_norm": 2.2798654842255157, + "learning_rate": 3.361898059402738e-07, + "loss": 0.9592, + "step": 7937 + }, + { + "epoch": 0.8348429673840168, + "grad_norm": 2.982597973091621, + "learning_rate": 3.357717927960316e-07, + "loss": 1.0268, + "step": 7938 + }, + { + "epoch": 0.8349481378258641, + "grad_norm": 2.298141064600936, + "learning_rate": 3.3535402098004626e-07, + "loss": 0.9903, + "step": 7939 + }, + { + "epoch": 0.8350533082677114, + "grad_norm": 2.3752357601389704, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.9704, + "step": 7940 + }, + { + "epoch": 0.8351584787095587, + "grad_norm": 2.763407955749695, + "learning_rate": 3.345192015191609e-07, + "loss": 0.9829, + "step": 7941 + }, + { + "epoch": 0.835263649151406, + "grad_norm": 2.828981552542588, + "learning_rate": 3.341021539673492e-07, + "loss": 1.0321, + "step": 7942 + }, + { + "epoch": 0.8353688195932533, + "grad_norm": 2.2262635053414574, + "learning_rate": 3.3368534792997285e-07, + "loss": 0.9639, + "step": 7943 + }, + { + "epoch": 0.8354739900351006, + "grad_norm": 2.446688699369652, + "learning_rate": 3.3326878345350784e-07, + "loss": 1.0052, + "step": 7944 + }, + { + "epoch": 0.8355791604769479, + "grad_norm": 2.7455178000846363, + "learning_rate": 3.3285246058440635e-07, + "loss": 0.9826, + "step": 7945 + }, + { + "epoch": 0.8356843309187952, + "grad_norm": 2.464176366963785, + "learning_rate": 3.324363793690899e-07, + "loss": 0.9878, + "step": 7946 + }, + { + "epoch": 0.8357895013606426, + "grad_norm": 2.2726195863266705, + "learning_rate": 3.320205398539561e-07, + "loss": 0.9455, + "step": 7947 + }, + { + "epoch": 0.8358946718024899, + "grad_norm": 2.2989720657926402, + "learning_rate": 3.316049420853729e-07, + "loss": 0.9815, + "step": 7948 + }, + { + "epoch": 0.8359998422443372, + "grad_norm": 2.1279969880508705, + "learning_rate": 3.311895861096831e-07, + "loss": 0.9454, + "step": 7949 + }, + { + "epoch": 0.8361050126861845, + "grad_norm": 2.6446950816118195, + "learning_rate": 3.307744719732031e-07, + "loss": 0.9773, + "step": 7950 + }, + { + "epoch": 0.8362101831280319, + "grad_norm": 1.955699911337364, + "learning_rate": 3.3035959972222e-07, + "loss": 0.9484, + "step": 7951 + }, + { + "epoch": 0.8363153535698792, + "grad_norm": 2.332225298540404, + "learning_rate": 3.299449694029966e-07, + "loss": 
0.9963, + "step": 7952 + }, + { + "epoch": 0.8364205240117265, + "grad_norm": 4.216667098594006, + "learning_rate": 3.2953058106176607e-07, + "loss": 1.0338, + "step": 7953 + }, + { + "epoch": 0.8365256944535738, + "grad_norm": 2.0765665881975752, + "learning_rate": 3.2911643474473647e-07, + "loss": 0.9991, + "step": 7954 + }, + { + "epoch": 0.8366308648954212, + "grad_norm": 2.044749035524921, + "learning_rate": 3.28702530498089e-07, + "loss": 0.9589, + "step": 7955 + }, + { + "epoch": 0.8367360353372685, + "grad_norm": 3.061177661810104, + "learning_rate": 3.282888683679758e-07, + "loss": 1.0022, + "step": 7956 + }, + { + "epoch": 0.8368412057791158, + "grad_norm": 1.852367172280801, + "learning_rate": 3.2787544840052503e-07, + "loss": 0.9729, + "step": 7957 + }, + { + "epoch": 0.8369463762209631, + "grad_norm": 2.4129977271544436, + "learning_rate": 3.274622706418346e-07, + "loss": 0.9655, + "step": 7958 + }, + { + "epoch": 0.8370515466628105, + "grad_norm": 2.467914364505314, + "learning_rate": 3.270493351379786e-07, + "loss": 0.9492, + "step": 7959 + }, + { + "epoch": 0.8371567171046578, + "grad_norm": 2.311681107608195, + "learning_rate": 3.2663664193500114e-07, + "loss": 0.9855, + "step": 7960 + }, + { + "epoch": 0.8372618875465051, + "grad_norm": 2.670758248675138, + "learning_rate": 3.262241910789221e-07, + "loss": 0.9683, + "step": 7961 + }, + { + "epoch": 0.8373670579883524, + "grad_norm": 2.1454789719559284, + "learning_rate": 3.258119826157319e-07, + "loss": 0.9359, + "step": 7962 + }, + { + "epoch": 0.8374722284301996, + "grad_norm": 2.4776693808188446, + "learning_rate": 3.2540001659139506e-07, + "loss": 1.0157, + "step": 7963 + }, + { + "epoch": 0.837577398872047, + "grad_norm": 1.8011863229194303, + "learning_rate": 3.249882930518497e-07, + "loss": 0.9874, + "step": 7964 + }, + { + "epoch": 0.8376825693138943, + "grad_norm": 2.168737945078986, + "learning_rate": 3.245768120430054e-07, + "loss": 0.993, + "step": 7965 + }, + { + "epoch": 0.8377877397557416, + "grad_norm": 3.0895348956728617, + "learning_rate": 3.241655736107466e-07, + "loss": 0.9992, + "step": 7966 + }, + { + "epoch": 0.8378929101975889, + "grad_norm": 3.3010610656877093, + "learning_rate": 3.237545778009274e-07, + "loss": 0.9793, + "step": 7967 + }, + { + "epoch": 0.8379980806394363, + "grad_norm": 2.7672284112761956, + "learning_rate": 3.233438246593795e-07, + "loss": 0.9919, + "step": 7968 + }, + { + "epoch": 0.8381032510812836, + "grad_norm": 2.0517422870765625, + "learning_rate": 3.229333142319041e-07, + "loss": 0.991, + "step": 7969 + }, + { + "epoch": 0.8382084215231309, + "grad_norm": 2.0009951842118143, + "learning_rate": 3.2252304656427566e-07, + "loss": 0.9795, + "step": 7970 + }, + { + "epoch": 0.8383135919649782, + "grad_norm": 2.4005492702244977, + "learning_rate": 3.221130217022433e-07, + "loss": 0.9675, + "step": 7971 + }, + { + "epoch": 0.8384187624068256, + "grad_norm": 2.1804042366396232, + "learning_rate": 3.217032396915265e-07, + "loss": 0.972, + "step": 7972 + }, + { + "epoch": 0.8385239328486729, + "grad_norm": 2.973961335735684, + "learning_rate": 3.2129370057782106e-07, + "loss": 0.9951, + "step": 7973 + }, + { + "epoch": 0.8386291032905202, + "grad_norm": 2.025266752429975, + "learning_rate": 3.208844044067921e-07, + "loss": 1.0002, + "step": 7974 + }, + { + "epoch": 0.8387342737323675, + "grad_norm": 2.4059729700613457, + "learning_rate": 3.2047535122408076e-07, + "loss": 0.9795, + "step": 7975 + }, + { + "epoch": 0.8388394441742149, + "grad_norm": 2.753932435517075, + 
"learning_rate": 3.200665410752982e-07, + "loss": 0.9798, + "step": 7976 + }, + { + "epoch": 0.8389446146160622, + "grad_norm": 2.412042721936347, + "learning_rate": 3.196579740060307e-07, + "loss": 0.9764, + "step": 7977 + }, + { + "epoch": 0.8390497850579095, + "grad_norm": 2.922557537170381, + "learning_rate": 3.192496500618372e-07, + "loss": 0.9586, + "step": 7978 + }, + { + "epoch": 0.8391549554997568, + "grad_norm": 2.2610207215000173, + "learning_rate": 3.188415692882477e-07, + "loss": 0.9687, + "step": 7979 + }, + { + "epoch": 0.8392601259416042, + "grad_norm": 2.6933915853361006, + "learning_rate": 3.1843373173076784e-07, + "loss": 1.0122, + "step": 7980 + }, + { + "epoch": 0.8393652963834515, + "grad_norm": 1.9203482516232904, + "learning_rate": 3.1802613743487255e-07, + "loss": 0.9712, + "step": 7981 + }, + { + "epoch": 0.8394704668252988, + "grad_norm": 2.8523465380370348, + "learning_rate": 3.1761878644601425e-07, + "loss": 0.9694, + "step": 7982 + }, + { + "epoch": 0.839575637267146, + "grad_norm": 2.2911244103217774, + "learning_rate": 3.172116788096141e-07, + "loss": 0.9529, + "step": 7983 + }, + { + "epoch": 0.8396808077089933, + "grad_norm": 2.715895442199244, + "learning_rate": 3.1680481457106895e-07, + "loss": 0.9757, + "step": 7984 + }, + { + "epoch": 0.8397859781508407, + "grad_norm": 2.250183946803774, + "learning_rate": 3.1639819377574624e-07, + "loss": 0.967, + "step": 7985 + }, + { + "epoch": 0.839891148592688, + "grad_norm": 2.107649904873707, + "learning_rate": 3.159918164689871e-07, + "loss": 1.0024, + "step": 7986 + }, + { + "epoch": 0.8399963190345353, + "grad_norm": 2.094200221460402, + "learning_rate": 3.1558568269610707e-07, + "loss": 0.9948, + "step": 7987 + }, + { + "epoch": 0.8401014894763826, + "grad_norm": 2.144487336078061, + "learning_rate": 3.151797925023922e-07, + "loss": 1.03, + "step": 7988 + }, + { + "epoch": 0.84020665991823, + "grad_norm": 2.264741520119065, + "learning_rate": 3.147741459331033e-07, + "loss": 0.9903, + "step": 7989 + }, + { + "epoch": 0.8403118303600773, + "grad_norm": 2.3375622944629764, + "learning_rate": 3.1436874303347185e-07, + "loss": 0.9826, + "step": 7990 + }, + { + "epoch": 0.8404170008019246, + "grad_norm": 2.5671078673407473, + "learning_rate": 3.1396358384870426e-07, + "loss": 0.9646, + "step": 7991 + }, + { + "epoch": 0.8405221712437719, + "grad_norm": 2.3619671652091667, + "learning_rate": 3.135586684239794e-07, + "loss": 0.9667, + "step": 7992 + }, + { + "epoch": 0.8406273416856193, + "grad_norm": 2.416335511787064, + "learning_rate": 3.1315399680444727e-07, + "loss": 0.9691, + "step": 7993 + }, + { + "epoch": 0.8407325121274666, + "grad_norm": 1.9661025621449095, + "learning_rate": 3.127495690352331e-07, + "loss": 0.9958, + "step": 7994 + }, + { + "epoch": 0.8408376825693139, + "grad_norm": 3.035263773083995, + "learning_rate": 3.1234538516143194e-07, + "loss": 0.9768, + "step": 7995 + }, + { + "epoch": 0.8409428530111612, + "grad_norm": 2.0867961026565207, + "learning_rate": 3.119414452281158e-07, + "loss": 1.0095, + "step": 7996 + }, + { + "epoch": 0.8410480234530086, + "grad_norm": 2.7704288005888826, + "learning_rate": 3.1153774928032564e-07, + "loss": 0.9915, + "step": 7997 + }, + { + "epoch": 0.8411531938948559, + "grad_norm": 2.6501249494588413, + "learning_rate": 3.111342973630771e-07, + "loss": 0.9808, + "step": 7998 + }, + { + "epoch": 0.8412583643367032, + "grad_norm": 2.438311465297352, + "learning_rate": 3.107310895213578e-07, + "loss": 0.962, + "step": 7999 + }, + { + "epoch": 0.8413635347785505, 
+ "grad_norm": 2.345077898806914, + "learning_rate": 3.1032812580012895e-07, + "loss": 0.9818, + "step": 8000 + }, + { + "epoch": 0.8414687052203979, + "grad_norm": 2.891411545914225, + "learning_rate": 3.0992540624432436e-07, + "loss": 0.9883, + "step": 8001 + }, + { + "epoch": 0.8415738756622452, + "grad_norm": 2.415834378113018, + "learning_rate": 3.0952293089884984e-07, + "loss": 0.9521, + "step": 8002 + }, + { + "epoch": 0.8416790461040925, + "grad_norm": 2.8617855981045426, + "learning_rate": 3.0912069980858496e-07, + "loss": 0.9902, + "step": 8003 + }, + { + "epoch": 0.8417842165459397, + "grad_norm": 2.8879149599658422, + "learning_rate": 3.0871871301838053e-07, + "loss": 1.0216, + "step": 8004 + }, + { + "epoch": 0.841889386987787, + "grad_norm": 1.8612027892835419, + "learning_rate": 3.083169705730629e-07, + "loss": 0.96, + "step": 8005 + }, + { + "epoch": 0.8419945574296344, + "grad_norm": 2.6330298233689122, + "learning_rate": 3.079154725174288e-07, + "loss": 0.9465, + "step": 8006 + }, + { + "epoch": 0.8420997278714817, + "grad_norm": 2.138955256278022, + "learning_rate": 3.0751421889624766e-07, + "loss": 0.947, + "step": 8007 + }, + { + "epoch": 0.842204898313329, + "grad_norm": 2.616727462598564, + "learning_rate": 3.0711320975426367e-07, + "loss": 0.9434, + "step": 8008 + }, + { + "epoch": 0.8423100687551763, + "grad_norm": 2.4895885340978543, + "learning_rate": 3.067124451361905e-07, + "loss": 1.0038, + "step": 8009 + }, + { + "epoch": 0.8424152391970237, + "grad_norm": 3.4008930173947873, + "learning_rate": 3.063119250867186e-07, + "loss": 0.9394, + "step": 8010 + }, + { + "epoch": 0.842520409638871, + "grad_norm": 2.851466967075379, + "learning_rate": 3.05911649650508e-07, + "loss": 0.954, + "step": 8011 + }, + { + "epoch": 0.8426255800807183, + "grad_norm": 2.9185957846098027, + "learning_rate": 3.055116188721932e-07, + "loss": 1.0138, + "step": 8012 + }, + { + "epoch": 0.8427307505225656, + "grad_norm": 2.260085530350411, + "learning_rate": 3.051118327963798e-07, + "loss": 0.9917, + "step": 8013 + }, + { + "epoch": 0.842835920964413, + "grad_norm": 2.458882313705006, + "learning_rate": 3.047122914676473e-07, + "loss": 1.0258, + "step": 8014 + }, + { + "epoch": 0.8429410914062603, + "grad_norm": 3.1014214186637883, + "learning_rate": 3.043129949305487e-07, + "loss": 0.9228, + "step": 8015 + }, + { + "epoch": 0.8430462618481076, + "grad_norm": 1.6614260018195441, + "learning_rate": 3.0391394322960706e-07, + "loss": 0.9825, + "step": 8016 + }, + { + "epoch": 0.8431514322899549, + "grad_norm": 2.0667501456498605, + "learning_rate": 3.035151364093214e-07, + "loss": 1.0093, + "step": 8017 + }, + { + "epoch": 0.8432566027318023, + "grad_norm": 2.598733278916, + "learning_rate": 3.031165745141595e-07, + "loss": 0.9792, + "step": 8018 + }, + { + "epoch": 0.8433617731736496, + "grad_norm": 2.5060894728850913, + "learning_rate": 3.0271825758856675e-07, + "loss": 0.977, + "step": 8019 + }, + { + "epoch": 0.8434669436154969, + "grad_norm": 2.55384223599344, + "learning_rate": 3.02320185676957e-07, + "loss": 1.0313, + "step": 8020 + }, + { + "epoch": 0.8435721140573442, + "grad_norm": 3.1720208128466916, + "learning_rate": 3.0192235882371913e-07, + "loss": 0.9646, + "step": 8021 + }, + { + "epoch": 0.8436772844991915, + "grad_norm": 2.813164544453831, + "learning_rate": 3.01524777073213e-07, + "loss": 0.9844, + "step": 8022 + }, + { + "epoch": 0.8437824549410389, + "grad_norm": 1.7681715170092602, + "learning_rate": 3.0112744046977255e-07, + "loss": 0.9852, + "step": 8023 + }, + { + 
"epoch": 0.8438876253828861, + "grad_norm": 2.8437496154777326, + "learning_rate": 3.007303490577043e-07, + "loss": 1.0048, + "step": 8024 + }, + { + "epoch": 0.8439927958247334, + "grad_norm": 2.7174670709863884, + "learning_rate": 3.003335028812862e-07, + "loss": 0.978, + "step": 8025 + }, + { + "epoch": 0.8440979662665807, + "grad_norm": 3.067825714603444, + "learning_rate": 2.999369019847709e-07, + "loss": 0.9515, + "step": 8026 + }, + { + "epoch": 0.8442031367084281, + "grad_norm": 2.8915929181982105, + "learning_rate": 2.99540546412381e-07, + "loss": 1.0105, + "step": 8027 + }, + { + "epoch": 0.8443083071502754, + "grad_norm": 1.8900985976770452, + "learning_rate": 2.99144436208314e-07, + "loss": 0.9914, + "step": 8028 + }, + { + "epoch": 0.8444134775921227, + "grad_norm": 2.2877820769502213, + "learning_rate": 2.987485714167396e-07, + "loss": 0.9966, + "step": 8029 + }, + { + "epoch": 0.84451864803397, + "grad_norm": 2.196650528892029, + "learning_rate": 2.9835295208179907e-07, + "loss": 0.9936, + "step": 8030 + }, + { + "epoch": 0.8446238184758174, + "grad_norm": 3.2894078035994663, + "learning_rate": 2.9795757824760796e-07, + "loss": 1.0142, + "step": 8031 + }, + { + "epoch": 0.8447289889176647, + "grad_norm": 2.297997513581351, + "learning_rate": 2.975624499582516e-07, + "loss": 0.911, + "step": 8032 + }, + { + "epoch": 0.844834159359512, + "grad_norm": 2.647899001008078, + "learning_rate": 2.9716756725779254e-07, + "loss": 0.9509, + "step": 8033 + }, + { + "epoch": 0.8449393298013593, + "grad_norm": 2.360019920855114, + "learning_rate": 2.967729301902611e-07, + "loss": 1.0027, + "step": 8034 + }, + { + "epoch": 0.8450445002432067, + "grad_norm": 2.999008966583943, + "learning_rate": 2.96378538799664e-07, + "loss": 1.0052, + "step": 8035 + }, + { + "epoch": 0.845149670685054, + "grad_norm": 2.6618078141442987, + "learning_rate": 2.9598439312997745e-07, + "loss": 0.9906, + "step": 8036 + }, + { + "epoch": 0.8452548411269013, + "grad_norm": 2.4720173368158074, + "learning_rate": 2.955904932251527e-07, + "loss": 0.9599, + "step": 8037 + }, + { + "epoch": 0.8453600115687486, + "grad_norm": 2.8992055015069678, + "learning_rate": 2.9519683912911267e-07, + "loss": 1.0055, + "step": 8038 + }, + { + "epoch": 0.845465182010596, + "grad_norm": 2.265207100310217, + "learning_rate": 2.9480343088575227e-07, + "loss": 1.0136, + "step": 8039 + }, + { + "epoch": 0.8455703524524433, + "grad_norm": 2.4872651865839615, + "learning_rate": 2.9441026853894024e-07, + "loss": 0.9741, + "step": 8040 + }, + { + "epoch": 0.8456755228942906, + "grad_norm": 2.3337883217309714, + "learning_rate": 2.940173521325165e-07, + "loss": 0.9959, + "step": 8041 + }, + { + "epoch": 0.8457806933361379, + "grad_norm": 2.4394701294128054, + "learning_rate": 2.936246817102947e-07, + "loss": 0.9584, + "step": 8042 + }, + { + "epoch": 0.8458858637779852, + "grad_norm": 1.8882443397504602, + "learning_rate": 2.9323225731606094e-07, + "loss": 0.9236, + "step": 8043 + }, + { + "epoch": 0.8459910342198325, + "grad_norm": 2.2667033507578913, + "learning_rate": 2.9284007899357286e-07, + "loss": 0.9221, + "step": 8044 + }, + { + "epoch": 0.8460962046616798, + "grad_norm": 2.370795500055156, + "learning_rate": 2.924481467865617e-07, + "loss": 0.9892, + "step": 8045 + }, + { + "epoch": 0.8462013751035271, + "grad_norm": 2.6229927783843827, + "learning_rate": 2.920564607387313e-07, + "loss": 1.0033, + "step": 8046 + }, + { + "epoch": 0.8463065455453744, + "grad_norm": 2.675563045044748, + "learning_rate": 2.916650208937577e-07, + 
"loss": 0.9703, + "step": 8047 + }, + { + "epoch": 0.8464117159872218, + "grad_norm": 2.7466454975694754, + "learning_rate": 2.912738272952889e-07, + "loss": 0.9759, + "step": 8048 + }, + { + "epoch": 0.8465168864290691, + "grad_norm": 2.1964656610410445, + "learning_rate": 2.9088287998694673e-07, + "loss": 0.9888, + "step": 8049 + }, + { + "epoch": 0.8466220568709164, + "grad_norm": 2.479272749557706, + "learning_rate": 2.9049217901232387e-07, + "loss": 0.9398, + "step": 8050 + }, + { + "epoch": 0.8467272273127637, + "grad_norm": 2.787185292911654, + "learning_rate": 2.901017244149873e-07, + "loss": 0.9828, + "step": 8051 + }, + { + "epoch": 0.846832397754611, + "grad_norm": 1.98715699218698, + "learning_rate": 2.897115162384759e-07, + "loss": 1.0073, + "step": 8052 + }, + { + "epoch": 0.8469375681964584, + "grad_norm": 2.17127153048729, + "learning_rate": 2.893215545263001e-07, + "loss": 0.9981, + "step": 8053 + }, + { + "epoch": 0.8470427386383057, + "grad_norm": 2.3336424406843115, + "learning_rate": 2.889318393219445e-07, + "loss": 0.9829, + "step": 8054 + }, + { + "epoch": 0.847147909080153, + "grad_norm": 2.250375988178359, + "learning_rate": 2.885423706688642e-07, + "loss": 0.9889, + "step": 8055 + }, + { + "epoch": 0.8472530795220004, + "grad_norm": 2.1354421736272293, + "learning_rate": 2.8815314861048966e-07, + "loss": 1.0016, + "step": 8056 + }, + { + "epoch": 0.8473582499638477, + "grad_norm": 2.331711007943775, + "learning_rate": 2.8776417319022145e-07, + "loss": 0.9307, + "step": 8057 + }, + { + "epoch": 0.847463420405695, + "grad_norm": 2.8514494357851214, + "learning_rate": 2.8737544445143263e-07, + "loss": 0.9425, + "step": 8058 + }, + { + "epoch": 0.8475685908475423, + "grad_norm": 2.6329555820560557, + "learning_rate": 2.869869624374699e-07, + "loss": 0.9704, + "step": 8059 + }, + { + "epoch": 0.8476737612893896, + "grad_norm": 2.129162011010815, + "learning_rate": 2.8659872719165203e-07, + "loss": 0.9776, + "step": 8060 + }, + { + "epoch": 0.847778931731237, + "grad_norm": 2.334892721506637, + "learning_rate": 2.86210738757271e-07, + "loss": 0.9834, + "step": 8061 + }, + { + "epoch": 0.8478841021730843, + "grad_norm": 2.3641357364728273, + "learning_rate": 2.858229971775894e-07, + "loss": 0.9409, + "step": 8062 + }, + { + "epoch": 0.8479892726149316, + "grad_norm": 2.1169833059337764, + "learning_rate": 2.8543550249584465e-07, + "loss": 0.9716, + "step": 8063 + }, + { + "epoch": 0.848094443056779, + "grad_norm": 2.2632523362465875, + "learning_rate": 2.850482547552441e-07, + "loss": 0.9724, + "step": 8064 + }, + { + "epoch": 0.8481996134986262, + "grad_norm": 2.508398111461374, + "learning_rate": 2.846612539989693e-07, + "loss": 0.9645, + "step": 8065 + }, + { + "epoch": 0.8483047839404735, + "grad_norm": 3.107750551526337, + "learning_rate": 2.8427450027017493e-07, + "loss": 0.9812, + "step": 8066 + }, + { + "epoch": 0.8484099543823208, + "grad_norm": 2.1641973776169845, + "learning_rate": 2.838879936119854e-07, + "loss": 0.9508, + "step": 8067 + }, + { + "epoch": 0.8485151248241681, + "grad_norm": 2.465015127853074, + "learning_rate": 2.8350173406749975e-07, + "loss": 0.9792, + "step": 8068 + }, + { + "epoch": 0.8486202952660155, + "grad_norm": 1.9790492113518683, + "learning_rate": 2.8311572167978947e-07, + "loss": 0.9586, + "step": 8069 + }, + { + "epoch": 0.8487254657078628, + "grad_norm": 3.114675329966703, + "learning_rate": 2.827299564918978e-07, + "loss": 0.9591, + "step": 8070 + }, + { + "epoch": 0.8488306361497101, + "grad_norm": 2.733280124937022, + 
"learning_rate": 2.823444385468399e-07, + "loss": 1.0032, + "step": 8071 + }, + { + "epoch": 0.8489358065915574, + "grad_norm": 2.0042271783468766, + "learning_rate": 2.819591678876052e-07, + "loss": 0.9599, + "step": 8072 + }, + { + "epoch": 0.8490409770334048, + "grad_norm": 2.8190894222776337, + "learning_rate": 2.815741445571529e-07, + "loss": 0.9586, + "step": 8073 + }, + { + "epoch": 0.8491461474752521, + "grad_norm": 2.8042120225394753, + "learning_rate": 2.8118936859841684e-07, + "loss": 0.9776, + "step": 8074 + }, + { + "epoch": 0.8492513179170994, + "grad_norm": 2.0986367427377446, + "learning_rate": 2.8080484005430313e-07, + "loss": 0.9594, + "step": 8075 + }, + { + "epoch": 0.8493564883589467, + "grad_norm": 2.8316742922493017, + "learning_rate": 2.804205589676884e-07, + "loss": 0.9596, + "step": 8076 + }, + { + "epoch": 0.849461658800794, + "grad_norm": 3.077204387316349, + "learning_rate": 2.8003652538142413e-07, + "loss": 0.96, + "step": 8077 + }, + { + "epoch": 0.8495668292426414, + "grad_norm": 2.310308677624084, + "learning_rate": 2.796527393383322e-07, + "loss": 0.9743, + "step": 8078 + }, + { + "epoch": 0.8496719996844887, + "grad_norm": 2.1833465701082884, + "learning_rate": 2.7926920088120786e-07, + "loss": 0.9564, + "step": 8079 + }, + { + "epoch": 0.849777170126336, + "grad_norm": 2.3941525202307274, + "learning_rate": 2.788859100528196e-07, + "loss": 0.9678, + "step": 8080 + }, + { + "epoch": 0.8498823405681833, + "grad_norm": 2.6311798448564607, + "learning_rate": 2.785028668959061e-07, + "loss": 0.9633, + "step": 8081 + }, + { + "epoch": 0.8499875110100307, + "grad_norm": 2.963362741185363, + "learning_rate": 2.781200714531801e-07, + "loss": 0.9707, + "step": 8082 + }, + { + "epoch": 0.850092681451878, + "grad_norm": 3.184965341727443, + "learning_rate": 2.7773752376732605e-07, + "loss": 1.0092, + "step": 8083 + }, + { + "epoch": 0.8501978518937253, + "grad_norm": 1.8532486139994901, + "learning_rate": 2.7735522388100206e-07, + "loss": 0.9832, + "step": 8084 + }, + { + "epoch": 0.8503030223355725, + "grad_norm": 2.1835915901309075, + "learning_rate": 2.76973171836836e-07, + "loss": 0.9377, + "step": 8085 + }, + { + "epoch": 0.8504081927774199, + "grad_norm": 3.5734845337817935, + "learning_rate": 2.7659136767743096e-07, + "loss": 1.0057, + "step": 8086 + }, + { + "epoch": 0.8505133632192672, + "grad_norm": 2.1809470260774644, + "learning_rate": 2.7620981144536014e-07, + "loss": 0.943, + "step": 8087 + }, + { + "epoch": 0.8506185336611145, + "grad_norm": 2.348459690619445, + "learning_rate": 2.7582850318317005e-07, + "loss": 0.929, + "step": 8088 + }, + { + "epoch": 0.8507237041029618, + "grad_norm": 2.4740076632830146, + "learning_rate": 2.7544744293338063e-07, + "loss": 0.9618, + "step": 8089 + }, + { + "epoch": 0.8508288745448092, + "grad_norm": 3.4907303797398517, + "learning_rate": 2.750666307384814e-07, + "loss": 1.0089, + "step": 8090 + }, + { + "epoch": 0.8509340449866565, + "grad_norm": 3.116098339723679, + "learning_rate": 2.746860666409371e-07, + "loss": 0.9985, + "step": 8091 + }, + { + "epoch": 0.8510392154285038, + "grad_norm": 2.302135132288717, + "learning_rate": 2.743057506831834e-07, + "loss": 0.9955, + "step": 8092 + }, + { + "epoch": 0.8511443858703511, + "grad_norm": 2.454597566905971, + "learning_rate": 2.7392568290762786e-07, + "loss": 0.9345, + "step": 8093 + }, + { + "epoch": 0.8512495563121985, + "grad_norm": 2.297569677220409, + "learning_rate": 2.7354586335665205e-07, + "loss": 0.9634, + "step": 8094 + }, + { + "epoch": 
0.8513547267540458, + "grad_norm": 2.5670899366081614, + "learning_rate": 2.7316629207260745e-07, + "loss": 0.9547, + "step": 8095 + }, + { + "epoch": 0.8514598971958931, + "grad_norm": 2.111670089542108, + "learning_rate": 2.727869690978202e-07, + "loss": 0.9991, + "step": 8096 + }, + { + "epoch": 0.8515650676377404, + "grad_norm": 2.1740992859773454, + "learning_rate": 2.7240789447458756e-07, + "loss": 0.9991, + "step": 8097 + }, + { + "epoch": 0.8516702380795877, + "grad_norm": 3.134509753421687, + "learning_rate": 2.7202906824517955e-07, + "loss": 0.9657, + "step": 8098 + }, + { + "epoch": 0.8517754085214351, + "grad_norm": 2.375342092275538, + "learning_rate": 2.716504904518372e-07, + "loss": 0.9857, + "step": 8099 + }, + { + "epoch": 0.8518805789632824, + "grad_norm": 2.1830836767759045, + "learning_rate": 2.7127216113677636e-07, + "loss": 0.9715, + "step": 8100 + }, + { + "epoch": 0.8519857494051297, + "grad_norm": 3.471010207274205, + "learning_rate": 2.708940803421825e-07, + "loss": 1.0784, + "step": 8101 + }, + { + "epoch": 0.852090919846977, + "grad_norm": 2.6989538225629897, + "learning_rate": 2.705162481102147e-07, + "loss": 0.9515, + "step": 8102 + }, + { + "epoch": 0.8521960902888244, + "grad_norm": 2.428855683370294, + "learning_rate": 2.7013866448300506e-07, + "loss": 1.0098, + "step": 8103 + }, + { + "epoch": 0.8523012607306717, + "grad_norm": 1.9294068184039133, + "learning_rate": 2.697613295026563e-07, + "loss": 0.9569, + "step": 8104 + }, + { + "epoch": 0.8524064311725189, + "grad_norm": 2.703791788987313, + "learning_rate": 2.6938424321124396e-07, + "loss": 1.0116, + "step": 8105 + }, + { + "epoch": 0.8525116016143662, + "grad_norm": 1.8334006288217113, + "learning_rate": 2.690074056508168e-07, + "loss": 1.0143, + "step": 8106 + }, + { + "epoch": 0.8526167720562136, + "grad_norm": 2.086771645052246, + "learning_rate": 2.686308168633953e-07, + "loss": 0.9975, + "step": 8107 + }, + { + "epoch": 0.8527219424980609, + "grad_norm": 2.064759245549448, + "learning_rate": 2.6825447689097174e-07, + "loss": 0.9949, + "step": 8108 + }, + { + "epoch": 0.8528271129399082, + "grad_norm": 2.5706290935033627, + "learning_rate": 2.678783857755102e-07, + "loss": 0.9576, + "step": 8109 + }, + { + "epoch": 0.8529322833817555, + "grad_norm": 2.7378677889966974, + "learning_rate": 2.675025435589482e-07, + "loss": 0.9744, + "step": 8110 + }, + { + "epoch": 0.8530374538236029, + "grad_norm": 2.8537643482308424, + "learning_rate": 2.671269502831955e-07, + "loss": 0.9979, + "step": 8111 + }, + { + "epoch": 0.8531426242654502, + "grad_norm": 2.18393636096386, + "learning_rate": 2.6675160599013374e-07, + "loss": 0.9741, + "step": 8112 + }, + { + "epoch": 0.8532477947072975, + "grad_norm": 2.101881287973468, + "learning_rate": 2.66376510721616e-07, + "loss": 0.9905, + "step": 8113 + }, + { + "epoch": 0.8533529651491448, + "grad_norm": 2.1876358871831973, + "learning_rate": 2.660016645194688e-07, + "loss": 0.9924, + "step": 8114 + }, + { + "epoch": 0.8534581355909922, + "grad_norm": 3.243463727452169, + "learning_rate": 2.6562706742549073e-07, + "loss": 0.9812, + "step": 8115 + }, + { + "epoch": 0.8535633060328395, + "grad_norm": 3.2673747014657084, + "learning_rate": 2.652527194814511e-07, + "loss": 1.0241, + "step": 8116 + }, + { + "epoch": 0.8536684764746868, + "grad_norm": 2.4360167551039074, + "learning_rate": 2.6487862072909404e-07, + "loss": 0.9474, + "step": 8117 + }, + { + "epoch": 0.8537736469165341, + "grad_norm": 2.431103723048994, + "learning_rate": 2.645047712101334e-07, + "loss": 
1.0166, + "step": 8118 + }, + { + "epoch": 0.8538788173583814, + "grad_norm": 2.598232947614198, + "learning_rate": 2.641311709662567e-07, + "loss": 0.9566, + "step": 8119 + }, + { + "epoch": 0.8539839878002288, + "grad_norm": 2.5875433885989123, + "learning_rate": 2.637578200391233e-07, + "loss": 0.9922, + "step": 8120 + }, + { + "epoch": 0.8540891582420761, + "grad_norm": 2.3388693300392327, + "learning_rate": 2.63384718470365e-07, + "loss": 0.9561, + "step": 8121 + }, + { + "epoch": 0.8541943286839234, + "grad_norm": 2.754827845758967, + "learning_rate": 2.630118663015849e-07, + "loss": 0.989, + "step": 8122 + }, + { + "epoch": 0.8542994991257707, + "grad_norm": 2.1192941339795683, + "learning_rate": 2.626392635743594e-07, + "loss": 0.9967, + "step": 8123 + }, + { + "epoch": 0.8544046695676181, + "grad_norm": 2.616739686808066, + "learning_rate": 2.6226691033023665e-07, + "loss": 0.9751, + "step": 8124 + }, + { + "epoch": 0.8545098400094654, + "grad_norm": 2.0640093068786705, + "learning_rate": 2.618948066107363e-07, + "loss": 0.9815, + "step": 8125 + }, + { + "epoch": 0.8546150104513126, + "grad_norm": 3.0722152608836364, + "learning_rate": 2.615229524573518e-07, + "loss": 0.9867, + "step": 8126 + }, + { + "epoch": 0.8547201808931599, + "grad_norm": 2.697664599244337, + "learning_rate": 2.6115134791154657e-07, + "loss": 0.9973, + "step": 8127 + }, + { + "epoch": 0.8548253513350073, + "grad_norm": 2.1491230648206012, + "learning_rate": 2.607799930147581e-07, + "loss": 0.9659, + "step": 8128 + }, + { + "epoch": 0.8549305217768546, + "grad_norm": 2.091866541025715, + "learning_rate": 2.604088878083957e-07, + "loss": 0.9658, + "step": 8129 + }, + { + "epoch": 0.8550356922187019, + "grad_norm": 2.065310881673391, + "learning_rate": 2.600380323338397e-07, + "loss": 0.981, + "step": 8130 + }, + { + "epoch": 0.8551408626605492, + "grad_norm": 2.637190411853234, + "learning_rate": 2.596674266324442e-07, + "loss": 0.9403, + "step": 8131 + }, + { + "epoch": 0.8552460331023966, + "grad_norm": 2.2549585723194756, + "learning_rate": 2.5929707074553364e-07, + "loss": 0.9679, + "step": 8132 + }, + { + "epoch": 0.8553512035442439, + "grad_norm": 2.1466692322213095, + "learning_rate": 2.58926964714406e-07, + "loss": 0.947, + "step": 8133 + }, + { + "epoch": 0.8554563739860912, + "grad_norm": 3.13947290894236, + "learning_rate": 2.5855710858033096e-07, + "loss": 0.9944, + "step": 8134 + }, + { + "epoch": 0.8555615444279385, + "grad_norm": 2.2159814940956912, + "learning_rate": 2.581875023845512e-07, + "loss": 0.9693, + "step": 8135 + }, + { + "epoch": 0.8556667148697858, + "grad_norm": 2.261359755971855, + "learning_rate": 2.5781814616827936e-07, + "loss": 0.9711, + "step": 8136 + }, + { + "epoch": 0.8557718853116332, + "grad_norm": 2.1664017570010468, + "learning_rate": 2.57449039972702e-07, + "loss": 0.9714, + "step": 8137 + }, + { + "epoch": 0.8558770557534805, + "grad_norm": 2.2579137649584418, + "learning_rate": 2.5708018383897803e-07, + "loss": 0.9975, + "step": 8138 + }, + { + "epoch": 0.8559822261953278, + "grad_norm": 3.238109092541349, + "learning_rate": 2.567115778082366e-07, + "loss": 0.9784, + "step": 8139 + }, + { + "epoch": 0.8560873966371751, + "grad_norm": 2.4097012019064357, + "learning_rate": 2.563432219215814e-07, + "loss": 0.9779, + "step": 8140 + }, + { + "epoch": 0.8561925670790225, + "grad_norm": 2.2467448008644575, + "learning_rate": 2.559751162200855e-07, + "loss": 0.9641, + "step": 8141 + }, + { + "epoch": 0.8562977375208698, + "grad_norm": 2.3268805950761577, + 
"learning_rate": 2.556072607447965e-07, + "loss": 1.0105, + "step": 8142 + }, + { + "epoch": 0.8564029079627171, + "grad_norm": 2.887233445189877, + "learning_rate": 2.552396555367334e-07, + "loss": 0.9784, + "step": 8143 + }, + { + "epoch": 0.8565080784045644, + "grad_norm": 2.2652184394813246, + "learning_rate": 2.548723006368864e-07, + "loss": 0.9439, + "step": 8144 + }, + { + "epoch": 0.8566132488464118, + "grad_norm": 3.416084310322756, + "learning_rate": 2.545051960862188e-07, + "loss": 1.0174, + "step": 8145 + }, + { + "epoch": 0.856718419288259, + "grad_norm": 2.2619883051129888, + "learning_rate": 2.541383419256646e-07, + "loss": 0.9155, + "step": 8146 + }, + { + "epoch": 0.8568235897301063, + "grad_norm": 3.2034627263132034, + "learning_rate": 2.5377173819613274e-07, + "loss": 0.953, + "step": 8147 + }, + { + "epoch": 0.8569287601719536, + "grad_norm": 2.3308420908834213, + "learning_rate": 2.53405384938501e-07, + "loss": 0.977, + "step": 8148 + }, + { + "epoch": 0.857033930613801, + "grad_norm": 2.549797506834715, + "learning_rate": 2.530392821936212e-07, + "loss": 0.9436, + "step": 8149 + }, + { + "epoch": 0.8571391010556483, + "grad_norm": 2.001168695818676, + "learning_rate": 2.526734300023162e-07, + "loss": 0.965, + "step": 8150 + }, + { + "epoch": 0.8572442714974956, + "grad_norm": 2.9838680004948293, + "learning_rate": 2.5230782840538147e-07, + "loss": 0.9686, + "step": 8151 + }, + { + "epoch": 0.8573494419393429, + "grad_norm": 2.0859980624244487, + "learning_rate": 2.5194247744358523e-07, + "loss": 0.9598, + "step": 8152 + }, + { + "epoch": 0.8574546123811903, + "grad_norm": 1.6033395922396991, + "learning_rate": 2.5157737715766544e-07, + "loss": 0.9206, + "step": 8153 + }, + { + "epoch": 0.8575597828230376, + "grad_norm": 2.47393713284398, + "learning_rate": 2.5121252758833537e-07, + "loss": 0.9946, + "step": 8154 + }, + { + "epoch": 0.8576649532648849, + "grad_norm": 2.375844708577336, + "learning_rate": 2.508479287762769e-07, + "loss": 0.9725, + "step": 8155 + }, + { + "epoch": 0.8577701237067322, + "grad_norm": 1.907812917306332, + "learning_rate": 2.504835807621464e-07, + "loss": 0.9562, + "step": 8156 + }, + { + "epoch": 0.8578752941485795, + "grad_norm": 1.967540450625011, + "learning_rate": 2.501194835865717e-07, + "loss": 0.9612, + "step": 8157 + }, + { + "epoch": 0.8579804645904269, + "grad_norm": 2.9450000337243147, + "learning_rate": 2.4975563729015244e-07, + "loss": 0.9839, + "step": 8158 + }, + { + "epoch": 0.8580856350322742, + "grad_norm": 2.85718562948114, + "learning_rate": 2.493920419134604e-07, + "loss": 0.9857, + "step": 8159 + }, + { + "epoch": 0.8581908054741215, + "grad_norm": 2.384457833137985, + "learning_rate": 2.4902869749703797e-07, + "loss": 0.9708, + "step": 8160 + }, + { + "epoch": 0.8582959759159688, + "grad_norm": 3.4120765102011985, + "learning_rate": 2.4866560408140284e-07, + "loss": 0.9713, + "step": 8161 + }, + { + "epoch": 0.8584011463578162, + "grad_norm": 2.19415436252959, + "learning_rate": 2.483027617070413e-07, + "loss": 0.953, + "step": 8162 + }, + { + "epoch": 0.8585063167996635, + "grad_norm": 2.8855737475987264, + "learning_rate": 2.479401704144144e-07, + "loss": 0.9333, + "step": 8163 + }, + { + "epoch": 0.8586114872415108, + "grad_norm": 2.3336198104728374, + "learning_rate": 2.4757783024395244e-07, + "loss": 0.9902, + "step": 8164 + }, + { + "epoch": 0.8587166576833581, + "grad_norm": 2.645052818170446, + "learning_rate": 2.472157412360596e-07, + "loss": 0.9993, + "step": 8165 + }, + { + "epoch": 0.8588218281252055, + 
"grad_norm": 2.332866176403049, + "learning_rate": 2.4685390343111265e-07, + "loss": 0.9832, + "step": 8166 + }, + { + "epoch": 0.8589269985670527, + "grad_norm": 2.8089915163327266, + "learning_rate": 2.46492316869458e-07, + "loss": 0.9792, + "step": 8167 + }, + { + "epoch": 0.8590321690089, + "grad_norm": 2.459421993014172, + "learning_rate": 2.461309815914162e-07, + "loss": 1.0708, + "step": 8168 + }, + { + "epoch": 0.8591373394507473, + "grad_norm": 2.3521339437040734, + "learning_rate": 2.45769897637278e-07, + "loss": 1.0167, + "step": 8169 + }, + { + "epoch": 0.8592425098925947, + "grad_norm": 3.0044324465369336, + "learning_rate": 2.4540906504730814e-07, + "loss": 0.9617, + "step": 8170 + }, + { + "epoch": 0.859347680334442, + "grad_norm": 2.554374282868245, + "learning_rate": 2.450484838617417e-07, + "loss": 0.9495, + "step": 8171 + }, + { + "epoch": 0.8594528507762893, + "grad_norm": 2.387412844879945, + "learning_rate": 2.446881541207868e-07, + "loss": 0.9826, + "step": 8172 + }, + { + "epoch": 0.8595580212181366, + "grad_norm": 1.942667199406407, + "learning_rate": 2.4432807586462214e-07, + "loss": 0.9746, + "step": 8173 + }, + { + "epoch": 0.859663191659984, + "grad_norm": 2.590230625555086, + "learning_rate": 2.4396824913339946e-07, + "loss": 0.9675, + "step": 8174 + }, + { + "epoch": 0.8597683621018313, + "grad_norm": 2.544725569198336, + "learning_rate": 2.436086739672433e-07, + "loss": 0.9573, + "step": 8175 + }, + { + "epoch": 0.8598735325436786, + "grad_norm": 2.875401976015427, + "learning_rate": 2.4324935040624747e-07, + "loss": 0.9746, + "step": 8176 + }, + { + "epoch": 0.8599787029855259, + "grad_norm": 2.330210767863903, + "learning_rate": 2.4289027849048076e-07, + "loss": 0.9509, + "step": 8177 + }, + { + "epoch": 0.8600838734273732, + "grad_norm": 3.8731187149712456, + "learning_rate": 2.4253145825998134e-07, + "loss": 0.972, + "step": 8178 + }, + { + "epoch": 0.8601890438692206, + "grad_norm": 2.4303181220029746, + "learning_rate": 2.4217288975476094e-07, + "loss": 0.9954, + "step": 8179 + }, + { + "epoch": 0.8602942143110679, + "grad_norm": 1.9710335180041265, + "learning_rate": 2.418145730148033e-07, + "loss": 0.957, + "step": 8180 + }, + { + "epoch": 0.8603993847529152, + "grad_norm": 2.7525905486987288, + "learning_rate": 2.414565080800624e-07, + "loss": 0.9754, + "step": 8181 + }, + { + "epoch": 0.8605045551947625, + "grad_norm": 3.292704941422295, + "learning_rate": 2.4109869499046647e-07, + "loss": 0.9751, + "step": 8182 + }, + { + "epoch": 0.8606097256366099, + "grad_norm": 2.165266902747974, + "learning_rate": 2.4074113378591254e-07, + "loss": 0.9757, + "step": 8183 + }, + { + "epoch": 0.8607148960784572, + "grad_norm": 2.4020231317592806, + "learning_rate": 2.4038382450627396e-07, + "loss": 0.9993, + "step": 8184 + }, + { + "epoch": 0.8608200665203045, + "grad_norm": 2.5604039867867816, + "learning_rate": 2.400267671913917e-07, + "loss": 1.0048, + "step": 8185 + }, + { + "epoch": 0.8609252369621518, + "grad_norm": 1.8354917739415633, + "learning_rate": 2.3966996188108133e-07, + "loss": 0.9127, + "step": 8186 + }, + { + "epoch": 0.861030407403999, + "grad_norm": 2.865868216648736, + "learning_rate": 2.3931340861512884e-07, + "loss": 0.9865, + "step": 8187 + }, + { + "epoch": 0.8611355778458464, + "grad_norm": 1.9746436738041568, + "learning_rate": 2.389571074332928e-07, + "loss": 0.9466, + "step": 8188 + }, + { + "epoch": 0.8612407482876937, + "grad_norm": 3.3252842390412796, + "learning_rate": 2.38601058375304e-07, + "loss": 1.0068, + "step": 8189 + }, + 
{ + "epoch": 0.861345918729541, + "grad_norm": 2.847887785313545, + "learning_rate": 2.382452614808642e-07, + "loss": 0.999, + "step": 8190 + }, + { + "epoch": 0.8614510891713884, + "grad_norm": 1.955383199190147, + "learning_rate": 2.3788971678964794e-07, + "loss": 0.9989, + "step": 8191 + }, + { + "epoch": 0.8615562596132357, + "grad_norm": 2.285791537624384, + "learning_rate": 2.3753442434129998e-07, + "loss": 0.9703, + "step": 8192 + }, + { + "epoch": 0.861661430055083, + "grad_norm": 2.6866877107259026, + "learning_rate": 2.3717938417543995e-07, + "loss": 0.9215, + "step": 8193 + }, + { + "epoch": 0.8617666004969303, + "grad_norm": 2.7588675656238677, + "learning_rate": 2.3682459633165704e-07, + "loss": 0.974, + "step": 8194 + }, + { + "epoch": 0.8618717709387776, + "grad_norm": 3.3460239543274843, + "learning_rate": 2.3647006084951214e-07, + "loss": 0.9426, + "step": 8195 + }, + { + "epoch": 0.861976941380625, + "grad_norm": 2.3650104551290685, + "learning_rate": 2.3611577776853965e-07, + "loss": 0.9484, + "step": 8196 + }, + { + "epoch": 0.8620821118224723, + "grad_norm": 2.834027735071727, + "learning_rate": 2.3576174712824335e-07, + "loss": 0.9879, + "step": 8197 + }, + { + "epoch": 0.8621872822643196, + "grad_norm": 2.2266254740819966, + "learning_rate": 2.354079689681024e-07, + "loss": 1.0312, + "step": 8198 + }, + { + "epoch": 0.862292452706167, + "grad_norm": 2.550002403776596, + "learning_rate": 2.350544433275645e-07, + "loss": 0.9572, + "step": 8199 + }, + { + "epoch": 0.8623976231480143, + "grad_norm": 3.3029388049198825, + "learning_rate": 2.3470117024605138e-07, + "loss": 0.9695, + "step": 8200 + }, + { + "epoch": 0.8625027935898616, + "grad_norm": 2.7620071406830635, + "learning_rate": 2.3434814976295462e-07, + "loss": 0.9729, + "step": 8201 + }, + { + "epoch": 0.8626079640317089, + "grad_norm": 2.279717835201499, + "learning_rate": 2.3399538191763937e-07, + "loss": 1.0069, + "step": 8202 + }, + { + "epoch": 0.8627131344735562, + "grad_norm": 2.509494166741847, + "learning_rate": 2.3364286674944254e-07, + "loss": 0.9409, + "step": 8203 + }, + { + "epoch": 0.8628183049154036, + "grad_norm": 2.7206541859104267, + "learning_rate": 2.3329060429767124e-07, + "loss": 0.9442, + "step": 8204 + }, + { + "epoch": 0.8629234753572509, + "grad_norm": 2.372820235661989, + "learning_rate": 2.329385946016066e-07, + "loss": 0.9734, + "step": 8205 + }, + { + "epoch": 0.8630286457990982, + "grad_norm": 2.6935312672829874, + "learning_rate": 2.325868377004986e-07, + "loss": 0.9754, + "step": 8206 + }, + { + "epoch": 0.8631338162409454, + "grad_norm": 2.1552030244555587, + "learning_rate": 2.3223533363357308e-07, + "loss": 0.9789, + "step": 8207 + }, + { + "epoch": 0.8632389866827928, + "grad_norm": 2.412292284565908, + "learning_rate": 2.3188408244002424e-07, + "loss": 0.9809, + "step": 8208 + }, + { + "epoch": 0.8633441571246401, + "grad_norm": 2.088983007031558, + "learning_rate": 2.3153308415901993e-07, + "loss": 1.0039, + "step": 8209 + }, + { + "epoch": 0.8634493275664874, + "grad_norm": 2.4663761465549547, + "learning_rate": 2.3118233882969854e-07, + "loss": 1.0072, + "step": 8210 + }, + { + "epoch": 0.8635544980083347, + "grad_norm": 2.147148470340049, + "learning_rate": 2.3083184649117046e-07, + "loss": 0.9939, + "step": 8211 + }, + { + "epoch": 0.863659668450182, + "grad_norm": 2.709658500142506, + "learning_rate": 2.3048160718252e-07, + "loss": 0.9834, + "step": 8212 + }, + { + "epoch": 0.8637648388920294, + "grad_norm": 2.7515549275388804, + "learning_rate": 
2.3013162094279977e-07, + "loss": 1.009, + "step": 8213 + }, + { + "epoch": 0.8638700093338767, + "grad_norm": 2.999999695153477, + "learning_rate": 2.297818878110375e-07, + "loss": 0.918, + "step": 8214 + }, + { + "epoch": 0.863975179775724, + "grad_norm": 2.8176326824155358, + "learning_rate": 2.2943240782623e-07, + "loss": 0.9514, + "step": 8215 + }, + { + "epoch": 0.8640803502175713, + "grad_norm": 2.673202502494635, + "learning_rate": 2.2908318102734724e-07, + "loss": 1.003, + "step": 8216 + }, + { + "epoch": 0.8641855206594187, + "grad_norm": 2.9697524493518253, + "learning_rate": 2.2873420745333163e-07, + "loss": 0.9965, + "step": 8217 + }, + { + "epoch": 0.864290691101266, + "grad_norm": 2.3065886709900365, + "learning_rate": 2.283854871430949e-07, + "loss": 0.9499, + "step": 8218 + }, + { + "epoch": 0.8643958615431133, + "grad_norm": 2.3908258377099565, + "learning_rate": 2.2803702013552364e-07, + "loss": 0.9679, + "step": 8219 + }, + { + "epoch": 0.8645010319849606, + "grad_norm": 2.586564468311502, + "learning_rate": 2.2768880646947268e-07, + "loss": 0.9946, + "step": 8220 + }, + { + "epoch": 0.864606202426808, + "grad_norm": 3.179651943448657, + "learning_rate": 2.2734084618377285e-07, + "loss": 1.0029, + "step": 8221 + }, + { + "epoch": 0.8647113728686553, + "grad_norm": 3.5059554764194156, + "learning_rate": 2.2699313931722284e-07, + "loss": 1.0056, + "step": 8222 + }, + { + "epoch": 0.8648165433105026, + "grad_norm": 2.3482968719594823, + "learning_rate": 2.2664568590859525e-07, + "loss": 0.9858, + "step": 8223 + }, + { + "epoch": 0.8649217137523499, + "grad_norm": 2.786091616547652, + "learning_rate": 2.2629848599663357e-07, + "loss": 0.9675, + "step": 8224 + }, + { + "epoch": 0.8650268841941973, + "grad_norm": 2.151802230862855, + "learning_rate": 2.2595153962005345e-07, + "loss": 1.0044, + "step": 8225 + }, + { + "epoch": 0.8651320546360446, + "grad_norm": 2.2954772594860793, + "learning_rate": 2.2560484681754258e-07, + "loss": 1.0249, + "step": 8226 + }, + { + "epoch": 0.8652372250778919, + "grad_norm": 2.4848587884057065, + "learning_rate": 2.2525840762775863e-07, + "loss": 0.9688, + "step": 8227 + }, + { + "epoch": 0.8653423955197391, + "grad_norm": 2.825905090752021, + "learning_rate": 2.2491222208933377e-07, + "loss": 1.004, + "step": 8228 + }, + { + "epoch": 0.8654475659615865, + "grad_norm": 2.6469755844197635, + "learning_rate": 2.245662902408688e-07, + "loss": 0.999, + "step": 8229 + }, + { + "epoch": 0.8655527364034338, + "grad_norm": 3.240033136066609, + "learning_rate": 2.2422061212093947e-07, + "loss": 0.9899, + "step": 8230 + }, + { + "epoch": 0.8656579068452811, + "grad_norm": 2.8389209244754787, + "learning_rate": 2.238751877680906e-07, + "loss": 1.0278, + "step": 8231 + }, + { + "epoch": 0.8657630772871284, + "grad_norm": 2.6429507672792374, + "learning_rate": 2.2353001722083962e-07, + "loss": 0.986, + "step": 8232 + }, + { + "epoch": 0.8658682477289757, + "grad_norm": 2.0796610902282047, + "learning_rate": 2.2318510051767632e-07, + "loss": 0.9746, + "step": 8233 + }, + { + "epoch": 0.8659734181708231, + "grad_norm": 2.1854334963011137, + "learning_rate": 2.2284043769706026e-07, + "loss": 0.9798, + "step": 8234 + }, + { + "epoch": 0.8660785886126704, + "grad_norm": 2.154239959742983, + "learning_rate": 2.2249602879742594e-07, + "loss": 0.9868, + "step": 8235 + }, + { + "epoch": 0.8661837590545177, + "grad_norm": 2.6042428551850563, + "learning_rate": 2.2215187385717623e-07, + "loss": 1.0048, + "step": 8236 + }, + { + "epoch": 0.866288929496365, + 
"grad_norm": 2.3168200385005435, + "learning_rate": 2.2180797291468825e-07, + "loss": 0.9696, + "step": 8237 + }, + { + "epoch": 0.8663940999382124, + "grad_norm": 2.612002220024735, + "learning_rate": 2.2146432600830799e-07, + "loss": 0.9707, + "step": 8238 + }, + { + "epoch": 0.8664992703800597, + "grad_norm": 2.7310391889877494, + "learning_rate": 2.2112093317635587e-07, + "loss": 0.9626, + "step": 8239 + }, + { + "epoch": 0.866604440821907, + "grad_norm": 2.1973508769452152, + "learning_rate": 2.207777944571232e-07, + "loss": 0.9735, + "step": 8240 + }, + { + "epoch": 0.8667096112637543, + "grad_norm": 2.184444168526134, + "learning_rate": 2.2043490988887168e-07, + "loss": 0.9651, + "step": 8241 + }, + { + "epoch": 0.8668147817056017, + "grad_norm": 1.7840753769682516, + "learning_rate": 2.200922795098362e-07, + "loss": 0.9442, + "step": 8242 + }, + { + "epoch": 0.866919952147449, + "grad_norm": 2.612245040597428, + "learning_rate": 2.1974990335822179e-07, + "loss": 0.9973, + "step": 8243 + }, + { + "epoch": 0.8670251225892963, + "grad_norm": 2.7915387255241346, + "learning_rate": 2.1940778147220758e-07, + "loss": 0.9892, + "step": 8244 + }, + { + "epoch": 0.8671302930311436, + "grad_norm": 2.104198732577539, + "learning_rate": 2.1906591388994148e-07, + "loss": 0.9436, + "step": 8245 + }, + { + "epoch": 0.867235463472991, + "grad_norm": 2.381480939039506, + "learning_rate": 2.187243006495454e-07, + "loss": 1.0353, + "step": 8246 + }, + { + "epoch": 0.8673406339148383, + "grad_norm": 2.7229634312983917, + "learning_rate": 2.1838294178911146e-07, + "loss": 1.0069, + "step": 8247 + }, + { + "epoch": 0.8674458043566855, + "grad_norm": 2.3789881685811154, + "learning_rate": 2.1804183734670277e-07, + "loss": 0.9485, + "step": 8248 + }, + { + "epoch": 0.8675509747985328, + "grad_norm": 2.5490016088242125, + "learning_rate": 2.1770098736035694e-07, + "loss": 0.9878, + "step": 8249 + }, + { + "epoch": 0.8676561452403801, + "grad_norm": 2.8862833601914764, + "learning_rate": 2.1736039186808e-07, + "loss": 0.988, + "step": 8250 + }, + { + "epoch": 0.8677613156822275, + "grad_norm": 2.4021252306313747, + "learning_rate": 2.170200509078521e-07, + "loss": 0.9925, + "step": 8251 + }, + { + "epoch": 0.8678664861240748, + "grad_norm": 2.489969509776828, + "learning_rate": 2.166799645176229e-07, + "loss": 1.0052, + "step": 8252 + }, + { + "epoch": 0.8679716565659221, + "grad_norm": 1.9153232869677428, + "learning_rate": 2.1634013273531512e-07, + "loss": 0.9971, + "step": 8253 + }, + { + "epoch": 0.8680768270077694, + "grad_norm": 2.0476262837462156, + "learning_rate": 2.160005555988229e-07, + "loss": 0.9905, + "step": 8254 + }, + { + "epoch": 0.8681819974496168, + "grad_norm": 3.0949990892144665, + "learning_rate": 2.1566123314601118e-07, + "loss": 0.9642, + "step": 8255 + }, + { + "epoch": 0.8682871678914641, + "grad_norm": 2.359080168169369, + "learning_rate": 2.1532216541471778e-07, + "loss": 1.0135, + "step": 8256 + }, + { + "epoch": 0.8683923383333114, + "grad_norm": 1.66683761103825, + "learning_rate": 2.1498335244275e-07, + "loss": 0.9266, + "step": 8257 + }, + { + "epoch": 0.8684975087751587, + "grad_norm": 2.133569622507562, + "learning_rate": 2.1464479426789005e-07, + "loss": 0.961, + "step": 8258 + }, + { + "epoch": 0.8686026792170061, + "grad_norm": 2.588002312127146, + "learning_rate": 2.143064909278883e-07, + "loss": 0.9796, + "step": 8259 + }, + { + "epoch": 0.8687078496588534, + "grad_norm": 3.192126932345157, + "learning_rate": 2.1396844246046904e-07, + "loss": 1.006, + "step": 8260 + 
}, + { + "epoch": 0.8688130201007007, + "grad_norm": 2.4896171740508946, + "learning_rate": 2.1363064890332658e-07, + "loss": 0.9849, + "step": 8261 + }, + { + "epoch": 0.868918190542548, + "grad_norm": 2.7731731799976305, + "learning_rate": 2.1329311029412796e-07, + "loss": 0.9458, + "step": 8262 + }, + { + "epoch": 0.8690233609843954, + "grad_norm": 2.722330994924211, + "learning_rate": 2.1295582667051173e-07, + "loss": 1.0028, + "step": 8263 + }, + { + "epoch": 0.8691285314262427, + "grad_norm": 2.1961408410319008, + "learning_rate": 2.1261879807008667e-07, + "loss": 1.0216, + "step": 8264 + }, + { + "epoch": 0.86923370186809, + "grad_norm": 2.123750174696564, + "learning_rate": 2.1228202453043522e-07, + "loss": 1.01, + "step": 8265 + }, + { + "epoch": 0.8693388723099373, + "grad_norm": 2.6809762221376463, + "learning_rate": 2.1194550608910902e-07, + "loss": 1.0123, + "step": 8266 + }, + { + "epoch": 0.8694440427517847, + "grad_norm": 2.5457872775140986, + "learning_rate": 2.1160924278363333e-07, + "loss": 0.9877, + "step": 8267 + }, + { + "epoch": 0.8695492131936319, + "grad_norm": 2.085122359184593, + "learning_rate": 2.1127323465150422e-07, + "loss": 0.9743, + "step": 8268 + }, + { + "epoch": 0.8696543836354792, + "grad_norm": 2.146743192267201, + "learning_rate": 2.1093748173018846e-07, + "loss": 0.9686, + "step": 8269 + }, + { + "epoch": 0.8697595540773265, + "grad_norm": 2.3756277033536812, + "learning_rate": 2.106019840571255e-07, + "loss": 0.954, + "step": 8270 + }, + { + "epoch": 0.8698647245191738, + "grad_norm": 1.9419516360185134, + "learning_rate": 2.1026674166972627e-07, + "loss": 0.9669, + "step": 8271 + }, + { + "epoch": 0.8699698949610212, + "grad_norm": 3.0111258367329397, + "learning_rate": 2.099317546053728e-07, + "loss": 1.0088, + "step": 8272 + }, + { + "epoch": 0.8700750654028685, + "grad_norm": 2.3215678619891364, + "learning_rate": 2.09597022901418e-07, + "loss": 0.9994, + "step": 8273 + }, + { + "epoch": 0.8701802358447158, + "grad_norm": 2.866407727751618, + "learning_rate": 2.0926254659518835e-07, + "loss": 0.9804, + "step": 8274 + }, + { + "epoch": 0.8702854062865631, + "grad_norm": 2.189238826195812, + "learning_rate": 2.0892832572397935e-07, + "loss": 0.9907, + "step": 8275 + }, + { + "epoch": 0.8703905767284105, + "grad_norm": 1.9925032781875867, + "learning_rate": 2.0859436032505954e-07, + "loss": 1.0121, + "step": 8276 + }, + { + "epoch": 0.8704957471702578, + "grad_norm": 2.0387873295462744, + "learning_rate": 2.0826065043566935e-07, + "loss": 0.9812, + "step": 8277 + }, + { + "epoch": 0.8706009176121051, + "grad_norm": 1.838300067613648, + "learning_rate": 2.0792719609301904e-07, + "loss": 0.9273, + "step": 8278 + }, + { + "epoch": 0.8707060880539524, + "grad_norm": 2.587747678789586, + "learning_rate": 2.075939973342922e-07, + "loss": 0.9786, + "step": 8279 + }, + { + "epoch": 0.8708112584957998, + "grad_norm": 3.083618118123322, + "learning_rate": 2.0726105419664188e-07, + "loss": 0.9509, + "step": 8280 + }, + { + "epoch": 0.8709164289376471, + "grad_norm": 2.1215733170081488, + "learning_rate": 2.0692836671719536e-07, + "loss": 0.9764, + "step": 8281 + }, + { + "epoch": 0.8710215993794944, + "grad_norm": 2.305156580274503, + "learning_rate": 2.065959349330493e-07, + "loss": 1.0078, + "step": 8282 + }, + { + "epoch": 0.8711267698213417, + "grad_norm": 2.6860204494280153, + "learning_rate": 2.0626375888127187e-07, + "loss": 0.996, + "step": 8283 + }, + { + "epoch": 0.8712319402631891, + "grad_norm": 2.897921994981041, + "learning_rate": 
2.0593183859890369e-07, + "loss": 0.9731, + "step": 8284 + }, + { + "epoch": 0.8713371107050364, + "grad_norm": 3.5559919783151974, + "learning_rate": 2.0560017412295658e-07, + "loss": 0.9663, + "step": 8285 + }, + { + "epoch": 0.8714422811468837, + "grad_norm": 2.4981822525310826, + "learning_rate": 2.0526876549041368e-07, + "loss": 0.9962, + "step": 8286 + }, + { + "epoch": 0.871547451588731, + "grad_norm": 2.1707524269042926, + "learning_rate": 2.0493761273822937e-07, + "loss": 1.0296, + "step": 8287 + }, + { + "epoch": 0.8716526220305784, + "grad_norm": 1.9671933144654712, + "learning_rate": 2.046067159033302e-07, + "loss": 1.0113, + "step": 8288 + }, + { + "epoch": 0.8717577924724256, + "grad_norm": 1.8601153938979094, + "learning_rate": 2.0427607502261303e-07, + "loss": 1.0016, + "step": 8289 + }, + { + "epoch": 0.8718629629142729, + "grad_norm": 3.164956063667044, + "learning_rate": 2.039456901329473e-07, + "loss": 1.0117, + "step": 8290 + }, + { + "epoch": 0.8719681333561202, + "grad_norm": 2.629745086363934, + "learning_rate": 2.0361556127117404e-07, + "loss": 0.9834, + "step": 8291 + }, + { + "epoch": 0.8720733037979675, + "grad_norm": 2.1561302946827907, + "learning_rate": 2.0328568847410413e-07, + "loss": 0.9296, + "step": 8292 + }, + { + "epoch": 0.8721784742398149, + "grad_norm": 2.281348519655876, + "learning_rate": 2.0295607177852146e-07, + "loss": 0.9825, + "step": 8293 + }, + { + "epoch": 0.8722836446816622, + "grad_norm": 3.1673280335921876, + "learning_rate": 2.0262671122118078e-07, + "loss": 1.0055, + "step": 8294 + }, + { + "epoch": 0.8723888151235095, + "grad_norm": 2.292148446422658, + "learning_rate": 2.0229760683880884e-07, + "loss": 0.9556, + "step": 8295 + }, + { + "epoch": 0.8724939855653568, + "grad_norm": 3.1357836739117175, + "learning_rate": 2.0196875866810266e-07, + "loss": 1.0117, + "step": 8296 + }, + { + "epoch": 0.8725991560072042, + "grad_norm": 2.519946732312392, + "learning_rate": 2.0164016674573185e-07, + "loss": 0.9937, + "step": 8297 + }, + { + "epoch": 0.8727043264490515, + "grad_norm": 2.564131547407527, + "learning_rate": 2.0131183110833646e-07, + "loss": 0.9503, + "step": 8298 + }, + { + "epoch": 0.8728094968908988, + "grad_norm": 2.280967440361249, + "learning_rate": 2.0098375179252867e-07, + "loss": 0.9791, + "step": 8299 + }, + { + "epoch": 0.8729146673327461, + "grad_norm": 2.354741913861808, + "learning_rate": 2.006559288348922e-07, + "loss": 0.9536, + "step": 8300 + }, + { + "epoch": 0.8730198377745935, + "grad_norm": 3.5165919413572926, + "learning_rate": 2.003283622719815e-07, + "loss": 0.9616, + "step": 8301 + }, + { + "epoch": 0.8731250082164408, + "grad_norm": 2.355749665234238, + "learning_rate": 2.0000105214032313e-07, + "loss": 0.9959, + "step": 8302 + }, + { + "epoch": 0.8732301786582881, + "grad_norm": 2.5396746122652822, + "learning_rate": 1.99673998476414e-07, + "loss": 0.9603, + "step": 8303 + }, + { + "epoch": 0.8733353491001354, + "grad_norm": 2.314335599258069, + "learning_rate": 1.9934720131672357e-07, + "loss": 0.9933, + "step": 8304 + }, + { + "epoch": 0.8734405195419828, + "grad_norm": 2.4455577788280523, + "learning_rate": 1.99020660697693e-07, + "loss": 1.0243, + "step": 8305 + }, + { + "epoch": 0.8735456899838301, + "grad_norm": 2.393852124530319, + "learning_rate": 1.9869437665573255e-07, + "loss": 0.9855, + "step": 8306 + }, + { + "epoch": 0.8736508604256774, + "grad_norm": 1.9162658676831326, + "learning_rate": 1.9836834922722648e-07, + "loss": 0.9762, + "step": 8307 + }, + { + "epoch": 0.8737560308675247, + 
"grad_norm": 2.6740161604492907, + "learning_rate": 1.980425784485293e-07, + "loss": 0.9674, + "step": 8308 + }, + { + "epoch": 0.873861201309372, + "grad_norm": 2.380114295557652, + "learning_rate": 1.97717064355967e-07, + "loss": 0.999, + "step": 8309 + }, + { + "epoch": 0.8739663717512193, + "grad_norm": 3.1166913813516883, + "learning_rate": 1.973918069858366e-07, + "loss": 0.9538, + "step": 8310 + }, + { + "epoch": 0.8740715421930666, + "grad_norm": 2.807493356835643, + "learning_rate": 1.9706680637440745e-07, + "loss": 0.998, + "step": 8311 + }, + { + "epoch": 0.8741767126349139, + "grad_norm": 1.8540453587981012, + "learning_rate": 1.9674206255791862e-07, + "loss": 0.9781, + "step": 8312 + }, + { + "epoch": 0.8742818830767612, + "grad_norm": 2.297351573084357, + "learning_rate": 1.9641757557258223e-07, + "loss": 1.0059, + "step": 8313 + }, + { + "epoch": 0.8743870535186086, + "grad_norm": 2.3795558043590797, + "learning_rate": 1.9609334545458132e-07, + "loss": 0.9783, + "step": 8314 + }, + { + "epoch": 0.8744922239604559, + "grad_norm": 2.108673183419314, + "learning_rate": 1.9576937224006942e-07, + "loss": 0.9516, + "step": 8315 + }, + { + "epoch": 0.8745973944023032, + "grad_norm": 1.9087942867413161, + "learning_rate": 1.954456559651724e-07, + "loss": 0.9661, + "step": 8316 + }, + { + "epoch": 0.8747025648441505, + "grad_norm": 2.3081851420364266, + "learning_rate": 1.9512219666598774e-07, + "loss": 0.9554, + "step": 8317 + }, + { + "epoch": 0.8748077352859979, + "grad_norm": 2.9514737071246038, + "learning_rate": 1.9479899437858246e-07, + "loss": 1.0265, + "step": 8318 + }, + { + "epoch": 0.8749129057278452, + "grad_norm": 2.74991764931712, + "learning_rate": 1.9447604913899715e-07, + "loss": 0.9832, + "step": 8319 + }, + { + "epoch": 0.8750180761696925, + "grad_norm": 3.2909066831788985, + "learning_rate": 1.941533609832419e-07, + "loss": 1.0034, + "step": 8320 + }, + { + "epoch": 0.8751232466115398, + "grad_norm": 2.9061285090024063, + "learning_rate": 1.9383092994729956e-07, + "loss": 0.9832, + "step": 8321 + }, + { + "epoch": 0.8752284170533872, + "grad_norm": 2.0846159205628747, + "learning_rate": 1.935087560671231e-07, + "loss": 0.9958, + "step": 8322 + }, + { + "epoch": 0.8753335874952345, + "grad_norm": 2.3654315207259167, + "learning_rate": 1.9318683937863846e-07, + "loss": 0.9666, + "step": 8323 + }, + { + "epoch": 0.8754387579370818, + "grad_norm": 2.3194539073632963, + "learning_rate": 1.9286517991774084e-07, + "loss": 0.9446, + "step": 8324 + }, + { + "epoch": 0.8755439283789291, + "grad_norm": 2.3802207813099314, + "learning_rate": 1.9254377772029847e-07, + "loss": 0.9544, + "step": 8325 + }, + { + "epoch": 0.8756490988207765, + "grad_norm": 1.8571233807580478, + "learning_rate": 1.9222263282214908e-07, + "loss": 0.9913, + "step": 8326 + }, + { + "epoch": 0.8757542692626238, + "grad_norm": 3.2909851431362447, + "learning_rate": 1.9190174525910404e-07, + "loss": 0.9587, + "step": 8327 + }, + { + "epoch": 0.8758594397044711, + "grad_norm": 2.4571652278790554, + "learning_rate": 1.9158111506694442e-07, + "loss": 0.9733, + "step": 8328 + }, + { + "epoch": 0.8759646101463183, + "grad_norm": 2.497778471844159, + "learning_rate": 1.9126074228142278e-07, + "loss": 0.9895, + "step": 8329 + }, + { + "epoch": 0.8760697805881656, + "grad_norm": 2.9691315953675703, + "learning_rate": 1.9094062693826298e-07, + "loss": 1.0181, + "step": 8330 + }, + { + "epoch": 0.876174951030013, + "grad_norm": 2.5598658628083664, + "learning_rate": 1.9062076907316097e-07, + "loss": 1.0124, + 
"step": 8331 + }, + { + "epoch": 0.8762801214718603, + "grad_norm": 3.447326481914661, + "learning_rate": 1.9030116872178317e-07, + "loss": 1.0096, + "step": 8332 + }, + { + "epoch": 0.8763852919137076, + "grad_norm": 2.4686568055463027, + "learning_rate": 1.8998182591976776e-07, + "loss": 1.0027, + "step": 8333 + }, + { + "epoch": 0.876490462355555, + "grad_norm": 3.8447359211448853, + "learning_rate": 1.8966274070272294e-07, + "loss": 1.0149, + "step": 8334 + }, + { + "epoch": 0.8765956327974023, + "grad_norm": 2.5490921354790927, + "learning_rate": 1.8934391310622996e-07, + "loss": 0.9944, + "step": 8335 + }, + { + "epoch": 0.8767008032392496, + "grad_norm": 3.1163782715259734, + "learning_rate": 1.8902534316584065e-07, + "loss": 0.9888, + "step": 8336 + }, + { + "epoch": 0.8768059736810969, + "grad_norm": 2.157509845767865, + "learning_rate": 1.88707030917078e-07, + "loss": 0.9813, + "step": 8337 + }, + { + "epoch": 0.8769111441229442, + "grad_norm": 2.3749513405972773, + "learning_rate": 1.883889763954358e-07, + "loss": 1.0061, + "step": 8338 + }, + { + "epoch": 0.8770163145647916, + "grad_norm": 3.006163202985406, + "learning_rate": 1.8807117963637988e-07, + "loss": 0.9441, + "step": 8339 + }, + { + "epoch": 0.8771214850066389, + "grad_norm": 2.940885641311102, + "learning_rate": 1.8775364067534773e-07, + "loss": 0.9773, + "step": 8340 + }, + { + "epoch": 0.8772266554484862, + "grad_norm": 2.95414648550892, + "learning_rate": 1.874363595477463e-07, + "loss": 1.0312, + "step": 8341 + }, + { + "epoch": 0.8773318258903335, + "grad_norm": 1.881654030719675, + "learning_rate": 1.871193362889559e-07, + "loss": 0.9565, + "step": 8342 + }, + { + "epoch": 0.8774369963321809, + "grad_norm": 2.7330908747232567, + "learning_rate": 1.8680257093432603e-07, + "loss": 0.9809, + "step": 8343 + }, + { + "epoch": 0.8775421667740282, + "grad_norm": 2.3063931397393067, + "learning_rate": 1.8648606351917925e-07, + "loss": 0.9502, + "step": 8344 + }, + { + "epoch": 0.8776473372158755, + "grad_norm": 2.6553680130796127, + "learning_rate": 1.8616981407880818e-07, + "loss": 1.0175, + "step": 8345 + }, + { + "epoch": 0.8777525076577228, + "grad_norm": 2.198765623782212, + "learning_rate": 1.8585382264847795e-07, + "loss": 0.9657, + "step": 8346 + }, + { + "epoch": 0.8778576780995702, + "grad_norm": 2.2406067363185103, + "learning_rate": 1.8553808926342286e-07, + "loss": 0.9648, + "step": 8347 + }, + { + "epoch": 0.8779628485414175, + "grad_norm": 2.6426330128760926, + "learning_rate": 1.852226139588506e-07, + "loss": 0.9899, + "step": 8348 + }, + { + "epoch": 0.8780680189832648, + "grad_norm": 2.241496860388617, + "learning_rate": 1.849073967699383e-07, + "loss": 1.0067, + "step": 8349 + }, + { + "epoch": 0.878173189425112, + "grad_norm": 2.481058599968573, + "learning_rate": 1.845924377318356e-07, + "loss": 0.9788, + "step": 8350 + }, + { + "epoch": 0.8782783598669593, + "grad_norm": 2.046724061828166, + "learning_rate": 1.8427773687966304e-07, + "loss": 1.0109, + "step": 8351 + }, + { + "epoch": 0.8783835303088067, + "grad_norm": 2.376223438068095, + "learning_rate": 1.8396329424851174e-07, + "loss": 0.9638, + "step": 8352 + }, + { + "epoch": 0.878488700750654, + "grad_norm": 2.2632885387151354, + "learning_rate": 1.836491098734447e-07, + "loss": 1.0035, + "step": 8353 + }, + { + "epoch": 0.8785938711925013, + "grad_norm": 2.655319309933802, + "learning_rate": 1.8333518378949617e-07, + "loss": 0.997, + "step": 8354 + }, + { + "epoch": 0.8786990416343486, + "grad_norm": 2.788664034342027, + "learning_rate": 
1.830215160316709e-07, + "loss": 0.9811, + "step": 8355 + }, + { + "epoch": 0.878804212076196, + "grad_norm": 2.4388561721428292, + "learning_rate": 1.8270810663494591e-07, + "loss": 0.9727, + "step": 8356 + }, + { + "epoch": 0.8789093825180433, + "grad_norm": 3.2497459609934944, + "learning_rate": 1.8239495563426802e-07, + "loss": 0.9763, + "step": 8357 + }, + { + "epoch": 0.8790145529598906, + "grad_norm": 2.7302020542205385, + "learning_rate": 1.8208206306455616e-07, + "loss": 0.9581, + "step": 8358 + }, + { + "epoch": 0.8791197234017379, + "grad_norm": 2.96775947443148, + "learning_rate": 1.8176942896070083e-07, + "loss": 0.9633, + "step": 8359 + }, + { + "epoch": 0.8792248938435853, + "grad_norm": 2.3368503352333776, + "learning_rate": 1.8145705335756298e-07, + "loss": 1.0008, + "step": 8360 + }, + { + "epoch": 0.8793300642854326, + "grad_norm": 2.601467656711171, + "learning_rate": 1.8114493628997448e-07, + "loss": 0.9852, + "step": 8361 + }, + { + "epoch": 0.8794352347272799, + "grad_norm": 2.0109472927516876, + "learning_rate": 1.808330777927389e-07, + "loss": 0.9794, + "step": 8362 + }, + { + "epoch": 0.8795404051691272, + "grad_norm": 2.457219484542861, + "learning_rate": 1.8052147790063146e-07, + "loss": 0.9961, + "step": 8363 + }, + { + "epoch": 0.8796455756109746, + "grad_norm": 2.36559880573984, + "learning_rate": 1.8021013664839744e-07, + "loss": 1.0001, + "step": 8364 + }, + { + "epoch": 0.8797507460528219, + "grad_norm": 2.0670040655202695, + "learning_rate": 1.7989905407075404e-07, + "loss": 0.9953, + "step": 8365 + }, + { + "epoch": 0.8798559164946692, + "grad_norm": 2.4646404767978787, + "learning_rate": 1.795882302023891e-07, + "loss": 0.9685, + "step": 8366 + }, + { + "epoch": 0.8799610869365165, + "grad_norm": 2.0767215820236795, + "learning_rate": 1.7927766507796208e-07, + "loss": 0.9679, + "step": 8367 + }, + { + "epoch": 0.8800662573783639, + "grad_norm": 2.3891164659375628, + "learning_rate": 1.7896735873210364e-07, + "loss": 0.9736, + "step": 8368 + }, + { + "epoch": 0.8801714278202112, + "grad_norm": 3.110884943284728, + "learning_rate": 1.7865731119941498e-07, + "loss": 0.9882, + "step": 8369 + }, + { + "epoch": 0.8802765982620584, + "grad_norm": 2.5430378783802707, + "learning_rate": 1.78347522514469e-07, + "loss": 0.9486, + "step": 8370 + }, + { + "epoch": 0.8803817687039057, + "grad_norm": 2.3703437804084424, + "learning_rate": 1.780379927118095e-07, + "loss": 0.931, + "step": 8371 + }, + { + "epoch": 0.880486939145753, + "grad_norm": 2.2176388854921583, + "learning_rate": 1.7772872182595102e-07, + "loss": 0.961, + "step": 8372 + }, + { + "epoch": 0.8805921095876004, + "grad_norm": 2.930906441515779, + "learning_rate": 1.7741970989138046e-07, + "loss": 0.9894, + "step": 8373 + }, + { + "epoch": 0.8806972800294477, + "grad_norm": 2.376144556185852, + "learning_rate": 1.771109569425547e-07, + "loss": 0.9908, + "step": 8374 + }, + { + "epoch": 0.880802450471295, + "grad_norm": 2.3563178048249656, + "learning_rate": 1.7680246301390202e-07, + "loss": 0.9782, + "step": 8375 + }, + { + "epoch": 0.8809076209131423, + "grad_norm": 2.407079080129235, + "learning_rate": 1.7649422813982187e-07, + "loss": 1.0058, + "step": 8376 + }, + { + "epoch": 0.8810127913549897, + "grad_norm": 2.6815694041139504, + "learning_rate": 1.7618625235468507e-07, + "loss": 0.9664, + "step": 8377 + }, + { + "epoch": 0.881117961796837, + "grad_norm": 3.395370750987973, + "learning_rate": 1.7587853569283302e-07, + "loss": 1.0168, + "step": 8378 + }, + { + "epoch": 0.8812231322386843, + 
"grad_norm": 2.2069125097551283, + "learning_rate": 1.7557107818857889e-07, + "loss": 0.9825, + "step": 8379 + }, + { + "epoch": 0.8813283026805316, + "grad_norm": 3.2766113793720266, + "learning_rate": 1.7526387987620602e-07, + "loss": 1.0089, + "step": 8380 + }, + { + "epoch": 0.881433473122379, + "grad_norm": 2.811898086555635, + "learning_rate": 1.7495694078996984e-07, + "loss": 1.0126, + "step": 8381 + }, + { + "epoch": 0.8815386435642263, + "grad_norm": 3.267487035349017, + "learning_rate": 1.7465026096409598e-07, + "loss": 1.006, + "step": 8382 + }, + { + "epoch": 0.8816438140060736, + "grad_norm": 2.2437889546052934, + "learning_rate": 1.743438404327827e-07, + "loss": 0.978, + "step": 8383 + }, + { + "epoch": 0.8817489844479209, + "grad_norm": 2.969311191359739, + "learning_rate": 1.740376792301973e-07, + "loss": 0.986, + "step": 8384 + }, + { + "epoch": 0.8818541548897683, + "grad_norm": 2.937579661645254, + "learning_rate": 1.7373177739047898e-07, + "loss": 0.9688, + "step": 8385 + }, + { + "epoch": 0.8819593253316156, + "grad_norm": 2.0161300763505494, + "learning_rate": 1.7342613494773896e-07, + "loss": 0.9934, + "step": 8386 + }, + { + "epoch": 0.8820644957734629, + "grad_norm": 2.640765009401412, + "learning_rate": 1.731207519360581e-07, + "loss": 0.971, + "step": 8387 + }, + { + "epoch": 0.8821696662153102, + "grad_norm": 2.4341326910073557, + "learning_rate": 1.7281562838948968e-07, + "loss": 1.0054, + "step": 8388 + }, + { + "epoch": 0.8822748366571576, + "grad_norm": 2.611158264197989, + "learning_rate": 1.725107643420565e-07, + "loss": 1.0137, + "step": 8389 + }, + { + "epoch": 0.8823800070990048, + "grad_norm": 2.040666613213012, + "learning_rate": 1.7220615982775357e-07, + "loss": 0.9651, + "step": 8390 + }, + { + "epoch": 0.8824851775408521, + "grad_norm": 1.9483793255653934, + "learning_rate": 1.719018148805471e-07, + "loss": 0.9993, + "step": 8391 + }, + { + "epoch": 0.8825903479826994, + "grad_norm": 2.2723352862096635, + "learning_rate": 1.715977295343732e-07, + "loss": 0.9899, + "step": 8392 + }, + { + "epoch": 0.8826955184245467, + "grad_norm": 1.9036323549582663, + "learning_rate": 1.7129390382314065e-07, + "loss": 0.9872, + "step": 8393 + }, + { + "epoch": 0.8828006888663941, + "grad_norm": 2.229306978269511, + "learning_rate": 1.7099033778072732e-07, + "loss": 1.0015, + "step": 8394 + }, + { + "epoch": 0.8829058593082414, + "grad_norm": 2.508833961836707, + "learning_rate": 1.7068703144098365e-07, + "loss": 0.9562, + "step": 8395 + }, + { + "epoch": 0.8830110297500887, + "grad_norm": 2.5956332163694156, + "learning_rate": 1.7038398483773088e-07, + "loss": 0.9996, + "step": 8396 + }, + { + "epoch": 0.883116200191936, + "grad_norm": 1.9692024923776876, + "learning_rate": 1.700811980047612e-07, + "loss": 0.9557, + "step": 8397 + }, + { + "epoch": 0.8832213706337834, + "grad_norm": 2.411668195333863, + "learning_rate": 1.69778670975837e-07, + "loss": 1.0025, + "step": 8398 + }, + { + "epoch": 0.8833265410756307, + "grad_norm": 2.4902348906801257, + "learning_rate": 1.69476403784693e-07, + "loss": 1.0012, + "step": 8399 + }, + { + "epoch": 0.883431711517478, + "grad_norm": 2.818533940448939, + "learning_rate": 1.6917439646503415e-07, + "loss": 0.9662, + "step": 8400 + }, + { + "epoch": 0.8835368819593253, + "grad_norm": 3.015978971725827, + "learning_rate": 1.688726490505363e-07, + "loss": 0.9438, + "step": 8401 + }, + { + "epoch": 0.8836420524011727, + "grad_norm": 3.354485337789241, + "learning_rate": 1.6857116157484755e-07, + "loss": 0.9615, + "step": 8402 + }, 
+ { + "epoch": 0.88374722284302, + "grad_norm": 2.4921532513409224, + "learning_rate": 1.6826993407158488e-07, + "loss": 0.9736, + "step": 8403 + }, + { + "epoch": 0.8838523932848673, + "grad_norm": 2.414353246940312, + "learning_rate": 1.679689665743381e-07, + "loss": 0.9845, + "step": 8404 + }, + { + "epoch": 0.8839575637267146, + "grad_norm": 3.235772053731161, + "learning_rate": 1.6766825911666757e-07, + "loss": 1.0116, + "step": 8405 + }, + { + "epoch": 0.884062734168562, + "grad_norm": 2.262414448014693, + "learning_rate": 1.6736781173210426e-07, + "loss": 0.9497, + "step": 8406 + }, + { + "epoch": 0.8841679046104093, + "grad_norm": 2.731997494521064, + "learning_rate": 1.670676244541508e-07, + "loss": 1.0178, + "step": 8407 + }, + { + "epoch": 0.8842730750522566, + "grad_norm": 2.2117783222892786, + "learning_rate": 1.6676769731627907e-07, + "loss": 0.9542, + "step": 8408 + }, + { + "epoch": 0.8843782454941039, + "grad_norm": 2.330534181027143, + "learning_rate": 1.6646803035193532e-07, + "loss": 0.9347, + "step": 8409 + }, + { + "epoch": 0.8844834159359513, + "grad_norm": 2.492206689262255, + "learning_rate": 1.661686235945331e-07, + "loss": 1.0167, + "step": 8410 + }, + { + "epoch": 0.8845885863777985, + "grad_norm": 2.63746278586202, + "learning_rate": 1.6586947707745965e-07, + "loss": 0.9653, + "step": 8411 + }, + { + "epoch": 0.8846937568196458, + "grad_norm": 1.9434162730085702, + "learning_rate": 1.655705908340713e-07, + "loss": 0.893, + "step": 8412 + }, + { + "epoch": 0.8847989272614931, + "grad_norm": 1.9752644333020086, + "learning_rate": 1.6527196489769664e-07, + "loss": 1.0008, + "step": 8413 + }, + { + "epoch": 0.8849040977033404, + "grad_norm": 2.39797801920839, + "learning_rate": 1.6497359930163492e-07, + "loss": 1.0064, + "step": 8414 + }, + { + "epoch": 0.8850092681451878, + "grad_norm": 3.023142379449854, + "learning_rate": 1.6467549407915563e-07, + "loss": 0.9955, + "step": 8415 + }, + { + "epoch": 0.8851144385870351, + "grad_norm": 2.4594124824433967, + "learning_rate": 1.6437764926350074e-07, + "loss": 0.9866, + "step": 8416 + }, + { + "epoch": 0.8852196090288824, + "grad_norm": 3.1186745683219406, + "learning_rate": 1.640800648878807e-07, + "loss": 1.015, + "step": 8417 + }, + { + "epoch": 0.8853247794707297, + "grad_norm": 2.013217130768628, + "learning_rate": 1.637827409854803e-07, + "loss": 0.9681, + "step": 8418 + }, + { + "epoch": 0.8854299499125771, + "grad_norm": 2.412821949942149, + "learning_rate": 1.6348567758945277e-07, + "loss": 1.0154, + "step": 8419 + }, + { + "epoch": 0.8855351203544244, + "grad_norm": 2.82262481468373, + "learning_rate": 1.6318887473292245e-07, + "loss": 0.9982, + "step": 8420 + }, + { + "epoch": 0.8856402907962717, + "grad_norm": 2.4389919981151498, + "learning_rate": 1.6289233244898616e-07, + "loss": 0.9898, + "step": 8421 + }, + { + "epoch": 0.885745461238119, + "grad_norm": 2.5329579890512885, + "learning_rate": 1.6259605077070888e-07, + "loss": 0.9611, + "step": 8422 + }, + { + "epoch": 0.8858506316799664, + "grad_norm": 2.242020126640096, + "learning_rate": 1.6230002973113056e-07, + "loss": 0.9727, + "step": 8423 + }, + { + "epoch": 0.8859558021218137, + "grad_norm": 2.678622402703502, + "learning_rate": 1.620042693632584e-07, + "loss": 0.973, + "step": 8424 + }, + { + "epoch": 0.886060972563661, + "grad_norm": 3.081880759439436, + "learning_rate": 1.61708769700073e-07, + "loss": 0.9799, + "step": 8425 + }, + { + "epoch": 0.8861661430055083, + "grad_norm": 2.3125089299768073, + "learning_rate": 1.614135307745235e-07, + 
"loss": 0.9988, + "step": 8426 + }, + { + "epoch": 0.8862713134473557, + "grad_norm": 1.7855830987719163, + "learning_rate": 1.611185526195322e-07, + "loss": 0.9772, + "step": 8427 + }, + { + "epoch": 0.886376483889203, + "grad_norm": 1.8724555459292305, + "learning_rate": 1.6082383526799196e-07, + "loss": 1.0256, + "step": 8428 + }, + { + "epoch": 0.8864816543310503, + "grad_norm": 3.5183308798130972, + "learning_rate": 1.6052937875276481e-07, + "loss": 0.9885, + "step": 8429 + }, + { + "epoch": 0.8865868247728976, + "grad_norm": 1.9475600577969459, + "learning_rate": 1.602351831066862e-07, + "loss": 0.9556, + "step": 8430 + }, + { + "epoch": 0.8866919952147448, + "grad_norm": 2.068576517697177, + "learning_rate": 1.5994124836255952e-07, + "loss": 0.9723, + "step": 8431 + }, + { + "epoch": 0.8867971656565922, + "grad_norm": 2.0504911635408307, + "learning_rate": 1.5964757455316282e-07, + "loss": 0.9624, + "step": 8432 + }, + { + "epoch": 0.8869023360984395, + "grad_norm": 2.384507371414114, + "learning_rate": 1.593541617112415e-07, + "loss": 0.9625, + "step": 8433 + }, + { + "epoch": 0.8870075065402868, + "grad_norm": 2.2608056805889936, + "learning_rate": 1.5906100986951445e-07, + "loss": 0.9405, + "step": 8434 + }, + { + "epoch": 0.8871126769821341, + "grad_norm": 2.3029169049564775, + "learning_rate": 1.5876811906066992e-07, + "loss": 1.013, + "step": 8435 + }, + { + "epoch": 0.8872178474239815, + "grad_norm": 3.240342256666061, + "learning_rate": 1.5847548931736678e-07, + "loss": 0.9972, + "step": 8436 + }, + { + "epoch": 0.8873230178658288, + "grad_norm": 2.0368840918231412, + "learning_rate": 1.5818312067223673e-07, + "loss": 1.0, + "step": 8437 + }, + { + "epoch": 0.8874281883076761, + "grad_norm": 2.4964437098615773, + "learning_rate": 1.5789101315788007e-07, + "loss": 0.9554, + "step": 8438 + }, + { + "epoch": 0.8875333587495234, + "grad_norm": 2.719371843674228, + "learning_rate": 1.5759916680687022e-07, + "loss": 1.0049, + "step": 8439 + }, + { + "epoch": 0.8876385291913708, + "grad_norm": 2.545862838080534, + "learning_rate": 1.5730758165174914e-07, + "loss": 0.979, + "step": 8440 + }, + { + "epoch": 0.8877436996332181, + "grad_norm": 4.491846091536825, + "learning_rate": 1.5701625772503142e-07, + "loss": 1.0328, + "step": 8441 + }, + { + "epoch": 0.8878488700750654, + "grad_norm": 2.9635865459634285, + "learning_rate": 1.5672519505920248e-07, + "loss": 1.0136, + "step": 8442 + }, + { + "epoch": 0.8879540405169127, + "grad_norm": 2.5871391440866836, + "learning_rate": 1.5643439368671688e-07, + "loss": 1.0381, + "step": 8443 + }, + { + "epoch": 0.8880592109587601, + "grad_norm": 2.367393248660268, + "learning_rate": 1.561438536400023e-07, + "loss": 0.9617, + "step": 8444 + }, + { + "epoch": 0.8881643814006074, + "grad_norm": 2.293140538401526, + "learning_rate": 1.5585357495145504e-07, + "loss": 0.9558, + "step": 8445 + }, + { + "epoch": 0.8882695518424547, + "grad_norm": 2.217902357821252, + "learning_rate": 1.5556355765344477e-07, + "loss": 0.9856, + "step": 8446 + }, + { + "epoch": 0.888374722284302, + "grad_norm": 3.1515334464477927, + "learning_rate": 1.552738017783098e-07, + "loss": 0.9647, + "step": 8447 + }, + { + "epoch": 0.8884798927261494, + "grad_norm": 1.9568682729973774, + "learning_rate": 1.5498430735836067e-07, + "loss": 0.9509, + "step": 8448 + }, + { + "epoch": 0.8885850631679967, + "grad_norm": 2.243562043893478, + "learning_rate": 1.5469507442587766e-07, + "loss": 0.9903, + "step": 8449 + }, + { + "epoch": 0.888690233609844, + "grad_norm": 2.514164405589009, 
+ "learning_rate": 1.5440610301311277e-07, + "loss": 1.0134, + "step": 8450 + }, + { + "epoch": 0.8887954040516912, + "grad_norm": 3.3526553400519883, + "learning_rate": 1.5411739315228909e-07, + "loss": 0.9841, + "step": 8451 + }, + { + "epoch": 0.8889005744935385, + "grad_norm": 2.7744608538169317, + "learning_rate": 1.538289448755989e-07, + "loss": 1.0142, + "step": 8452 + }, + { + "epoch": 0.8890057449353859, + "grad_norm": 2.2860868670728114, + "learning_rate": 1.5354075821520765e-07, + "loss": 1.0062, + "step": 8453 + }, + { + "epoch": 0.8891109153772332, + "grad_norm": 3.007217663938393, + "learning_rate": 1.5325283320324903e-07, + "loss": 0.9644, + "step": 8454 + }, + { + "epoch": 0.8892160858190805, + "grad_norm": 2.9791595489559435, + "learning_rate": 1.5296516987183042e-07, + "loss": 0.9896, + "step": 8455 + }, + { + "epoch": 0.8893212562609278, + "grad_norm": 1.8583784300969213, + "learning_rate": 1.5267776825302782e-07, + "loss": 1.0152, + "step": 8456 + }, + { + "epoch": 0.8894264267027752, + "grad_norm": 2.729685552570694, + "learning_rate": 1.523906283788884e-07, + "loss": 1.0083, + "step": 8457 + }, + { + "epoch": 0.8895315971446225, + "grad_norm": 3.1558272502401556, + "learning_rate": 1.5210375028143097e-07, + "loss": 0.9668, + "step": 8458 + }, + { + "epoch": 0.8896367675864698, + "grad_norm": 2.1971892785045033, + "learning_rate": 1.518171339926436e-07, + "loss": 1.0038, + "step": 8459 + }, + { + "epoch": 0.8897419380283171, + "grad_norm": 2.841371147766442, + "learning_rate": 1.5153077954448813e-07, + "loss": 0.9885, + "step": 8460 + }, + { + "epoch": 0.8898471084701645, + "grad_norm": 2.691644935448725, + "learning_rate": 1.5124468696889383e-07, + "loss": 0.9512, + "step": 8461 + }, + { + "epoch": 0.8899522789120118, + "grad_norm": 3.5217017668869097, + "learning_rate": 1.5095885629776319e-07, + "loss": 0.9877, + "step": 8462 + }, + { + "epoch": 0.8900574493538591, + "grad_norm": 1.6034639625291485, + "learning_rate": 1.506732875629674e-07, + "loss": 1.0055, + "step": 8463 + }, + { + "epoch": 0.8901626197957064, + "grad_norm": 2.1047634595751026, + "learning_rate": 1.5038798079635043e-07, + "loss": 0.9644, + "step": 8464 + }, + { + "epoch": 0.8902677902375538, + "grad_norm": 2.1406693190498256, + "learning_rate": 1.5010293602972653e-07, + "loss": 0.974, + "step": 8465 + }, + { + "epoch": 0.8903729606794011, + "grad_norm": 2.042851624189361, + "learning_rate": 1.4981815329487948e-07, + "loss": 0.9174, + "step": 8466 + }, + { + "epoch": 0.8904781311212484, + "grad_norm": 2.5483924252229673, + "learning_rate": 1.4953363262356552e-07, + "loss": 0.9662, + "step": 8467 + }, + { + "epoch": 0.8905833015630957, + "grad_norm": 3.0624720151851617, + "learning_rate": 1.4924937404750954e-07, + "loss": 0.9856, + "step": 8468 + }, + { + "epoch": 0.890688472004943, + "grad_norm": 3.0903489981255654, + "learning_rate": 1.4896537759841067e-07, + "loss": 1.0083, + "step": 8469 + }, + { + "epoch": 0.8907936424467904, + "grad_norm": 2.1498114196551823, + "learning_rate": 1.486816433079352e-07, + "loss": 0.9812, + "step": 8470 + }, + { + "epoch": 0.8908988128886377, + "grad_norm": 2.15804642211936, + "learning_rate": 1.4839817120772288e-07, + "loss": 0.9824, + "step": 8471 + }, + { + "epoch": 0.8910039833304849, + "grad_norm": 2.069762197489603, + "learning_rate": 1.4811496132938196e-07, + "loss": 0.9699, + "step": 8472 + }, + { + "epoch": 0.8911091537723322, + "grad_norm": 2.3885493176951296, + "learning_rate": 1.4783201370449225e-07, + "loss": 0.9592, + "step": 8473 + }, + { + "epoch": 
0.8912143242141796, + "grad_norm": 2.5287347361503936, + "learning_rate": 1.4754932836460622e-07, + "loss": 0.927, + "step": 8474 + }, + { + "epoch": 0.8913194946560269, + "grad_norm": 2.477400242672135, + "learning_rate": 1.47266905341244e-07, + "loss": 0.9984, + "step": 8475 + }, + { + "epoch": 0.8914246650978742, + "grad_norm": 2.8722700619640027, + "learning_rate": 1.4698474466589896e-07, + "loss": 0.9341, + "step": 8476 + }, + { + "epoch": 0.8915298355397215, + "grad_norm": 2.722448788439971, + "learning_rate": 1.4670284637003345e-07, + "loss": 0.9496, + "step": 8477 + }, + { + "epoch": 0.8916350059815689, + "grad_norm": 2.8289714203437892, + "learning_rate": 1.464212104850815e-07, + "loss": 0.9568, + "step": 8478 + }, + { + "epoch": 0.8917401764234162, + "grad_norm": 2.6306864237152623, + "learning_rate": 1.4613983704244826e-07, + "loss": 1.0054, + "step": 8479 + }, + { + "epoch": 0.8918453468652635, + "grad_norm": 2.018496472612239, + "learning_rate": 1.4585872607350837e-07, + "loss": 0.9911, + "step": 8480 + }, + { + "epoch": 0.8919505173071108, + "grad_norm": 2.6075018388842577, + "learning_rate": 1.4557787760960812e-07, + "loss": 0.9536, + "step": 8481 + }, + { + "epoch": 0.8920556877489582, + "grad_norm": 2.419257762481413, + "learning_rate": 1.4529729168206386e-07, + "loss": 0.9466, + "step": 8482 + }, + { + "epoch": 0.8921608581908055, + "grad_norm": 2.726364003607082, + "learning_rate": 1.4501696832216445e-07, + "loss": 0.9499, + "step": 8483 + }, + { + "epoch": 0.8922660286326528, + "grad_norm": 2.3855719640701807, + "learning_rate": 1.4473690756116654e-07, + "loss": 0.9907, + "step": 8484 + }, + { + "epoch": 0.8923711990745001, + "grad_norm": 2.4566580192018845, + "learning_rate": 1.4445710943030018e-07, + "loss": 0.9854, + "step": 8485 + }, + { + "epoch": 0.8924763695163475, + "grad_norm": 2.816644567674129, + "learning_rate": 1.4417757396076427e-07, + "loss": 1.0299, + "step": 8486 + }, + { + "epoch": 0.8925815399581948, + "grad_norm": 2.4296406347952884, + "learning_rate": 1.4389830118372972e-07, + "loss": 0.9775, + "step": 8487 + }, + { + "epoch": 0.8926867104000421, + "grad_norm": 1.952494054315651, + "learning_rate": 1.4361929113033747e-07, + "loss": 0.9887, + "step": 8488 + }, + { + "epoch": 0.8927918808418894, + "grad_norm": 2.203328059269768, + "learning_rate": 1.4334054383169897e-07, + "loss": 0.9499, + "step": 8489 + }, + { + "epoch": 0.8928970512837368, + "grad_norm": 1.8322340247823978, + "learning_rate": 1.430620593188975e-07, + "loss": 0.9407, + "step": 8490 + }, + { + "epoch": 0.8930022217255841, + "grad_norm": 2.480996419359068, + "learning_rate": 1.4278383762298536e-07, + "loss": 0.9938, + "step": 8491 + }, + { + "epoch": 0.8931073921674313, + "grad_norm": 3.1090653197790528, + "learning_rate": 1.4250587877498695e-07, + "loss": 0.9909, + "step": 8492 + }, + { + "epoch": 0.8932125626092786, + "grad_norm": 2.720781388178653, + "learning_rate": 1.422281828058969e-07, + "loss": 1.0105, + "step": 8493 + }, + { + "epoch": 0.8933177330511259, + "grad_norm": 2.753443908893908, + "learning_rate": 1.4195074974667989e-07, + "loss": 0.994, + "step": 8494 + }, + { + "epoch": 0.8934229034929733, + "grad_norm": 1.8341611634515465, + "learning_rate": 1.4167357962827283e-07, + "loss": 0.9763, + "step": 8495 + }, + { + "epoch": 0.8935280739348206, + "grad_norm": 2.9427961867698977, + "learning_rate": 1.4139667248158072e-07, + "loss": 0.9565, + "step": 8496 + }, + { + "epoch": 0.8936332443766679, + "grad_norm": 2.4027494539397103, + "learning_rate": 1.4112002833748279e-07, + 
"loss": 0.9674, + "step": 8497 + }, + { + "epoch": 0.8937384148185152, + "grad_norm": 3.598778669005095, + "learning_rate": 1.4084364722682598e-07, + "loss": 0.9917, + "step": 8498 + }, + { + "epoch": 0.8938435852603626, + "grad_norm": 3.346549402658093, + "learning_rate": 1.4056752918042898e-07, + "loss": 0.9366, + "step": 8499 + }, + { + "epoch": 0.8939487557022099, + "grad_norm": 2.5730479469223897, + "learning_rate": 1.4029167422908107e-07, + "loss": 0.9992, + "step": 8500 + }, + { + "epoch": 0.8940539261440572, + "grad_norm": 2.7809454904611313, + "learning_rate": 1.4001608240354236e-07, + "loss": 0.9954, + "step": 8501 + }, + { + "epoch": 0.8941590965859045, + "grad_norm": 2.063097454423752, + "learning_rate": 1.3974075373454383e-07, + "loss": 0.9769, + "step": 8502 + }, + { + "epoch": 0.8942642670277519, + "grad_norm": 1.8225435784652433, + "learning_rate": 1.3946568825278618e-07, + "loss": 0.9645, + "step": 8503 + }, + { + "epoch": 0.8943694374695992, + "grad_norm": 2.755707682683513, + "learning_rate": 1.3919088598894154e-07, + "loss": 0.9932, + "step": 8504 + }, + { + "epoch": 0.8944746079114465, + "grad_norm": 3.3674579224287773, + "learning_rate": 1.3891634697365207e-07, + "loss": 1.0107, + "step": 8505 + }, + { + "epoch": 0.8945797783532938, + "grad_norm": 3.130174869282355, + "learning_rate": 1.386420712375322e-07, + "loss": 1.0157, + "step": 8506 + }, + { + "epoch": 0.8946849487951412, + "grad_norm": 2.0725928871095154, + "learning_rate": 1.3836805881116495e-07, + "loss": 0.9675, + "step": 8507 + }, + { + "epoch": 0.8947901192369885, + "grad_norm": 2.345929950085412, + "learning_rate": 1.3809430972510446e-07, + "loss": 0.9821, + "step": 8508 + }, + { + "epoch": 0.8948952896788358, + "grad_norm": 2.7903627116773, + "learning_rate": 1.378208240098766e-07, + "loss": 0.9848, + "step": 8509 + }, + { + "epoch": 0.8950004601206831, + "grad_norm": 2.328892581734959, + "learning_rate": 1.3754760169597676e-07, + "loss": 1.0026, + "step": 8510 + }, + { + "epoch": 0.8951056305625305, + "grad_norm": 2.8229279376291934, + "learning_rate": 1.3727464281387186e-07, + "loss": 0.9352, + "step": 8511 + }, + { + "epoch": 0.8952108010043777, + "grad_norm": 2.8468987303756284, + "learning_rate": 1.3700194739399848e-07, + "loss": 1.0079, + "step": 8512 + }, + { + "epoch": 0.895315971446225, + "grad_norm": 2.015499855514806, + "learning_rate": 1.3672951546676444e-07, + "loss": 0.9678, + "step": 8513 + }, + { + "epoch": 0.8954211418880723, + "grad_norm": 1.976934703918547, + "learning_rate": 1.364573470625477e-07, + "loss": 0.9829, + "step": 8514 + }, + { + "epoch": 0.8955263123299196, + "grad_norm": 2.31984836197906, + "learning_rate": 1.3618544221169732e-07, + "loss": 0.9756, + "step": 8515 + }, + { + "epoch": 0.895631482771767, + "grad_norm": 2.665380278443687, + "learning_rate": 1.359138009445335e-07, + "loss": 0.9988, + "step": 8516 + }, + { + "epoch": 0.8957366532136143, + "grad_norm": 1.9964367122956426, + "learning_rate": 1.3564242329134502e-07, + "loss": 0.944, + "step": 8517 + }, + { + "epoch": 0.8958418236554616, + "grad_norm": 2.576231474046989, + "learning_rate": 1.3537130928239383e-07, + "loss": 0.9717, + "step": 8518 + }, + { + "epoch": 0.8959469940973089, + "grad_norm": 2.5238951016013367, + "learning_rate": 1.3510045894791018e-07, + "loss": 0.9593, + "step": 8519 + }, + { + "epoch": 0.8960521645391563, + "grad_norm": 3.0112910269228275, + "learning_rate": 1.3482987231809714e-07, + "loss": 1.0174, + "step": 8520 + }, + { + "epoch": 0.8961573349810036, + "grad_norm": 2.728647164800237, 
+ "learning_rate": 1.345595494231261e-07, + "loss": 1.0167, + "step": 8521 + }, + { + "epoch": 0.8962625054228509, + "grad_norm": 3.110617555783042, + "learning_rate": 1.3428949029314132e-07, + "loss": 1.0145, + "step": 8522 + }, + { + "epoch": 0.8963676758646982, + "grad_norm": 2.1989572332927216, + "learning_rate": 1.3401969495825534e-07, + "loss": 0.9382, + "step": 8523 + }, + { + "epoch": 0.8964728463065456, + "grad_norm": 2.530350556427271, + "learning_rate": 1.3375016344855302e-07, + "loss": 1.0152, + "step": 8524 + }, + { + "epoch": 0.8965780167483929, + "grad_norm": 3.24507065070509, + "learning_rate": 1.3348089579408946e-07, + "loss": 1.0011, + "step": 8525 + }, + { + "epoch": 0.8966831871902402, + "grad_norm": 2.292129085968337, + "learning_rate": 1.3321189202488955e-07, + "loss": 0.9794, + "step": 8526 + }, + { + "epoch": 0.8967883576320875, + "grad_norm": 2.5251603922264785, + "learning_rate": 1.3294315217094956e-07, + "loss": 1.0009, + "step": 8527 + }, + { + "epoch": 0.8968935280739349, + "grad_norm": 1.9633155096515325, + "learning_rate": 1.3267467626223606e-07, + "loss": 0.959, + "step": 8528 + }, + { + "epoch": 0.8969986985157822, + "grad_norm": 3.11725878427488, + "learning_rate": 1.32406464328686e-07, + "loss": 1.0017, + "step": 8529 + }, + { + "epoch": 0.8971038689576295, + "grad_norm": 1.9422340766869772, + "learning_rate": 1.321385164002076e-07, + "loss": 0.9736, + "step": 8530 + }, + { + "epoch": 0.8972090393994768, + "grad_norm": 2.3021774888473225, + "learning_rate": 1.3187083250667865e-07, + "loss": 0.9732, + "step": 8531 + }, + { + "epoch": 0.8973142098413242, + "grad_norm": 2.6462765296135227, + "learning_rate": 1.3160341267794778e-07, + "loss": 0.9473, + "step": 8532 + }, + { + "epoch": 0.8974193802831714, + "grad_norm": 2.9968459492356376, + "learning_rate": 1.3133625694383506e-07, + "loss": 1.0007, + "step": 8533 + }, + { + "epoch": 0.8975245507250187, + "grad_norm": 2.8871273274281726, + "learning_rate": 1.310693653341305e-07, + "loss": 0.9901, + "step": 8534 + }, + { + "epoch": 0.897629721166866, + "grad_norm": 2.4909489473224817, + "learning_rate": 1.3080273787859367e-07, + "loss": 1.0059, + "step": 8535 + }, + { + "epoch": 0.8977348916087133, + "grad_norm": 2.775824788943004, + "learning_rate": 1.3053637460695655e-07, + "loss": 1.0172, + "step": 8536 + }, + { + "epoch": 0.8978400620505607, + "grad_norm": 2.405665330311999, + "learning_rate": 1.3027027554891992e-07, + "loss": 1.0053, + "step": 8537 + }, + { + "epoch": 0.897945232492408, + "grad_norm": 2.194682374380952, + "learning_rate": 1.3000444073415637e-07, + "loss": 0.9454, + "step": 8538 + }, + { + "epoch": 0.8980504029342553, + "grad_norm": 2.5244706470088323, + "learning_rate": 1.2973887019230885e-07, + "loss": 0.9712, + "step": 8539 + }, + { + "epoch": 0.8981555733761026, + "grad_norm": 2.9834146366006844, + "learning_rate": 1.2947356395298956e-07, + "loss": 0.9946, + "step": 8540 + }, + { + "epoch": 0.89826074381795, + "grad_norm": 2.246603088988245, + "learning_rate": 1.292085220457834e-07, + "loss": 0.9229, + "step": 8541 + }, + { + "epoch": 0.8983659142597973, + "grad_norm": 2.4711768931739004, + "learning_rate": 1.2894374450024338e-07, + "loss": 0.9644, + "step": 8542 + }, + { + "epoch": 0.8984710847016446, + "grad_norm": 1.99224760834784, + "learning_rate": 1.2867923134589483e-07, + "loss": 0.9821, + "step": 8543 + }, + { + "epoch": 0.8985762551434919, + "grad_norm": 1.6913963223277704, + "learning_rate": 1.2841498261223355e-07, + "loss": 0.9516, + "step": 8544 + }, + { + "epoch": 
0.8986814255853393, + "grad_norm": 2.161369459321979, + "learning_rate": 1.2815099832872463e-07, + "loss": 0.9564, + "step": 8545 + }, + { + "epoch": 0.8987865960271866, + "grad_norm": 2.1550070381084114, + "learning_rate": 1.278872785248042e-07, + "loss": 0.9686, + "step": 8546 + }, + { + "epoch": 0.8988917664690339, + "grad_norm": 2.840197703849838, + "learning_rate": 1.276238232298796e-07, + "loss": 0.9632, + "step": 8547 + }, + { + "epoch": 0.8989969369108812, + "grad_norm": 2.5769335426004982, + "learning_rate": 1.273606324733284e-07, + "loss": 0.9816, + "step": 8548 + }, + { + "epoch": 0.8991021073527286, + "grad_norm": 2.6707662326181425, + "learning_rate": 1.2709770628449776e-07, + "loss": 0.9601, + "step": 8549 + }, + { + "epoch": 0.8992072777945759, + "grad_norm": 1.7310467132195133, + "learning_rate": 1.2683504469270636e-07, + "loss": 0.9591, + "step": 8550 + }, + { + "epoch": 0.8993124482364232, + "grad_norm": 2.548589763012564, + "learning_rate": 1.2657264772724247e-07, + "loss": 0.993, + "step": 8551 + }, + { + "epoch": 0.8994176186782705, + "grad_norm": 2.2542436406644684, + "learning_rate": 1.2631051541736578e-07, + "loss": 0.9368, + "step": 8552 + }, + { + "epoch": 0.8995227891201177, + "grad_norm": 2.5992564379457455, + "learning_rate": 1.2604864779230674e-07, + "loss": 0.9385, + "step": 8553 + }, + { + "epoch": 0.8996279595619651, + "grad_norm": 2.3697543945400343, + "learning_rate": 1.2578704488126426e-07, + "loss": 0.9866, + "step": 8554 + }, + { + "epoch": 0.8997331300038124, + "grad_norm": 2.31815816883965, + "learning_rate": 1.2552570671340998e-07, + "loss": 0.974, + "step": 8555 + }, + { + "epoch": 0.8998383004456597, + "grad_norm": 2.6932418190961913, + "learning_rate": 1.2526463331788503e-07, + "loss": 1.0144, + "step": 8556 + }, + { + "epoch": 0.899943470887507, + "grad_norm": 2.969325960064139, + "learning_rate": 1.250038247238014e-07, + "loss": 1.0272, + "step": 8557 + }, + { + "epoch": 0.9000486413293544, + "grad_norm": 2.781868699429922, + "learning_rate": 1.2474328096024086e-07, + "loss": 0.999, + "step": 8558 + }, + { + "epoch": 0.9001538117712017, + "grad_norm": 3.139713598126397, + "learning_rate": 1.244830020562557e-07, + "loss": 0.9944, + "step": 8559 + }, + { + "epoch": 0.900258982213049, + "grad_norm": 2.2804116496124767, + "learning_rate": 1.2422298804086963e-07, + "loss": 0.9445, + "step": 8560 + }, + { + "epoch": 0.9003641526548963, + "grad_norm": 2.1291441323106786, + "learning_rate": 1.2396323894307587e-07, + "loss": 0.9773, + "step": 8561 + }, + { + "epoch": 0.9004693230967437, + "grad_norm": 2.7682785401579455, + "learning_rate": 1.2370375479183905e-07, + "loss": 0.9852, + "step": 8562 + }, + { + "epoch": 0.900574493538591, + "grad_norm": 2.906960990437752, + "learning_rate": 1.234445356160932e-07, + "loss": 0.958, + "step": 8563 + }, + { + "epoch": 0.9006796639804383, + "grad_norm": 3.5815863699736807, + "learning_rate": 1.2318558144474303e-07, + "loss": 1.064, + "step": 8564 + }, + { + "epoch": 0.9007848344222856, + "grad_norm": 2.7121326512924866, + "learning_rate": 1.2292689230666482e-07, + "loss": 0.9298, + "step": 8565 + }, + { + "epoch": 0.900890004864133, + "grad_norm": 2.7103260334531956, + "learning_rate": 1.226684682307036e-07, + "loss": 0.9481, + "step": 8566 + }, + { + "epoch": 0.9009951753059803, + "grad_norm": 2.534376466134969, + "learning_rate": 1.2241030924567603e-07, + "loss": 1.0111, + "step": 8567 + }, + { + "epoch": 0.9011003457478276, + "grad_norm": 2.1693292560203092, + "learning_rate": 1.2215241538036853e-07, + "loss": 
0.9746, + "step": 8568 + }, + { + "epoch": 0.9012055161896749, + "grad_norm": 1.8401531315355144, + "learning_rate": 1.2189478666353865e-07, + "loss": 1.0031, + "step": 8569 + }, + { + "epoch": 0.9013106866315223, + "grad_norm": 2.575023205233234, + "learning_rate": 1.2163742312391342e-07, + "loss": 0.9486, + "step": 8570 + }, + { + "epoch": 0.9014158570733696, + "grad_norm": 1.9758094923843876, + "learning_rate": 1.2138032479019206e-07, + "loss": 0.9974, + "step": 8571 + }, + { + "epoch": 0.9015210275152169, + "grad_norm": 2.2170223011196706, + "learning_rate": 1.2112349169104172e-07, + "loss": 0.9782, + "step": 8572 + }, + { + "epoch": 0.9016261979570641, + "grad_norm": 1.7838799688859874, + "learning_rate": 1.208669238551019e-07, + "loss": 0.9755, + "step": 8573 + }, + { + "epoch": 0.9017313683989114, + "grad_norm": 2.4567274019385765, + "learning_rate": 1.2061062131098174e-07, + "loss": 0.9455, + "step": 8574 + }, + { + "epoch": 0.9018365388407588, + "grad_norm": 1.722594664429003, + "learning_rate": 1.203545840872611e-07, + "loss": 0.9946, + "step": 8575 + }, + { + "epoch": 0.9019417092826061, + "grad_norm": 2.469843612260426, + "learning_rate": 1.2009881221249047e-07, + "loss": 0.9817, + "step": 8576 + }, + { + "epoch": 0.9020468797244534, + "grad_norm": 2.763903318605096, + "learning_rate": 1.1984330571518932e-07, + "loss": 0.96, + "step": 8577 + }, + { + "epoch": 0.9021520501663007, + "grad_norm": 2.4544321149500306, + "learning_rate": 1.1958806462384953e-07, + "loss": 0.9663, + "step": 8578 + }, + { + "epoch": 0.9022572206081481, + "grad_norm": 2.977813358898067, + "learning_rate": 1.1933308896693253e-07, + "loss": 1.0148, + "step": 8579 + }, + { + "epoch": 0.9023623910499954, + "grad_norm": 1.999477072638366, + "learning_rate": 1.1907837877286943e-07, + "loss": 0.9989, + "step": 8580 + }, + { + "epoch": 0.9024675614918427, + "grad_norm": 2.2185946512743513, + "learning_rate": 1.188239340700631e-07, + "loss": 0.9987, + "step": 8581 + }, + { + "epoch": 0.90257273193369, + "grad_norm": 2.5096878993721856, + "learning_rate": 1.1856975488688555e-07, + "loss": 0.9858, + "step": 8582 + }, + { + "epoch": 0.9026779023755374, + "grad_norm": 2.1735177550715403, + "learning_rate": 1.1831584125167966e-07, + "loss": 1.0257, + "step": 8583 + }, + { + "epoch": 0.9027830728173847, + "grad_norm": 1.7897935124112625, + "learning_rate": 1.1806219319275918e-07, + "loss": 0.9778, + "step": 8584 + }, + { + "epoch": 0.902888243259232, + "grad_norm": 2.15199039194613, + "learning_rate": 1.1780881073840816e-07, + "loss": 1.0001, + "step": 8585 + }, + { + "epoch": 0.9029934137010793, + "grad_norm": 3.833073505264171, + "learning_rate": 1.1755569391687954e-07, + "loss": 1.0042, + "step": 8586 + }, + { + "epoch": 0.9030985841429267, + "grad_norm": 2.682110463618559, + "learning_rate": 1.173028427563988e-07, + "loss": 1.0197, + "step": 8587 + }, + { + "epoch": 0.903203754584774, + "grad_norm": 1.9252316839391137, + "learning_rate": 1.1705025728516089e-07, + "loss": 0.9597, + "step": 8588 + }, + { + "epoch": 0.9033089250266213, + "grad_norm": 3.407849681083908, + "learning_rate": 1.1679793753133024e-07, + "loss": 0.9905, + "step": 8589 + }, + { + "epoch": 0.9034140954684686, + "grad_norm": 2.514269869679688, + "learning_rate": 1.1654588352304347e-07, + "loss": 1.0422, + "step": 8590 + }, + { + "epoch": 0.903519265910316, + "grad_norm": 2.9042596313289244, + "learning_rate": 1.1629409528840534e-07, + "loss": 0.9899, + "step": 8591 + }, + { + "epoch": 0.9036244363521633, + "grad_norm": 1.8985121884778924, + 
"learning_rate": 1.1604257285549314e-07, + "loss": 0.9963, + "step": 8592 + }, + { + "epoch": 0.9037296067940106, + "grad_norm": 2.5551184251028602, + "learning_rate": 1.1579131625235356e-07, + "loss": 0.9615, + "step": 8593 + }, + { + "epoch": 0.9038347772358578, + "grad_norm": 2.1413999858159354, + "learning_rate": 1.1554032550700284e-07, + "loss": 0.9406, + "step": 8594 + }, + { + "epoch": 0.9039399476777051, + "grad_norm": 2.6069475924003704, + "learning_rate": 1.1528960064742967e-07, + "loss": 0.9775, + "step": 8595 + }, + { + "epoch": 0.9040451181195525, + "grad_norm": 1.9692405646354523, + "learning_rate": 1.1503914170159058e-07, + "loss": 0.9734, + "step": 8596 + }, + { + "epoch": 0.9041502885613998, + "grad_norm": 3.182347862653507, + "learning_rate": 1.1478894869741409e-07, + "loss": 0.9695, + "step": 8597 + }, + { + "epoch": 0.9042554590032471, + "grad_norm": 2.393093848945507, + "learning_rate": 1.1453902166279895e-07, + "loss": 0.9901, + "step": 8598 + }, + { + "epoch": 0.9043606294450944, + "grad_norm": 1.8954299188375663, + "learning_rate": 1.1428936062561402e-07, + "loss": 1.0036, + "step": 8599 + }, + { + "epoch": 0.9044657998869418, + "grad_norm": 2.0529572247299, + "learning_rate": 1.1403996561369812e-07, + "loss": 0.972, + "step": 8600 + }, + { + "epoch": 0.9045709703287891, + "grad_norm": 2.3468453698266063, + "learning_rate": 1.1379083665486068e-07, + "loss": 1.004, + "step": 8601 + }, + { + "epoch": 0.9046761407706364, + "grad_norm": 1.9568847387652115, + "learning_rate": 1.1354197377688198e-07, + "loss": 0.9641, + "step": 8602 + }, + { + "epoch": 0.9047813112124837, + "grad_norm": 2.24292284847942, + "learning_rate": 1.1329337700751147e-07, + "loss": 0.9581, + "step": 8603 + }, + { + "epoch": 0.904886481654331, + "grad_norm": 2.5786412520948656, + "learning_rate": 1.1304504637447062e-07, + "loss": 0.9849, + "step": 8604 + }, + { + "epoch": 0.9049916520961784, + "grad_norm": 2.6766324984329586, + "learning_rate": 1.1279698190544918e-07, + "loss": 0.9928, + "step": 8605 + }, + { + "epoch": 0.9050968225380257, + "grad_norm": 2.4750268462318665, + "learning_rate": 1.125491836281084e-07, + "loss": 0.9605, + "step": 8606 + }, + { + "epoch": 0.905201992979873, + "grad_norm": 2.3781929663367936, + "learning_rate": 1.1230165157008033e-07, + "loss": 1.0048, + "step": 8607 + }, + { + "epoch": 0.9053071634217204, + "grad_norm": 2.2874545893594553, + "learning_rate": 1.1205438575896677e-07, + "loss": 0.9634, + "step": 8608 + }, + { + "epoch": 0.9054123338635677, + "grad_norm": 2.526387831515657, + "learning_rate": 1.1180738622233928e-07, + "loss": 0.9509, + "step": 8609 + }, + { + "epoch": 0.905517504305415, + "grad_norm": 2.0896230571312957, + "learning_rate": 1.1156065298773972e-07, + "loss": 0.9917, + "step": 8610 + }, + { + "epoch": 0.9056226747472623, + "grad_norm": 2.022491968484562, + "learning_rate": 1.113141860826819e-07, + "loss": 1.0339, + "step": 8611 + }, + { + "epoch": 0.9057278451891096, + "grad_norm": 2.1059992344770246, + "learning_rate": 1.1106798553464804e-07, + "loss": 0.9661, + "step": 8612 + }, + { + "epoch": 0.905833015630957, + "grad_norm": 2.730011822148298, + "learning_rate": 1.1082205137109225e-07, + "loss": 0.9576, + "step": 8613 + }, + { + "epoch": 0.9059381860728042, + "grad_norm": 2.761974033618745, + "learning_rate": 1.1057638361943679e-07, + "loss": 0.9832, + "step": 8614 + }, + { + "epoch": 0.9060433565146515, + "grad_norm": 2.9220028143550656, + "learning_rate": 1.1033098230707668e-07, + "loss": 1.0099, + "step": 8615 + }, + { + "epoch": 
0.9061485269564988, + "grad_norm": 3.1531693192642467, + "learning_rate": 1.1008584746137558e-07, + "loss": 0.9964, + "step": 8616 + }, + { + "epoch": 0.9062536973983462, + "grad_norm": 2.190987178982146, + "learning_rate": 1.0984097910966802e-07, + "loss": 0.9959, + "step": 8617 + }, + { + "epoch": 0.9063588678401935, + "grad_norm": 3.111299811220905, + "learning_rate": 1.0959637727925881e-07, + "loss": 1.0023, + "step": 8618 + }, + { + "epoch": 0.9064640382820408, + "grad_norm": 3.157837969992092, + "learning_rate": 1.0935204199742255e-07, + "loss": 1.0217, + "step": 8619 + }, + { + "epoch": 0.9065692087238881, + "grad_norm": 2.4262394538613283, + "learning_rate": 1.0910797329140466e-07, + "loss": 1.0104, + "step": 8620 + }, + { + "epoch": 0.9066743791657355, + "grad_norm": 2.2045284669897396, + "learning_rate": 1.0886417118842113e-07, + "loss": 0.9879, + "step": 8621 + }, + { + "epoch": 0.9067795496075828, + "grad_norm": 3.4589297463267084, + "learning_rate": 1.0862063571565773e-07, + "loss": 0.9798, + "step": 8622 + }, + { + "epoch": 0.9068847200494301, + "grad_norm": 2.8785742456954004, + "learning_rate": 1.0837736690026996e-07, + "loss": 0.9865, + "step": 8623 + }, + { + "epoch": 0.9069898904912774, + "grad_norm": 2.194186906971404, + "learning_rate": 1.0813436476938444e-07, + "loss": 0.9963, + "step": 8624 + }, + { + "epoch": 0.9070950609331248, + "grad_norm": 2.6444058808730273, + "learning_rate": 1.0789162935009839e-07, + "loss": 0.991, + "step": 8625 + }, + { + "epoch": 0.9072002313749721, + "grad_norm": 2.6206618870443417, + "learning_rate": 1.0764916066947795e-07, + "loss": 0.9771, + "step": 8626 + }, + { + "epoch": 0.9073054018168194, + "grad_norm": 1.9213999370076098, + "learning_rate": 1.0740695875456064e-07, + "loss": 0.9315, + "step": 8627 + }, + { + "epoch": 0.9074105722586667, + "grad_norm": 2.700327806526417, + "learning_rate": 1.0716502363235348e-07, + "loss": 0.9627, + "step": 8628 + }, + { + "epoch": 0.907515742700514, + "grad_norm": 2.9427305067667624, + "learning_rate": 1.069233553298346e-07, + "loss": 0.9787, + "step": 8629 + }, + { + "epoch": 0.9076209131423614, + "grad_norm": 2.7356277181692445, + "learning_rate": 1.0668195387395164e-07, + "loss": 1.0438, + "step": 8630 + }, + { + "epoch": 0.9077260835842087, + "grad_norm": 2.3404693898022777, + "learning_rate": 1.0644081929162275e-07, + "loss": 0.9924, + "step": 8631 + }, + { + "epoch": 0.907831254026056, + "grad_norm": 2.1474982707381893, + "learning_rate": 1.0619995160973645e-07, + "loss": 0.9078, + "step": 8632 + }, + { + "epoch": 0.9079364244679033, + "grad_norm": 2.233221210407833, + "learning_rate": 1.0595935085515069e-07, + "loss": 1.0021, + "step": 8633 + }, + { + "epoch": 0.9080415949097506, + "grad_norm": 2.707217846445669, + "learning_rate": 1.0571901705469567e-07, + "loss": 1.0373, + "step": 8634 + }, + { + "epoch": 0.9081467653515979, + "grad_norm": 2.0945684851181987, + "learning_rate": 1.0547895023516913e-07, + "loss": 0.9887, + "step": 8635 + }, + { + "epoch": 0.9082519357934452, + "grad_norm": 2.818236365087763, + "learning_rate": 1.0523915042334132e-07, + "loss": 0.9345, + "step": 8636 + }, + { + "epoch": 0.9083571062352925, + "grad_norm": 2.724573085747091, + "learning_rate": 1.0499961764595112e-07, + "loss": 1.036, + "step": 8637 + }, + { + "epoch": 0.9084622766771399, + "grad_norm": 2.706854585553475, + "learning_rate": 1.0476035192970857e-07, + "loss": 0.953, + "step": 8638 + }, + { + "epoch": 0.9085674471189872, + "grad_norm": 1.9228244106745358, + "learning_rate": 1.045213533012937e-07, 
+ "loss": 0.9746, + "step": 8639 + }, + { + "epoch": 0.9086726175608345, + "grad_norm": 2.5814153527028973, + "learning_rate": 1.042826217873566e-07, + "loss": 0.9523, + "step": 8640 + }, + { + "epoch": 0.9087777880026818, + "grad_norm": 2.9607847333673694, + "learning_rate": 1.0404415741451818e-07, + "loss": 0.9877, + "step": 8641 + }, + { + "epoch": 0.9088829584445292, + "grad_norm": 2.9016091841556984, + "learning_rate": 1.0380596020936801e-07, + "loss": 1.0021, + "step": 8642 + }, + { + "epoch": 0.9089881288863765, + "grad_norm": 2.633994800512432, + "learning_rate": 1.035680301984679e-07, + "loss": 0.9496, + "step": 8643 + }, + { + "epoch": 0.9090932993282238, + "grad_norm": 3.3617291767747357, + "learning_rate": 1.0333036740834857e-07, + "loss": 0.9403, + "step": 8644 + }, + { + "epoch": 0.9091984697700711, + "grad_norm": 2.677523368933197, + "learning_rate": 1.0309297186551131e-07, + "loss": 0.9678, + "step": 8645 + }, + { + "epoch": 0.9093036402119185, + "grad_norm": 2.373476234340472, + "learning_rate": 1.0285584359642747e-07, + "loss": 0.9627, + "step": 8646 + }, + { + "epoch": 0.9094088106537658, + "grad_norm": 2.0766733599038845, + "learning_rate": 1.0261898262753811e-07, + "loss": 0.9632, + "step": 8647 + }, + { + "epoch": 0.9095139810956131, + "grad_norm": 2.4397958236252553, + "learning_rate": 1.0238238898525654e-07, + "loss": 0.9787, + "step": 8648 + }, + { + "epoch": 0.9096191515374604, + "grad_norm": 2.429982084341817, + "learning_rate": 1.0214606269596361e-07, + "loss": 0.9983, + "step": 8649 + }, + { + "epoch": 0.9097243219793077, + "grad_norm": 1.7677248006677606, + "learning_rate": 1.0191000378601213e-07, + "loss": 0.9348, + "step": 8650 + }, + { + "epoch": 0.9098294924211551, + "grad_norm": 1.8993802694959563, + "learning_rate": 1.0167421228172381e-07, + "loss": 1.0176, + "step": 8651 + }, + { + "epoch": 0.9099346628630024, + "grad_norm": 2.6748517834984593, + "learning_rate": 1.0143868820939179e-07, + "loss": 0.986, + "step": 8652 + }, + { + "epoch": 0.9100398333048497, + "grad_norm": 2.622839246677593, + "learning_rate": 1.0120343159527923e-07, + "loss": 0.9792, + "step": 8653 + }, + { + "epoch": 0.910145003746697, + "grad_norm": 1.9757068387048435, + "learning_rate": 1.0096844246561794e-07, + "loss": 0.9901, + "step": 8654 + }, + { + "epoch": 0.9102501741885443, + "grad_norm": 2.095050670569624, + "learning_rate": 1.0073372084661193e-07, + "loss": 0.9605, + "step": 8655 + }, + { + "epoch": 0.9103553446303916, + "grad_norm": 2.750980361243624, + "learning_rate": 1.0049926676443361e-07, + "loss": 0.9974, + "step": 8656 + }, + { + "epoch": 0.9104605150722389, + "grad_norm": 2.961100634051163, + "learning_rate": 1.0026508024522791e-07, + "loss": 0.981, + "step": 8657 + }, + { + "epoch": 0.9105656855140862, + "grad_norm": 1.8783234007957286, + "learning_rate": 1.0003116131510698e-07, + "loss": 0.989, + "step": 8658 + }, + { + "epoch": 0.9106708559559336, + "grad_norm": 2.671085261106148, + "learning_rate": 9.979751000015552e-08, + "loss": 0.9828, + "step": 8659 + }, + { + "epoch": 0.9107760263977809, + "grad_norm": 3.0384313224763737, + "learning_rate": 9.956412632642715e-08, + "loss": 0.9908, + "step": 8660 + }, + { + "epoch": 0.9108811968396282, + "grad_norm": 2.5512974558431227, + "learning_rate": 9.933101031994547e-08, + "loss": 0.9434, + "step": 8661 + }, + { + "epoch": 0.9109863672814755, + "grad_norm": 1.737662256811027, + "learning_rate": 9.909816200670552e-08, + "loss": 0.9693, + "step": 8662 + }, + { + "epoch": 0.9110915377233229, + "grad_norm": 
3.0073945279323833, + "learning_rate": 9.886558141267127e-08, + "loss": 0.9521, + "step": 8663 + }, + { + "epoch": 0.9111967081651702, + "grad_norm": 1.884812573575343, + "learning_rate": 9.863326856377753e-08, + "loss": 0.9782, + "step": 8664 + }, + { + "epoch": 0.9113018786070175, + "grad_norm": 2.722404955393905, + "learning_rate": 9.840122348592857e-08, + "loss": 0.9983, + "step": 8665 + }, + { + "epoch": 0.9114070490488648, + "grad_norm": 2.5101444576387344, + "learning_rate": 9.816944620499952e-08, + "loss": 0.9915, + "step": 8666 + }, + { + "epoch": 0.9115122194907122, + "grad_norm": 2.763819589917786, + "learning_rate": 9.793793674683555e-08, + "loss": 1.0128, + "step": 8667 + }, + { + "epoch": 0.9116173899325595, + "grad_norm": 2.690122259083069, + "learning_rate": 9.770669513725128e-08, + "loss": 1.0093, + "step": 8668 + }, + { + "epoch": 0.9117225603744068, + "grad_norm": 2.5984553040911003, + "learning_rate": 9.747572140203221e-08, + "loss": 0.9857, + "step": 8669 + }, + { + "epoch": 0.9118277308162541, + "grad_norm": 2.3984861025276194, + "learning_rate": 9.724501556693327e-08, + "loss": 1.0068, + "step": 8670 + }, + { + "epoch": 0.9119329012581014, + "grad_norm": 3.124841252252845, + "learning_rate": 9.701457765768113e-08, + "loss": 0.9785, + "step": 8671 + }, + { + "epoch": 0.9120380716999488, + "grad_norm": 2.377091759025528, + "learning_rate": 9.678440769996994e-08, + "loss": 0.9923, + "step": 8672 + }, + { + "epoch": 0.9121432421417961, + "grad_norm": 2.439191969177443, + "learning_rate": 9.655450571946667e-08, + "loss": 0.9754, + "step": 8673 + }, + { + "epoch": 0.9122484125836434, + "grad_norm": 2.3851721865392355, + "learning_rate": 9.632487174180638e-08, + "loss": 0.9982, + "step": 8674 + }, + { + "epoch": 0.9123535830254906, + "grad_norm": 2.172576222886981, + "learning_rate": 9.609550579259497e-08, + "loss": 0.9679, + "step": 8675 + }, + { + "epoch": 0.912458753467338, + "grad_norm": 2.301925417223173, + "learning_rate": 9.586640789740948e-08, + "loss": 0.9739, + "step": 8676 + }, + { + "epoch": 0.9125639239091853, + "grad_norm": 2.478887405016733, + "learning_rate": 9.563757808179502e-08, + "loss": 1.0417, + "step": 8677 + }, + { + "epoch": 0.9126690943510326, + "grad_norm": 2.7317447487137407, + "learning_rate": 9.54090163712687e-08, + "loss": 0.9742, + "step": 8678 + }, + { + "epoch": 0.9127742647928799, + "grad_norm": 2.7528530250768046, + "learning_rate": 9.518072279131596e-08, + "loss": 0.9546, + "step": 8679 + }, + { + "epoch": 0.9128794352347273, + "grad_norm": 2.128520981516656, + "learning_rate": 9.495269736739448e-08, + "loss": 0.9033, + "step": 8680 + }, + { + "epoch": 0.9129846056765746, + "grad_norm": 2.8260209548553346, + "learning_rate": 9.472494012493034e-08, + "loss": 0.9833, + "step": 8681 + }, + { + "epoch": 0.9130897761184219, + "grad_norm": 2.710474999545366, + "learning_rate": 9.449745108931985e-08, + "loss": 0.9626, + "step": 8682 + }, + { + "epoch": 0.9131949465602692, + "grad_norm": 2.4800110937909734, + "learning_rate": 9.427023028593051e-08, + "loss": 0.9755, + "step": 8683 + }, + { + "epoch": 0.9133001170021166, + "grad_norm": 2.3592614302731683, + "learning_rate": 9.404327774009819e-08, + "loss": 0.9557, + "step": 8684 + }, + { + "epoch": 0.9134052874439639, + "grad_norm": 2.9956075352373364, + "learning_rate": 9.381659347713123e-08, + "loss": 0.9975, + "step": 8685 + }, + { + "epoch": 0.9135104578858112, + "grad_norm": 2.1831281057025613, + "learning_rate": 9.359017752230582e-08, + "loss": 0.9784, + "step": 8686 + }, + { + "epoch": 
0.9136156283276585, + "grad_norm": 2.2725775975518148, + "learning_rate": 9.336402990086924e-08, + "loss": 1.0201, + "step": 8687 + }, + { + "epoch": 0.9137207987695058, + "grad_norm": 2.794970120659367, + "learning_rate": 9.313815063803883e-08, + "loss": 0.927, + "step": 8688 + }, + { + "epoch": 0.9138259692113532, + "grad_norm": 2.447763388249118, + "learning_rate": 9.291253975900138e-08, + "loss": 0.9516, + "step": 8689 + }, + { + "epoch": 0.9139311396532005, + "grad_norm": 2.5939870031479773, + "learning_rate": 9.268719728891512e-08, + "loss": 1.0149, + "step": 8690 + }, + { + "epoch": 0.9140363100950478, + "grad_norm": 2.5305551362373864, + "learning_rate": 9.24621232529066e-08, + "loss": 1.0017, + "step": 8691 + }, + { + "epoch": 0.9141414805368951, + "grad_norm": 2.901219705340327, + "learning_rate": 9.223731767607436e-08, + "loss": 0.9446, + "step": 8692 + }, + { + "epoch": 0.9142466509787425, + "grad_norm": 2.353334916914352, + "learning_rate": 9.201278058348446e-08, + "loss": 0.9655, + "step": 8693 + }, + { + "epoch": 0.9143518214205898, + "grad_norm": 2.2889584109431094, + "learning_rate": 9.17885120001763e-08, + "loss": 1.0182, + "step": 8694 + }, + { + "epoch": 0.914456991862437, + "grad_norm": 2.5187641418281723, + "learning_rate": 9.156451195115601e-08, + "loss": 0.9578, + "step": 8695 + }, + { + "epoch": 0.9145621623042843, + "grad_norm": 2.466078538297683, + "learning_rate": 9.134078046140249e-08, + "loss": 0.949, + "step": 8696 + }, + { + "epoch": 0.9146673327461317, + "grad_norm": 1.969452495220898, + "learning_rate": 9.111731755586329e-08, + "loss": 0.9636, + "step": 8697 + }, + { + "epoch": 0.914772503187979, + "grad_norm": 2.2396101458216187, + "learning_rate": 9.089412325945513e-08, + "loss": 0.9892, + "step": 8698 + }, + { + "epoch": 0.9148776736298263, + "grad_norm": 2.3519886141123956, + "learning_rate": 9.067119759706755e-08, + "loss": 0.9746, + "step": 8699 + }, + { + "epoch": 0.9149828440716736, + "grad_norm": 2.4208396168966275, + "learning_rate": 9.044854059355763e-08, + "loss": 1.05, + "step": 8700 + }, + { + "epoch": 0.915088014513521, + "grad_norm": 3.9523038988472425, + "learning_rate": 9.022615227375353e-08, + "loss": 1.0266, + "step": 8701 + }, + { + "epoch": 0.9151931849553683, + "grad_norm": 2.002467211998927, + "learning_rate": 9.000403266245294e-08, + "loss": 0.9787, + "step": 8702 + }, + { + "epoch": 0.9152983553972156, + "grad_norm": 2.0671531516467776, + "learning_rate": 8.978218178442439e-08, + "loss": 0.9251, + "step": 8703 + }, + { + "epoch": 0.9154035258390629, + "grad_norm": 2.7026551909268046, + "learning_rate": 8.956059966440583e-08, + "loss": 1.0106, + "step": 8704 + }, + { + "epoch": 0.9155086962809103, + "grad_norm": 2.3171869376715213, + "learning_rate": 8.933928632710531e-08, + "loss": 0.9916, + "step": 8705 + }, + { + "epoch": 0.9156138667227576, + "grad_norm": 2.317933352612227, + "learning_rate": 8.911824179720113e-08, + "loss": 0.973, + "step": 8706 + }, + { + "epoch": 0.9157190371646049, + "grad_norm": 2.524849884723814, + "learning_rate": 8.889746609934108e-08, + "loss": 1.0246, + "step": 8707 + }, + { + "epoch": 0.9158242076064522, + "grad_norm": 2.8343370527261627, + "learning_rate": 8.867695925814407e-08, + "loss": 0.9862, + "step": 8708 + }, + { + "epoch": 0.9159293780482995, + "grad_norm": 2.2205891500846002, + "learning_rate": 8.845672129819766e-08, + "loss": 0.9454, + "step": 8709 + }, + { + "epoch": 0.9160345484901469, + "grad_norm": 2.624543266262543, + "learning_rate": 8.823675224406052e-08, + "loss": 1.0148, + "step": 
8710 + }, + { + "epoch": 0.9161397189319942, + "grad_norm": 1.924274060643869, + "learning_rate": 8.801705212026058e-08, + "loss": 0.9628, + "step": 8711 + }, + { + "epoch": 0.9162448893738415, + "grad_norm": 2.6357976887412855, + "learning_rate": 8.779762095129623e-08, + "loss": 0.9868, + "step": 8712 + }, + { + "epoch": 0.9163500598156888, + "grad_norm": 2.201860694431399, + "learning_rate": 8.757845876163601e-08, + "loss": 0.9836, + "step": 8713 + }, + { + "epoch": 0.9164552302575362, + "grad_norm": 2.6034939502760923, + "learning_rate": 8.735956557571785e-08, + "loss": 0.9164, + "step": 8714 + }, + { + "epoch": 0.9165604006993835, + "grad_norm": 2.7455401077210997, + "learning_rate": 8.714094141795026e-08, + "loss": 1.0081, + "step": 8715 + }, + { + "epoch": 0.9166655711412307, + "grad_norm": 3.016771299173225, + "learning_rate": 8.692258631271127e-08, + "loss": 0.928, + "step": 8716 + }, + { + "epoch": 0.916770741583078, + "grad_norm": 2.072039917472285, + "learning_rate": 8.670450028434946e-08, + "loss": 0.9588, + "step": 8717 + }, + { + "epoch": 0.9168759120249254, + "grad_norm": 2.987620512435773, + "learning_rate": 8.648668335718313e-08, + "loss": 0.9895, + "step": 8718 + }, + { + "epoch": 0.9169810824667727, + "grad_norm": 2.5553475573583873, + "learning_rate": 8.626913555550009e-08, + "loss": 0.9904, + "step": 8719 + }, + { + "epoch": 0.91708625290862, + "grad_norm": 3.2548118763612157, + "learning_rate": 8.605185690355927e-08, + "loss": 1.0137, + "step": 8720 + }, + { + "epoch": 0.9171914233504673, + "grad_norm": 2.524991929264644, + "learning_rate": 8.583484742558823e-08, + "loss": 0.9857, + "step": 8721 + }, + { + "epoch": 0.9172965937923147, + "grad_norm": 3.326075876639707, + "learning_rate": 8.561810714578595e-08, + "loss": 0.9922, + "step": 8722 + }, + { + "epoch": 0.917401764234162, + "grad_norm": 2.06394387394277, + "learning_rate": 8.54016360883203e-08, + "loss": 0.9517, + "step": 8723 + }, + { + "epoch": 0.9175069346760093, + "grad_norm": 2.7063497472638707, + "learning_rate": 8.518543427732951e-08, + "loss": 1.0339, + "step": 8724 + }, + { + "epoch": 0.9176121051178566, + "grad_norm": 2.6198469236901722, + "learning_rate": 8.496950173692147e-08, + "loss": 0.9994, + "step": 8725 + }, + { + "epoch": 0.917717275559704, + "grad_norm": 2.0768494403671203, + "learning_rate": 8.475383849117474e-08, + "loss": 0.9819, + "step": 8726 + }, + { + "epoch": 0.9178224460015513, + "grad_norm": 2.0259311339586747, + "learning_rate": 8.453844456413729e-08, + "loss": 0.9839, + "step": 8727 + }, + { + "epoch": 0.9179276164433986, + "grad_norm": 3.4906371799996205, + "learning_rate": 8.43233199798274e-08, + "loss": 0.9509, + "step": 8728 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 2.233812559283946, + "learning_rate": 8.410846476223283e-08, + "loss": 1.001, + "step": 8729 + }, + { + "epoch": 0.9181379573270932, + "grad_norm": 2.680065549787725, + "learning_rate": 8.389387893531137e-08, + "loss": 0.9476, + "step": 8730 + }, + { + "epoch": 0.9182431277689406, + "grad_norm": 1.8569584887135309, + "learning_rate": 8.367956252299192e-08, + "loss": 0.9769, + "step": 8731 + }, + { + "epoch": 0.9183482982107879, + "grad_norm": 2.3494913181215695, + "learning_rate": 8.346551554917204e-08, + "loss": 0.9841, + "step": 8732 + }, + { + "epoch": 0.9184534686526352, + "grad_norm": 2.450100459990453, + "learning_rate": 8.325173803771902e-08, + "loss": 0.9682, + "step": 8733 + }, + { + "epoch": 0.9185586390944825, + "grad_norm": 2.3534886505356836, + "learning_rate": 8.303823001247102e-08, + 
"loss": 0.9905, + "step": 8734 + }, + { + "epoch": 0.9186638095363299, + "grad_norm": 2.0769040702020463, + "learning_rate": 8.282499149723622e-08, + "loss": 0.9871, + "step": 8735 + }, + { + "epoch": 0.9187689799781771, + "grad_norm": 2.328949116074991, + "learning_rate": 8.261202251579253e-08, + "loss": 0.9732, + "step": 8736 + }, + { + "epoch": 0.9188741504200244, + "grad_norm": 2.3004852069789754, + "learning_rate": 8.239932309188681e-08, + "loss": 0.9529, + "step": 8737 + }, + { + "epoch": 0.9189793208618717, + "grad_norm": 2.467224662817621, + "learning_rate": 8.218689324923729e-08, + "loss": 0.9588, + "step": 8738 + }, + { + "epoch": 0.919084491303719, + "grad_norm": 2.2727911833425076, + "learning_rate": 8.197473301153142e-08, + "loss": 1.0025, + "step": 8739 + }, + { + "epoch": 0.9191896617455664, + "grad_norm": 2.8021991978145455, + "learning_rate": 8.176284240242638e-08, + "loss": 1.0289, + "step": 8740 + }, + { + "epoch": 0.9192948321874137, + "grad_norm": 2.5953858991707492, + "learning_rate": 8.15512214455505e-08, + "loss": 0.9927, + "step": 8741 + }, + { + "epoch": 0.919400002629261, + "grad_norm": 2.374685117659884, + "learning_rate": 8.133987016450018e-08, + "loss": 0.9481, + "step": 8742 + }, + { + "epoch": 0.9195051730711084, + "grad_norm": 2.722786813359639, + "learning_rate": 8.112878858284351e-08, + "loss": 0.9962, + "step": 8743 + }, + { + "epoch": 0.9196103435129557, + "grad_norm": 2.843740539019621, + "learning_rate": 8.091797672411666e-08, + "loss": 0.9894, + "step": 8744 + }, + { + "epoch": 0.919715513954803, + "grad_norm": 2.462450080595821, + "learning_rate": 8.070743461182807e-08, + "loss": 0.9424, + "step": 8745 + }, + { + "epoch": 0.9198206843966503, + "grad_norm": 2.200892646883302, + "learning_rate": 8.04971622694542e-08, + "loss": 0.9761, + "step": 8746 + }, + { + "epoch": 0.9199258548384976, + "grad_norm": 1.8444502246648864, + "learning_rate": 8.028715972044216e-08, + "loss": 0.9718, + "step": 8747 + }, + { + "epoch": 0.920031025280345, + "grad_norm": 2.6368564963597954, + "learning_rate": 8.007742698820848e-08, + "loss": 0.9807, + "step": 8748 + }, + { + "epoch": 0.9201361957221923, + "grad_norm": 2.85801832795167, + "learning_rate": 7.986796409614028e-08, + "loss": 0.9885, + "step": 8749 + }, + { + "epoch": 0.9202413661640396, + "grad_norm": 2.2819001687005476, + "learning_rate": 7.965877106759473e-08, + "loss": 0.9686, + "step": 8750 + }, + { + "epoch": 0.920346536605887, + "grad_norm": 2.506401312891756, + "learning_rate": 7.944984792589788e-08, + "loss": 1.021, + "step": 8751 + }, + { + "epoch": 0.9204517070477343, + "grad_norm": 2.446263624804167, + "learning_rate": 7.924119469434666e-08, + "loss": 1.0275, + "step": 8752 + }, + { + "epoch": 0.9205568774895816, + "grad_norm": 2.557300434643533, + "learning_rate": 7.90328113962069e-08, + "loss": 0.9428, + "step": 8753 + }, + { + "epoch": 0.9206620479314289, + "grad_norm": 2.3857984916068538, + "learning_rate": 7.882469805471582e-08, + "loss": 1.0545, + "step": 8754 + }, + { + "epoch": 0.9207672183732762, + "grad_norm": 2.7202727116635312, + "learning_rate": 7.861685469307905e-08, + "loss": 1.0019, + "step": 8755 + }, + { + "epoch": 0.9208723888151235, + "grad_norm": 1.5934819593693805, + "learning_rate": 7.840928133447306e-08, + "loss": 0.9311, + "step": 8756 + }, + { + "epoch": 0.9209775592569708, + "grad_norm": 2.9845596688934655, + "learning_rate": 7.82019780020435e-08, + "loss": 1.0253, + "step": 8757 + }, + { + "epoch": 0.9210827296988181, + "grad_norm": 2.046747867064963, + "learning_rate": 
7.799494471890684e-08, + "loss": 1.0192, + "step": 8758 + }, + { + "epoch": 0.9211879001406654, + "grad_norm": 2.5073020601329485, + "learning_rate": 7.778818150814854e-08, + "loss": 0.9559, + "step": 8759 + }, + { + "epoch": 0.9212930705825128, + "grad_norm": 2.5720686156196897, + "learning_rate": 7.75816883928246e-08, + "loss": 1.013, + "step": 8760 + }, + { + "epoch": 0.9213982410243601, + "grad_norm": 2.4862086127778675, + "learning_rate": 7.73754653959602e-08, + "loss": 0.9639, + "step": 8761 + }, + { + "epoch": 0.9215034114662074, + "grad_norm": 2.364000470048889, + "learning_rate": 7.716951254055111e-08, + "loss": 0.9796, + "step": 8762 + }, + { + "epoch": 0.9216085819080547, + "grad_norm": 1.8736154182919098, + "learning_rate": 7.69638298495623e-08, + "loss": 0.9932, + "step": 8763 + }, + { + "epoch": 0.921713752349902, + "grad_norm": 2.178955434285585, + "learning_rate": 7.675841734592987e-08, + "loss": 1.0271, + "step": 8764 + }, + { + "epoch": 0.9218189227917494, + "grad_norm": 3.1224740330325993, + "learning_rate": 7.655327505255772e-08, + "loss": 0.9938, + "step": 8765 + }, + { + "epoch": 0.9219240932335967, + "grad_norm": 2.6693925921641553, + "learning_rate": 7.634840299232171e-08, + "loss": 0.9828, + "step": 8766 + }, + { + "epoch": 0.922029263675444, + "grad_norm": 2.7013624068102207, + "learning_rate": 7.614380118806636e-08, + "loss": 0.9895, + "step": 8767 + }, + { + "epoch": 0.9221344341172913, + "grad_norm": 2.201528972225391, + "learning_rate": 7.593946966260618e-08, + "loss": 0.9644, + "step": 8768 + }, + { + "epoch": 0.9222396045591387, + "grad_norm": 2.3707667253047697, + "learning_rate": 7.573540843872602e-08, + "loss": 1.0226, + "step": 8769 + }, + { + "epoch": 0.922344775000986, + "grad_norm": 2.1492440827092714, + "learning_rate": 7.553161753918015e-08, + "loss": 0.9661, + "step": 8770 + }, + { + "epoch": 0.9224499454428333, + "grad_norm": 2.1183442817355154, + "learning_rate": 7.532809698669263e-08, + "loss": 0.997, + "step": 8771 + }, + { + "epoch": 0.9225551158846806, + "grad_norm": 2.0129347965512996, + "learning_rate": 7.512484680395782e-08, + "loss": 0.9551, + "step": 8772 + }, + { + "epoch": 0.922660286326528, + "grad_norm": 3.3447687257492644, + "learning_rate": 7.492186701364007e-08, + "loss": 0.9852, + "step": 8773 + }, + { + "epoch": 0.9227654567683753, + "grad_norm": 2.5874808124779007, + "learning_rate": 7.471915763837268e-08, + "loss": 0.9863, + "step": 8774 + }, + { + "epoch": 0.9228706272102226, + "grad_norm": 2.247477950163746, + "learning_rate": 7.451671870075949e-08, + "loss": 0.9511, + "step": 8775 + }, + { + "epoch": 0.9229757976520699, + "grad_norm": 2.349577908913971, + "learning_rate": 7.431455022337386e-08, + "loss": 0.9971, + "step": 8776 + }, + { + "epoch": 0.9230809680939172, + "grad_norm": 2.4880434952386934, + "learning_rate": 7.411265222875913e-08, + "loss": 0.9879, + "step": 8777 + }, + { + "epoch": 0.9231861385357645, + "grad_norm": 2.6428533811226704, + "learning_rate": 7.391102473942897e-08, + "loss": 0.9675, + "step": 8778 + }, + { + "epoch": 0.9232913089776118, + "grad_norm": 2.2938047586552366, + "learning_rate": 7.370966777786564e-08, + "loss": 0.9626, + "step": 8779 + }, + { + "epoch": 0.9233964794194591, + "grad_norm": 2.684986649020841, + "learning_rate": 7.350858136652262e-08, + "loss": 1.0519, + "step": 8780 + }, + { + "epoch": 0.9235016498613065, + "grad_norm": 3.0768759986996947, + "learning_rate": 7.330776552782248e-08, + "loss": 0.9541, + "step": 8781 + }, + { + "epoch": 0.9236068203031538, + "grad_norm": 
2.480420721077424, + "learning_rate": 7.310722028415762e-08, + "loss": 0.9568, + "step": 8782 + }, + { + "epoch": 0.9237119907450011, + "grad_norm": 2.7358308672297107, + "learning_rate": 7.290694565789069e-08, + "loss": 0.9606, + "step": 8783 + }, + { + "epoch": 0.9238171611868484, + "grad_norm": 2.4196600463864177, + "learning_rate": 7.270694167135356e-08, + "loss": 0.9517, + "step": 8784 + }, + { + "epoch": 0.9239223316286957, + "grad_norm": 3.033022762994752, + "learning_rate": 7.25072083468481e-08, + "loss": 1.033, + "step": 8785 + }, + { + "epoch": 0.9240275020705431, + "grad_norm": 2.710152700629641, + "learning_rate": 7.230774570664623e-08, + "loss": 0.9857, + "step": 8786 + }, + { + "epoch": 0.9241326725123904, + "grad_norm": 2.4433124063205027, + "learning_rate": 7.210855377299014e-08, + "loss": 0.9722, + "step": 8787 + }, + { + "epoch": 0.9242378429542377, + "grad_norm": 3.0106631380990425, + "learning_rate": 7.190963256809069e-08, + "loss": 0.9721, + "step": 8788 + }, + { + "epoch": 0.924343013396085, + "grad_norm": 2.482926124872465, + "learning_rate": 7.17109821141293e-08, + "loss": 0.9866, + "step": 8789 + }, + { + "epoch": 0.9244481838379324, + "grad_norm": 2.4915008975300927, + "learning_rate": 7.151260243325686e-08, + "loss": 1.024, + "step": 8790 + }, + { + "epoch": 0.9245533542797797, + "grad_norm": 2.9578504303961055, + "learning_rate": 7.13144935475943e-08, + "loss": 0.9146, + "step": 8791 + }, + { + "epoch": 0.924658524721627, + "grad_norm": 3.1081419272442856, + "learning_rate": 7.111665547923252e-08, + "loss": 0.9885, + "step": 8792 + }, + { + "epoch": 0.9247636951634743, + "grad_norm": 2.3985246667372544, + "learning_rate": 7.091908825023197e-08, + "loss": 1.0025, + "step": 8793 + }, + { + "epoch": 0.9248688656053217, + "grad_norm": 2.428656822228575, + "learning_rate": 7.072179188262252e-08, + "loss": 0.9968, + "step": 8794 + }, + { + "epoch": 0.924974036047169, + "grad_norm": 2.255455390801362, + "learning_rate": 7.052476639840489e-08, + "loss": 0.9685, + "step": 8795 + }, + { + "epoch": 0.9250792064890163, + "grad_norm": 2.3473107950280894, + "learning_rate": 7.032801181954873e-08, + "loss": 0.9967, + "step": 8796 + }, + { + "epoch": 0.9251843769308635, + "grad_norm": 2.762465947655461, + "learning_rate": 7.013152816799317e-08, + "loss": 1.0137, + "step": 8797 + }, + { + "epoch": 0.9252895473727109, + "grad_norm": 2.082339717782587, + "learning_rate": 6.993531546564874e-08, + "loss": 0.9553, + "step": 8798 + }, + { + "epoch": 0.9253947178145582, + "grad_norm": 2.4355695449007193, + "learning_rate": 6.973937373439349e-08, + "loss": 0.9761, + "step": 8799 + }, + { + "epoch": 0.9254998882564055, + "grad_norm": 2.3323761744575453, + "learning_rate": 6.954370299607715e-08, + "loss": 0.9978, + "step": 8800 + }, + { + "epoch": 0.9256050586982528, + "grad_norm": 2.7276265273799636, + "learning_rate": 6.93483032725184e-08, + "loss": 1.014, + "step": 8801 + }, + { + "epoch": 0.9257102291401001, + "grad_norm": 2.451649200404242, + "learning_rate": 6.91531745855059e-08, + "loss": 0.9806, + "step": 8802 + }, + { + "epoch": 0.9258153995819475, + "grad_norm": 2.3782834691727186, + "learning_rate": 6.895831695679756e-08, + "loss": 0.9996, + "step": 8803 + }, + { + "epoch": 0.9259205700237948, + "grad_norm": 2.725803550156134, + "learning_rate": 6.876373040812234e-08, + "loss": 0.9837, + "step": 8804 + }, + { + "epoch": 0.9260257404656421, + "grad_norm": 1.692896666258162, + "learning_rate": 6.856941496117736e-08, + "loss": 0.9844, + "step": 8805 + }, + { + "epoch": 
0.9261309109074894, + "grad_norm": 2.627960374778733, + "learning_rate": 6.837537063763083e-08, + "loss": 0.9603, + "step": 8806 + }, + { + "epoch": 0.9262360813493368, + "grad_norm": 2.4123212586639045, + "learning_rate": 6.818159745911989e-08, + "loss": 0.9272, + "step": 8807 + }, + { + "epoch": 0.9263412517911841, + "grad_norm": 2.672047655684633, + "learning_rate": 6.798809544725171e-08, + "loss": 0.9814, + "step": 8808 + }, + { + "epoch": 0.9264464222330314, + "grad_norm": 2.8051273113292434, + "learning_rate": 6.779486462360346e-08, + "loss": 0.9619, + "step": 8809 + }, + { + "epoch": 0.9265515926748787, + "grad_norm": 1.930413903321851, + "learning_rate": 6.760190500972208e-08, + "loss": 0.9405, + "step": 8810 + }, + { + "epoch": 0.9266567631167261, + "grad_norm": 2.0349253679776975, + "learning_rate": 6.740921662712368e-08, + "loss": 0.9864, + "step": 8811 + }, + { + "epoch": 0.9267619335585734, + "grad_norm": 2.6944705175063954, + "learning_rate": 6.721679949729499e-08, + "loss": 0.9921, + "step": 8812 + }, + { + "epoch": 0.9268671040004207, + "grad_norm": 2.118350315003092, + "learning_rate": 6.702465364169103e-08, + "loss": 0.9487, + "step": 8813 + }, + { + "epoch": 0.926972274442268, + "grad_norm": 2.6969741905193363, + "learning_rate": 6.683277908173858e-08, + "loss": 0.9865, + "step": 8814 + }, + { + "epoch": 0.9270774448841154, + "grad_norm": 2.557886102693444, + "learning_rate": 6.664117583883272e-08, + "loss": 1.017, + "step": 8815 + }, + { + "epoch": 0.9271826153259627, + "grad_norm": 2.0970620367491635, + "learning_rate": 6.64498439343389e-08, + "loss": 1.0267, + "step": 8816 + }, + { + "epoch": 0.9272877857678099, + "grad_norm": 2.4865425974749003, + "learning_rate": 6.625878338959168e-08, + "loss": 1.0033, + "step": 8817 + }, + { + "epoch": 0.9273929562096572, + "grad_norm": 1.8276518666416652, + "learning_rate": 6.606799422589627e-08, + "loss": 0.9588, + "step": 8818 + }, + { + "epoch": 0.9274981266515046, + "grad_norm": 2.3070835376793304, + "learning_rate": 6.587747646452675e-08, + "loss": 0.966, + "step": 8819 + }, + { + "epoch": 0.9276032970933519, + "grad_norm": 2.5299309056388752, + "learning_rate": 6.568723012672779e-08, + "loss": 0.9866, + "step": 8820 + }, + { + "epoch": 0.9277084675351992, + "grad_norm": 2.547941361140502, + "learning_rate": 6.549725523371298e-08, + "loss": 0.992, + "step": 8821 + }, + { + "epoch": 0.9278136379770465, + "grad_norm": 2.417261025425601, + "learning_rate": 6.530755180666593e-08, + "loss": 0.981, + "step": 8822 + }, + { + "epoch": 0.9279188084188938, + "grad_norm": 2.9686003899940565, + "learning_rate": 6.511811986674028e-08, + "loss": 1.0337, + "step": 8823 + }, + { + "epoch": 0.9280239788607412, + "grad_norm": 2.8340124141461978, + "learning_rate": 6.49289594350594e-08, + "loss": 0.9824, + "step": 8824 + }, + { + "epoch": 0.9281291493025885, + "grad_norm": 3.171176281484354, + "learning_rate": 6.47400705327153e-08, + "loss": 0.9947, + "step": 8825 + }, + { + "epoch": 0.9282343197444358, + "grad_norm": 2.564414628087072, + "learning_rate": 6.455145318077144e-08, + "loss": 0.9985, + "step": 8826 + }, + { + "epoch": 0.9283394901862831, + "grad_norm": 2.4567715942774337, + "learning_rate": 6.436310740025986e-08, + "loss": 0.9892, + "step": 8827 + }, + { + "epoch": 0.9284446606281305, + "grad_norm": 2.4634151576295746, + "learning_rate": 6.41750332121821e-08, + "loss": 0.9948, + "step": 8828 + }, + { + "epoch": 0.9285498310699778, + "grad_norm": 2.2133318934099724, + "learning_rate": 6.398723063751083e-08, + "loss": 0.9897, + 
"step": 8829 + }, + { + "epoch": 0.9286550015118251, + "grad_norm": 2.215396747584313, + "learning_rate": 6.379969969718653e-08, + "loss": 0.9914, + "step": 8830 + }, + { + "epoch": 0.9287601719536724, + "grad_norm": 2.459138737636483, + "learning_rate": 6.361244041212078e-08, + "loss": 1.0084, + "step": 8831 + }, + { + "epoch": 0.9288653423955198, + "grad_norm": 1.934406044306015, + "learning_rate": 6.342545280319468e-08, + "loss": 0.9425, + "step": 8832 + }, + { + "epoch": 0.9289705128373671, + "grad_norm": 2.496708383635269, + "learning_rate": 6.323873689125848e-08, + "loss": 0.9742, + "step": 8833 + }, + { + "epoch": 0.9290756832792144, + "grad_norm": 2.078495436451324, + "learning_rate": 6.305229269713276e-08, + "loss": 0.9499, + "step": 8834 + }, + { + "epoch": 0.9291808537210617, + "grad_norm": 2.0553106122159948, + "learning_rate": 6.286612024160699e-08, + "loss": 0.984, + "step": 8835 + }, + { + "epoch": 0.9292860241629091, + "grad_norm": 1.9572815789421085, + "learning_rate": 6.268021954544095e-08, + "loss": 0.9922, + "step": 8836 + }, + { + "epoch": 0.9293911946047564, + "grad_norm": 2.0721426471435618, + "learning_rate": 6.249459062936447e-08, + "loss": 0.9806, + "step": 8837 + }, + { + "epoch": 0.9294963650466036, + "grad_norm": 2.9623950844860585, + "learning_rate": 6.230923351407653e-08, + "loss": 1.0178, + "step": 8838 + }, + { + "epoch": 0.9296015354884509, + "grad_norm": 2.736503439361389, + "learning_rate": 6.212414822024532e-08, + "loss": 1.0119, + "step": 8839 + }, + { + "epoch": 0.9297067059302982, + "grad_norm": 1.6867869389616887, + "learning_rate": 6.193933476850961e-08, + "loss": 0.9662, + "step": 8840 + }, + { + "epoch": 0.9298118763721456, + "grad_norm": 2.572027064275728, + "learning_rate": 6.175479317947818e-08, + "loss": 0.981, + "step": 8841 + }, + { + "epoch": 0.9299170468139929, + "grad_norm": 2.7961316524981563, + "learning_rate": 6.157052347372767e-08, + "loss": 0.978, + "step": 8842 + }, + { + "epoch": 0.9300222172558402, + "grad_norm": 2.0569779619598885, + "learning_rate": 6.138652567180658e-08, + "loss": 0.9811, + "step": 8843 + }, + { + "epoch": 0.9301273876976875, + "grad_norm": 2.1285919020070114, + "learning_rate": 6.120279979423133e-08, + "loss": 1.0071, + "step": 8844 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 2.6903412011111505, + "learning_rate": 6.101934586148938e-08, + "loss": 0.9554, + "step": 8845 + }, + { + "epoch": 0.9303377285813822, + "grad_norm": 2.8879413392379063, + "learning_rate": 6.083616389403691e-08, + "loss": 0.9768, + "step": 8846 + }, + { + "epoch": 0.9304428990232295, + "grad_norm": 2.7703622961279124, + "learning_rate": 6.065325391230032e-08, + "loss": 1.0119, + "step": 8847 + }, + { + "epoch": 0.9305480694650768, + "grad_norm": 2.2151304541476757, + "learning_rate": 6.047061593667552e-08, + "loss": 1.0256, + "step": 8848 + }, + { + "epoch": 0.9306532399069242, + "grad_norm": 2.3412586834401186, + "learning_rate": 6.028824998752764e-08, + "loss": 0.9941, + "step": 8849 + }, + { + "epoch": 0.9307584103487715, + "grad_norm": 3.0538317688803205, + "learning_rate": 6.010615608519261e-08, + "loss": 1.0315, + "step": 8850 + }, + { + "epoch": 0.9308635807906188, + "grad_norm": 2.2047583894385627, + "learning_rate": 5.992433424997473e-08, + "loss": 0.989, + "step": 8851 + }, + { + "epoch": 0.9309687512324661, + "grad_norm": 1.922856138628748, + "learning_rate": 5.974278450214893e-08, + "loss": 0.991, + "step": 8852 + }, + { + "epoch": 0.9310739216743135, + "grad_norm": 3.0208302634204536, + "learning_rate": 
5.9561506861958994e-08, + "loss": 0.9735, + "step": 8853 + }, + { + "epoch": 0.9311790921161608, + "grad_norm": 2.8134158677180237, + "learning_rate": 5.9380501349619034e-08, + "loss": 0.9672, + "step": 8854 + }, + { + "epoch": 0.9312842625580081, + "grad_norm": 2.048098684977348, + "learning_rate": 5.9199767985312905e-08, + "loss": 0.993, + "step": 8855 + }, + { + "epoch": 0.9313894329998554, + "grad_norm": 2.345542378976877, + "learning_rate": 5.9019306789193374e-08, + "loss": 0.9825, + "step": 8856 + }, + { + "epoch": 0.9314946034417028, + "grad_norm": 2.3536588929040962, + "learning_rate": 5.883911778138324e-08, + "loss": 1.0004, + "step": 8857 + }, + { + "epoch": 0.93159977388355, + "grad_norm": 2.2434011250068533, + "learning_rate": 5.865920098197475e-08, + "loss": 1.0042, + "step": 8858 + }, + { + "epoch": 0.9317049443253973, + "grad_norm": 2.5379253604796577, + "learning_rate": 5.847955641103076e-08, + "loss": 0.9675, + "step": 8859 + }, + { + "epoch": 0.9318101147672446, + "grad_norm": 2.1102591640587627, + "learning_rate": 5.830018408858246e-08, + "loss": 0.9685, + "step": 8860 + }, + { + "epoch": 0.931915285209092, + "grad_norm": 2.0332852212785144, + "learning_rate": 5.8121084034631625e-08, + "loss": 0.9865, + "step": 8861 + }, + { + "epoch": 0.9320204556509393, + "grad_norm": 2.243828323356822, + "learning_rate": 5.7942256269148675e-08, + "loss": 0.9901, + "step": 8862 + }, + { + "epoch": 0.9321256260927866, + "grad_norm": 3.0058206320010403, + "learning_rate": 5.776370081207516e-08, + "loss": 1.0299, + "step": 8863 + }, + { + "epoch": 0.9322307965346339, + "grad_norm": 2.3047446223648373, + "learning_rate": 5.758541768332071e-08, + "loss": 0.9945, + "step": 8864 + }, + { + "epoch": 0.9323359669764812, + "grad_norm": 2.4665165924323453, + "learning_rate": 5.740740690276553e-08, + "loss": 0.994, + "step": 8865 + }, + { + "epoch": 0.9324411374183286, + "grad_norm": 2.5834709263911715, + "learning_rate": 5.722966849025957e-08, + "loss": 1.0037, + "step": 8866 + }, + { + "epoch": 0.9325463078601759, + "grad_norm": 2.2524845705290133, + "learning_rate": 5.7052202465621434e-08, + "loss": 1.002, + "step": 8867 + }, + { + "epoch": 0.9326514783020232, + "grad_norm": 2.4353939058546237, + "learning_rate": 5.687500884864e-08, + "loss": 0.983, + "step": 8868 + }, + { + "epoch": 0.9327566487438705, + "grad_norm": 2.5223813989202197, + "learning_rate": 5.66980876590742e-08, + "loss": 0.9392, + "step": 8869 + }, + { + "epoch": 0.9328618191857179, + "grad_norm": 2.2960675605121583, + "learning_rate": 5.652143891665157e-08, + "loss": 0.9687, + "step": 8870 + }, + { + "epoch": 0.9329669896275652, + "grad_norm": 2.10511952467411, + "learning_rate": 5.634506264107054e-08, + "loss": 0.9637, + "step": 8871 + }, + { + "epoch": 0.9330721600694125, + "grad_norm": 2.4587695553336513, + "learning_rate": 5.616895885199758e-08, + "loss": 0.9774, + "step": 8872 + }, + { + "epoch": 0.9331773305112598, + "grad_norm": 1.965729243927272, + "learning_rate": 5.5993127569070325e-08, + "loss": 0.9482, + "step": 8873 + }, + { + "epoch": 0.9332825009531072, + "grad_norm": 3.8358216369631597, + "learning_rate": 5.5817568811894763e-08, + "loss": 0.9987, + "step": 8874 + }, + { + "epoch": 0.9333876713949545, + "grad_norm": 1.5484337642536643, + "learning_rate": 5.564228260004773e-08, + "loss": 0.937, + "step": 8875 + }, + { + "epoch": 0.9334928418368018, + "grad_norm": 4.054983699134379, + "learning_rate": 5.5467268953074414e-08, + "loss": 1.0385, + "step": 8876 + }, + { + "epoch": 0.9335980122786491, + "grad_norm": 
2.3147943515098715, + "learning_rate": 5.529252789049033e-08, + "loss": 1.0191, + "step": 8877 + }, + { + "epoch": 0.9337031827204963, + "grad_norm": 2.76261922574638, + "learning_rate": 5.5118059431781e-08, + "loss": 0.9745, + "step": 8878 + }, + { + "epoch": 0.9338083531623437, + "grad_norm": 3.2491525384795863, + "learning_rate": 5.4943863596400026e-08, + "loss": 0.9695, + "step": 8879 + }, + { + "epoch": 0.933913523604191, + "grad_norm": 2.4349584193925042, + "learning_rate": 5.476994040377243e-08, + "loss": 0.9973, + "step": 8880 + }, + { + "epoch": 0.9340186940460383, + "grad_norm": 1.57930103970821, + "learning_rate": 5.45962898732913e-08, + "loss": 0.9405, + "step": 8881 + }, + { + "epoch": 0.9341238644878856, + "grad_norm": 2.8444833537713805, + "learning_rate": 5.442291202432087e-08, + "loss": 1.0168, + "step": 8882 + }, + { + "epoch": 0.934229034929733, + "grad_norm": 2.0406400774301776, + "learning_rate": 5.424980687619319e-08, + "loss": 0.9987, + "step": 8883 + }, + { + "epoch": 0.9343342053715803, + "grad_norm": 1.6448616663744222, + "learning_rate": 5.407697444821169e-08, + "loss": 0.9668, + "step": 8884 + }, + { + "epoch": 0.9344393758134276, + "grad_norm": 2.320476937675156, + "learning_rate": 5.3904414759648195e-08, + "loss": 0.9719, + "step": 8885 + }, + { + "epoch": 0.9345445462552749, + "grad_norm": 2.5020571289750873, + "learning_rate": 5.3732127829743964e-08, + "loss": 0.9816, + "step": 8886 + }, + { + "epoch": 0.9346497166971223, + "grad_norm": 2.240359223042951, + "learning_rate": 5.356011367771113e-08, + "loss": 0.9784, + "step": 8887 + }, + { + "epoch": 0.9347548871389696, + "grad_norm": 1.9605700811751219, + "learning_rate": 5.338837232272992e-08, + "loss": 1.0099, + "step": 8888 + }, + { + "epoch": 0.9348600575808169, + "grad_norm": 2.0597267796298486, + "learning_rate": 5.321690378395167e-08, + "loss": 0.9846, + "step": 8889 + }, + { + "epoch": 0.9349652280226642, + "grad_norm": 1.8547486595770128, + "learning_rate": 5.304570808049553e-08, + "loss": 0.9384, + "step": 8890 + }, + { + "epoch": 0.9350703984645116, + "grad_norm": 2.197878394312316, + "learning_rate": 5.287478523145151e-08, + "loss": 0.9896, + "step": 8891 + }, + { + "epoch": 0.9351755689063589, + "grad_norm": 2.240523235271297, + "learning_rate": 5.2704135255879085e-08, + "loss": 0.9526, + "step": 8892 + }, + { + "epoch": 0.9352807393482062, + "grad_norm": 2.8082105695500172, + "learning_rate": 5.253375817280665e-08, + "loss": 0.9864, + "step": 8893 + }, + { + "epoch": 0.9353859097900535, + "grad_norm": 2.8371694669918472, + "learning_rate": 5.236365400123289e-08, + "loss": 0.9613, + "step": 8894 + }, + { + "epoch": 0.9354910802319009, + "grad_norm": 1.7047869479150661, + "learning_rate": 5.219382276012514e-08, + "loss": 0.9818, + "step": 8895 + }, + { + "epoch": 0.9355962506737482, + "grad_norm": 2.6335657856305885, + "learning_rate": 5.2024264468422125e-08, + "loss": 0.9802, + "step": 8896 + }, + { + "epoch": 0.9357014211155955, + "grad_norm": 2.1047402552338745, + "learning_rate": 5.185497914502957e-08, + "loss": 0.9918, + "step": 8897 + }, + { + "epoch": 0.9358065915574428, + "grad_norm": 2.964283660935744, + "learning_rate": 5.168596680882515e-08, + "loss": 1.0045, + "step": 8898 + }, + { + "epoch": 0.93591176199929, + "grad_norm": 2.164852522853516, + "learning_rate": 5.151722747865434e-08, + "loss": 0.9464, + "step": 8899 + }, + { + "epoch": 0.9360169324411374, + "grad_norm": 2.1258441873852463, + "learning_rate": 5.134876117333321e-08, + "loss": 0.9473, + "step": 8900 + }, + { + "epoch": 
0.9361221028829847, + "grad_norm": 2.587963254415204, + "learning_rate": 5.1180567911646994e-08, + "loss": 1.0034, + "step": 8901 + }, + { + "epoch": 0.936227273324832, + "grad_norm": 2.6791686319504753, + "learning_rate": 5.1012647712350425e-08, + "loss": 0.9854, + "step": 8902 + }, + { + "epoch": 0.9363324437666793, + "grad_norm": 2.4951824060899597, + "learning_rate": 5.084500059416852e-08, + "loss": 0.9324, + "step": 8903 + }, + { + "epoch": 0.9364376142085267, + "grad_norm": 2.2357393856839405, + "learning_rate": 5.067762657579412e-08, + "loss": 0.9492, + "step": 8904 + }, + { + "epoch": 0.936542784650374, + "grad_norm": 1.9973404560425712, + "learning_rate": 5.0510525675891706e-08, + "loss": 0.9965, + "step": 8905 + }, + { + "epoch": 0.9366479550922213, + "grad_norm": 2.64657389622434, + "learning_rate": 5.0343697913093904e-08, + "loss": 0.982, + "step": 8906 + }, + { + "epoch": 0.9367531255340686, + "grad_norm": 1.9223409424201854, + "learning_rate": 5.017714330600332e-08, + "loss": 1.0052, + "step": 8907 + }, + { + "epoch": 0.936858295975916, + "grad_norm": 2.203306276144651, + "learning_rate": 5.0010861873192596e-08, + "loss": 1.022, + "step": 8908 + }, + { + "epoch": 0.9369634664177633, + "grad_norm": 2.4829606009339282, + "learning_rate": 4.984485363320218e-08, + "loss": 0.9611, + "step": 8909 + }, + { + "epoch": 0.9370686368596106, + "grad_norm": 1.9321781349119374, + "learning_rate": 4.9679118604544496e-08, + "loss": 0.9633, + "step": 8910 + }, + { + "epoch": 0.9371738073014579, + "grad_norm": 2.2454619717290285, + "learning_rate": 4.951365680569975e-08, + "loss": 0.9923, + "step": 8911 + }, + { + "epoch": 0.9372789777433053, + "grad_norm": 2.61658321641514, + "learning_rate": 4.9348468255118465e-08, + "loss": 1.0092, + "step": 8912 + }, + { + "epoch": 0.9373841481851526, + "grad_norm": 3.2178127868301494, + "learning_rate": 4.918355297122035e-08, + "loss": 1.0305, + "step": 8913 + }, + { + "epoch": 0.9374893186269999, + "grad_norm": 2.5158970965320178, + "learning_rate": 4.901891097239431e-08, + "loss": 0.932, + "step": 8914 + }, + { + "epoch": 0.9375944890688472, + "grad_norm": 2.853474567380966, + "learning_rate": 4.885454227700009e-08, + "loss": 0.9817, + "step": 8915 + }, + { + "epoch": 0.9376996595106946, + "grad_norm": 2.8015618857324105, + "learning_rate": 4.8690446903365e-08, + "loss": 0.9716, + "step": 8916 + }, + { + "epoch": 0.9378048299525419, + "grad_norm": 2.801611751147078, + "learning_rate": 4.8526624869787985e-08, + "loss": 0.9855, + "step": 8917 + }, + { + "epoch": 0.9379100003943892, + "grad_norm": 2.1624199744457426, + "learning_rate": 4.836307619453556e-08, + "loss": 0.9891, + "step": 8918 + }, + { + "epoch": 0.9380151708362364, + "grad_norm": 2.9139791008885907, + "learning_rate": 4.819980089584564e-08, + "loss": 0.9789, + "step": 8919 + }, + { + "epoch": 0.9381203412780837, + "grad_norm": 2.923200016146771, + "learning_rate": 4.8036798991923925e-08, + "loss": 0.9436, + "step": 8920 + }, + { + "epoch": 0.9382255117199311, + "grad_norm": 2.295229636965682, + "learning_rate": 4.7874070500946725e-08, + "loss": 0.975, + "step": 8921 + }, + { + "epoch": 0.9383306821617784, + "grad_norm": 3.0300460033187515, + "learning_rate": 4.771161544105951e-08, + "loss": 0.9951, + "step": 8922 + }, + { + "epoch": 0.9384358526036257, + "grad_norm": 2.115432155833796, + "learning_rate": 4.754943383037669e-08, + "loss": 0.9423, + "step": 8923 + }, + { + "epoch": 0.938541023045473, + "grad_norm": 2.2142585640978525, + "learning_rate": 4.7387525686983793e-08, + "loss": 1.0046, 
+ "step": 8924 + }, + { + "epoch": 0.9386461934873204, + "grad_norm": 2.2974730077916714, + "learning_rate": 4.722589102893416e-08, + "loss": 0.9384, + "step": 8925 + }, + { + "epoch": 0.9387513639291677, + "grad_norm": 2.4189018597780216, + "learning_rate": 4.70645298742517e-08, + "loss": 0.9834, + "step": 8926 + }, + { + "epoch": 0.938856534371015, + "grad_norm": 2.2427060135015315, + "learning_rate": 4.690344224092924e-08, + "loss": 0.9165, + "step": 8927 + }, + { + "epoch": 0.9389617048128623, + "grad_norm": 3.2157140769814667, + "learning_rate": 4.674262814692909e-08, + "loss": 0.9877, + "step": 8928 + }, + { + "epoch": 0.9390668752547097, + "grad_norm": 2.2854910606933556, + "learning_rate": 4.658208761018357e-08, + "loss": 0.9979, + "step": 8929 + }, + { + "epoch": 0.939172045696557, + "grad_norm": 2.3119573645611307, + "learning_rate": 4.6421820648593906e-08, + "loss": 0.9796, + "step": 8930 + }, + { + "epoch": 0.9392772161384043, + "grad_norm": 2.2403463122387914, + "learning_rate": 4.626182728003165e-08, + "loss": 0.9831, + "step": 8931 + }, + { + "epoch": 0.9393823865802516, + "grad_norm": 1.9681555317969763, + "learning_rate": 4.61021075223364e-08, + "loss": 0.9454, + "step": 8932 + }, + { + "epoch": 0.939487557022099, + "grad_norm": 2.0758545826298844, + "learning_rate": 4.594266139331921e-08, + "loss": 0.9768, + "step": 8933 + }, + { + "epoch": 0.9395927274639463, + "grad_norm": 1.836114474356616, + "learning_rate": 4.57834889107589e-08, + "loss": 1.0333, + "step": 8934 + }, + { + "epoch": 0.9396978979057936, + "grad_norm": 2.439188983743261, + "learning_rate": 4.5624590092404884e-08, + "loss": 0.9691, + "step": 8935 + }, + { + "epoch": 0.9398030683476409, + "grad_norm": 2.093740532951504, + "learning_rate": 4.546596495597494e-08, + "loss": 0.9777, + "step": 8936 + }, + { + "epoch": 0.9399082387894883, + "grad_norm": 2.877703863906485, + "learning_rate": 4.530761351915741e-08, + "loss": 0.9488, + "step": 8937 + }, + { + "epoch": 0.9400134092313356, + "grad_norm": 2.757214678581196, + "learning_rate": 4.5149535799610125e-08, + "loss": 1.0115, + "step": 8938 + }, + { + "epoch": 0.9401185796731828, + "grad_norm": 3.325004102364057, + "learning_rate": 4.499173181495897e-08, + "loss": 0.9751, + "step": 8939 + }, + { + "epoch": 0.9402237501150301, + "grad_norm": 2.3523829345278915, + "learning_rate": 4.4834201582801275e-08, + "loss": 0.9737, + "step": 8940 + }, + { + "epoch": 0.9403289205568774, + "grad_norm": 3.8621952023589445, + "learning_rate": 4.467694512070242e-08, + "loss": 1.0262, + "step": 8941 + }, + { + "epoch": 0.9404340909987248, + "grad_norm": 2.185909471977706, + "learning_rate": 4.451996244619755e-08, + "loss": 1.009, + "step": 8942 + }, + { + "epoch": 0.9405392614405721, + "grad_norm": 1.7242270309634, + "learning_rate": 4.436325357679211e-08, + "loss": 0.9852, + "step": 8943 + }, + { + "epoch": 0.9406444318824194, + "grad_norm": 2.12009999134058, + "learning_rate": 4.420681852995962e-08, + "loss": 0.9544, + "step": 8944 + }, + { + "epoch": 0.9407496023242667, + "grad_norm": 2.7180402269060737, + "learning_rate": 4.4050657323144454e-08, + "loss": 0.9866, + "step": 8945 + }, + { + "epoch": 0.9408547727661141, + "grad_norm": 2.3821757021611227, + "learning_rate": 4.3894769973759075e-08, + "loss": 0.9786, + "step": 8946 + }, + { + "epoch": 0.9409599432079614, + "grad_norm": 2.4406138087963023, + "learning_rate": 4.3739156499186806e-08, + "loss": 1.0249, + "step": 8947 + }, + { + "epoch": 0.9410651136498087, + "grad_norm": 1.8333812495807653, + "learning_rate": 
4.358381691677932e-08, + "loss": 0.9849, + "step": 8948 + }, + { + "epoch": 0.941170284091656, + "grad_norm": 1.973400633068133, + "learning_rate": 4.342875124385859e-08, + "loss": 0.9627, + "step": 8949 + }, + { + "epoch": 0.9412754545335034, + "grad_norm": 2.2827790185249075, + "learning_rate": 4.3273959497715234e-08, + "loss": 0.9529, + "step": 8950 + }, + { + "epoch": 0.9413806249753507, + "grad_norm": 1.789449664350765, + "learning_rate": 4.311944169560989e-08, + "loss": 0.9506, + "step": 8951 + }, + { + "epoch": 0.941485795417198, + "grad_norm": 2.0245454190254955, + "learning_rate": 4.296519785477293e-08, + "loss": 0.9788, + "step": 8952 + }, + { + "epoch": 0.9415909658590453, + "grad_norm": 2.983790376093045, + "learning_rate": 4.2811227992402834e-08, + "loss": 0.9872, + "step": 8953 + }, + { + "epoch": 0.9416961363008927, + "grad_norm": 2.448225720897263, + "learning_rate": 4.2657532125669196e-08, + "loss": 0.9493, + "step": 8954 + }, + { + "epoch": 0.94180130674274, + "grad_norm": 3.0859410244770693, + "learning_rate": 4.25041102717097e-08, + "loss": 0.9438, + "step": 8955 + }, + { + "epoch": 0.9419064771845873, + "grad_norm": 2.3832499552025466, + "learning_rate": 4.2350962447632594e-08, + "loss": 0.9902, + "step": 8956 + }, + { + "epoch": 0.9420116476264346, + "grad_norm": 2.251093370418025, + "learning_rate": 4.219808867051506e-08, + "loss": 0.9439, + "step": 8957 + }, + { + "epoch": 0.942116818068282, + "grad_norm": 3.0337285007524, + "learning_rate": 4.204548895740346e-08, + "loss": 0.9715, + "step": 8958 + }, + { + "epoch": 0.9422219885101293, + "grad_norm": 2.067055271479491, + "learning_rate": 4.18931633253139e-08, + "loss": 0.9864, + "step": 8959 + }, + { + "epoch": 0.9423271589519765, + "grad_norm": 2.198046384198305, + "learning_rate": 4.174111179123141e-08, + "loss": 0.994, + "step": 8960 + }, + { + "epoch": 0.9424323293938238, + "grad_norm": 2.2696199460704225, + "learning_rate": 4.158933437211188e-08, + "loss": 0.9611, + "step": 8961 + }, + { + "epoch": 0.9425374998356711, + "grad_norm": 2.2636911949582124, + "learning_rate": 4.1437831084878974e-08, + "loss": 0.9585, + "step": 8962 + }, + { + "epoch": 0.9426426702775185, + "grad_norm": 2.490147217829843, + "learning_rate": 4.12866019464267e-08, + "loss": 0.9212, + "step": 8963 + }, + { + "epoch": 0.9427478407193658, + "grad_norm": 2.279237319888199, + "learning_rate": 4.1135646973618214e-08, + "loss": 0.9686, + "step": 8964 + }, + { + "epoch": 0.9428530111612131, + "grad_norm": 2.9443355297300187, + "learning_rate": 4.098496618328618e-08, + "loss": 1.008, + "step": 8965 + }, + { + "epoch": 0.9429581816030604, + "grad_norm": 2.4101695450847074, + "learning_rate": 4.083455959223298e-08, + "loss": 0.9955, + "step": 8966 + }, + { + "epoch": 0.9430633520449078, + "grad_norm": 3.2074191412814916, + "learning_rate": 4.068442721722965e-08, + "loss": 1.0099, + "step": 8967 + }, + { + "epoch": 0.9431685224867551, + "grad_norm": 3.174497962789582, + "learning_rate": 4.0534569075017516e-08, + "loss": 0.9797, + "step": 8968 + }, + { + "epoch": 0.9432736929286024, + "grad_norm": 2.5583454557550014, + "learning_rate": 4.038498518230627e-08, + "loss": 1.006, + "step": 8969 + }, + { + "epoch": 0.9433788633704497, + "grad_norm": 1.8038618775066488, + "learning_rate": 4.0235675555776734e-08, + "loss": 0.9574, + "step": 8970 + }, + { + "epoch": 0.9434840338122971, + "grad_norm": 2.514174840873185, + "learning_rate": 4.008664021207698e-08, + "loss": 0.993, + "step": 8971 + }, + { + "epoch": 0.9435892042541444, + "grad_norm": 
2.1814124733410387, + "learning_rate": 3.993787916782649e-08, + "loss": 0.9489, + "step": 8972 + }, + { + "epoch": 0.9436943746959917, + "grad_norm": 2.5226893485355952, + "learning_rate": 3.978939243961283e-08, + "loss": 1.0078, + "step": 8973 + }, + { + "epoch": 0.943799545137839, + "grad_norm": 2.256815087313752, + "learning_rate": 3.964118004399331e-08, + "loss": 0.9775, + "step": 8974 + }, + { + "epoch": 0.9439047155796864, + "grad_norm": 2.8223010195407947, + "learning_rate": 3.949324199749527e-08, + "loss": 1.0249, + "step": 8975 + }, + { + "epoch": 0.9440098860215337, + "grad_norm": 1.391951568158817, + "learning_rate": 3.9345578316614396e-08, + "loss": 0.9615, + "step": 8976 + }, + { + "epoch": 0.944115056463381, + "grad_norm": 2.2474764496627717, + "learning_rate": 3.919818901781669e-08, + "loss": 0.9517, + "step": 8977 + }, + { + "epoch": 0.9442202269052283, + "grad_norm": 2.472061001047028, + "learning_rate": 3.905107411753678e-08, + "loss": 0.9883, + "step": 8978 + }, + { + "epoch": 0.9443253973470757, + "grad_norm": 1.9454273863676264, + "learning_rate": 3.89042336321796e-08, + "loss": 0.9953, + "step": 8979 + }, + { + "epoch": 0.9444305677889229, + "grad_norm": 2.5983891669051045, + "learning_rate": 3.8757667578119e-08, + "loss": 0.9775, + "step": 8980 + }, + { + "epoch": 0.9445357382307702, + "grad_norm": 2.052082559249859, + "learning_rate": 3.8611375971698004e-08, + "loss": 1.0009, + "step": 8981 + }, + { + "epoch": 0.9446409086726175, + "grad_norm": 2.2703611102031402, + "learning_rate": 3.8465358829229415e-08, + "loss": 0.9423, + "step": 8982 + }, + { + "epoch": 0.9447460791144648, + "grad_norm": 2.2134942047175814, + "learning_rate": 3.831961616699464e-08, + "loss": 0.9994, + "step": 8983 + }, + { + "epoch": 0.9448512495563122, + "grad_norm": 2.9036139692399834, + "learning_rate": 3.8174148001246246e-08, + "loss": 0.9657, + "step": 8984 + }, + { + "epoch": 0.9449564199981595, + "grad_norm": 2.1246606320654897, + "learning_rate": 3.802895434820431e-08, + "loss": 0.9848, + "step": 8985 + }, + { + "epoch": 0.9450615904400068, + "grad_norm": 2.4282287550373476, + "learning_rate": 3.78840352240592e-08, + "loss": 0.9852, + "step": 8986 + }, + { + "epoch": 0.9451667608818541, + "grad_norm": 3.214310427554045, + "learning_rate": 3.773939064497051e-08, + "loss": 0.9898, + "step": 8987 + }, + { + "epoch": 0.9452719313237015, + "grad_norm": 2.344091022554252, + "learning_rate": 3.759502062706727e-08, + "loss": 0.9549, + "step": 8988 + }, + { + "epoch": 0.9453771017655488, + "grad_norm": 2.4153973988675057, + "learning_rate": 3.745092518644827e-08, + "loss": 0.9341, + "step": 8989 + }, + { + "epoch": 0.9454822722073961, + "grad_norm": 2.203918057550135, + "learning_rate": 3.730710433918039e-08, + "loss": 1.0003, + "step": 8990 + }, + { + "epoch": 0.9455874426492434, + "grad_norm": 2.265252052511168, + "learning_rate": 3.716355810130135e-08, + "loss": 1.002, + "step": 8991 + }, + { + "epoch": 0.9456926130910908, + "grad_norm": 3.0195268989146338, + "learning_rate": 3.70202864888175e-08, + "loss": 0.9745, + "step": 8992 + }, + { + "epoch": 0.9457977835329381, + "grad_norm": 2.1813144759793874, + "learning_rate": 3.687728951770497e-08, + "loss": 0.9996, + "step": 8993 + }, + { + "epoch": 0.9459029539747854, + "grad_norm": 2.382527317064494, + "learning_rate": 3.673456720390878e-08, + "loss": 1.0224, + "step": 8994 + }, + { + "epoch": 0.9460081244166327, + "grad_norm": 2.888448496222066, + "learning_rate": 3.659211956334369e-08, + "loss": 0.9979, + "step": 8995 + }, + { + "epoch": 
0.9461132948584801, + "grad_norm": 2.428825105062496, + "learning_rate": 3.644994661189366e-08, + "loss": 0.9911, + "step": 8996 + }, + { + "epoch": 0.9462184653003274, + "grad_norm": 2.947866269473164, + "learning_rate": 3.630804836541213e-08, + "loss": 0.9448, + "step": 8997 + }, + { + "epoch": 0.9463236357421747, + "grad_norm": 2.5597218827599693, + "learning_rate": 3.616642483972199e-08, + "loss": 0.9859, + "step": 8998 + }, + { + "epoch": 0.946428806184022, + "grad_norm": 2.158200307069964, + "learning_rate": 3.602507605061478e-08, + "loss": 0.9825, + "step": 8999 + }, + { + "epoch": 0.9465339766258692, + "grad_norm": 3.1515542288121883, + "learning_rate": 3.588400201385289e-08, + "loss": 1.016, + "step": 9000 + }, + { + "epoch": 0.9466391470677166, + "grad_norm": 2.5215905656201505, + "learning_rate": 3.574320274516652e-08, + "loss": 0.9547, + "step": 9001 + }, + { + "epoch": 0.9467443175095639, + "grad_norm": 2.2845409836197135, + "learning_rate": 3.560267826025588e-08, + "loss": 0.969, + "step": 9002 + }, + { + "epoch": 0.9468494879514112, + "grad_norm": 2.7215628504061633, + "learning_rate": 3.546242857479093e-08, + "loss": 0.9303, + "step": 9003 + }, + { + "epoch": 0.9469546583932585, + "grad_norm": 2.9588772470370475, + "learning_rate": 3.5322453704410286e-08, + "loss": 1.0371, + "step": 9004 + }, + { + "epoch": 0.9470598288351059, + "grad_norm": 2.2593667415441954, + "learning_rate": 3.518275366472229e-08, + "loss": 0.9698, + "step": 9005 + }, + { + "epoch": 0.9471649992769532, + "grad_norm": 2.042081877507642, + "learning_rate": 3.504332847130476e-08, + "loss": 1.0017, + "step": 9006 + }, + { + "epoch": 0.9472701697188005, + "grad_norm": 2.587961130036968, + "learning_rate": 3.49041781397047e-08, + "loss": 0.9826, + "step": 9007 + }, + { + "epoch": 0.9473753401606478, + "grad_norm": 3.102076174146032, + "learning_rate": 3.4765302685438315e-08, + "loss": 0.9855, + "step": 9008 + }, + { + "epoch": 0.9474805106024952, + "grad_norm": 2.544139468740267, + "learning_rate": 3.462670212399099e-08, + "loss": 0.9721, + "step": 9009 + }, + { + "epoch": 0.9475856810443425, + "grad_norm": 2.7727335358217053, + "learning_rate": 3.4488376470818153e-08, + "loss": 0.9728, + "step": 9010 + }, + { + "epoch": 0.9476908514861898, + "grad_norm": 2.5294138314705696, + "learning_rate": 3.4350325741344114e-08, + "loss": 0.9813, + "step": 9011 + }, + { + "epoch": 0.9477960219280371, + "grad_norm": 2.7089584804818907, + "learning_rate": 3.421254995096268e-08, + "loss": 1.0279, + "step": 9012 + }, + { + "epoch": 0.9479011923698845, + "grad_norm": 2.281684751376811, + "learning_rate": 3.407504911503684e-08, + "loss": 0.9687, + "step": 9013 + }, + { + "epoch": 0.9480063628117318, + "grad_norm": 2.138661404129903, + "learning_rate": 3.3937823248899046e-08, + "loss": 0.9265, + "step": 9014 + }, + { + "epoch": 0.9481115332535791, + "grad_norm": 3.39614476261138, + "learning_rate": 3.3800872367850956e-08, + "loss": 0.9766, + "step": 9015 + }, + { + "epoch": 0.9482167036954264, + "grad_norm": 2.2640896136810955, + "learning_rate": 3.366419648716368e-08, + "loss": 0.967, + "step": 9016 + }, + { + "epoch": 0.9483218741372738, + "grad_norm": 2.047442890013021, + "learning_rate": 3.352779562207753e-08, + "loss": 0.9985, + "step": 9017 + }, + { + "epoch": 0.9484270445791211, + "grad_norm": 1.9785722794844491, + "learning_rate": 3.339166978780256e-08, + "loss": 0.9326, + "step": 9018 + }, + { + "epoch": 0.9485322150209684, + "grad_norm": 2.1691668115058733, + "learning_rate": 3.3255818999517465e-08, + "loss": 1.017, 
+ "step": 9019 + }, + { + "epoch": 0.9486373854628157, + "grad_norm": 2.4799530161132863, + "learning_rate": 3.3120243272371236e-08, + "loss": 0.9924, + "step": 9020 + }, + { + "epoch": 0.9487425559046629, + "grad_norm": 2.353650653525652, + "learning_rate": 3.298494262148122e-08, + "loss": 1.0099, + "step": 9021 + }, + { + "epoch": 0.9488477263465103, + "grad_norm": 2.2157690574353905, + "learning_rate": 3.2849917061934245e-08, + "loss": 1.0099, + "step": 9022 + }, + { + "epoch": 0.9489528967883576, + "grad_norm": 2.6170941163454726, + "learning_rate": 3.2715166608787426e-08, + "loss": 1.0147, + "step": 9023 + }, + { + "epoch": 0.9490580672302049, + "grad_norm": 2.252136933422275, + "learning_rate": 3.2580691277065704e-08, + "loss": 0.9959, + "step": 9024 + }, + { + "epoch": 0.9491632376720522, + "grad_norm": 2.0628624156474906, + "learning_rate": 3.2446491081764566e-08, + "loss": 0.9494, + "step": 9025 + }, + { + "epoch": 0.9492684081138996, + "grad_norm": 2.8179669861299055, + "learning_rate": 3.2312566037848437e-08, + "loss": 0.9782, + "step": 9026 + }, + { + "epoch": 0.9493735785557469, + "grad_norm": 2.4587844515129365, + "learning_rate": 3.217891616025065e-08, + "loss": 0.9678, + "step": 9027 + }, + { + "epoch": 0.9494787489975942, + "grad_norm": 2.836843623950099, + "learning_rate": 3.204554146387456e-08, + "loss": 1.0119, + "step": 9028 + }, + { + "epoch": 0.9495839194394415, + "grad_norm": 2.6786242065548675, + "learning_rate": 3.191244196359244e-08, + "loss": 0.9626, + "step": 9029 + }, + { + "epoch": 0.9496890898812889, + "grad_norm": 1.9157598537055311, + "learning_rate": 3.1779617674245754e-08, + "loss": 0.9927, + "step": 9030 + }, + { + "epoch": 0.9497942603231362, + "grad_norm": 2.8959657091729114, + "learning_rate": 3.1647068610645706e-08, + "loss": 1.0009, + "step": 9031 + }, + { + "epoch": 0.9498994307649835, + "grad_norm": 2.5929815782861603, + "learning_rate": 3.151479478757186e-08, + "loss": 1.0032, + "step": 9032 + }, + { + "epoch": 0.9500046012068308, + "grad_norm": 2.5834124688388633, + "learning_rate": 3.1382796219774634e-08, + "loss": 1.0217, + "step": 9033 + }, + { + "epoch": 0.9501097716486782, + "grad_norm": 3.2272054253254363, + "learning_rate": 3.125107292197227e-08, + "loss": 1.0221, + "step": 9034 + }, + { + "epoch": 0.9502149420905255, + "grad_norm": 2.6054764436911895, + "learning_rate": 3.1119624908853286e-08, + "loss": 0.9901, + "step": 9035 + }, + { + "epoch": 0.9503201125323728, + "grad_norm": 3.169144160863623, + "learning_rate": 3.0988452195075127e-08, + "loss": 0.9878, + "step": 9036 + }, + { + "epoch": 0.9504252829742201, + "grad_norm": 2.6138918264045765, + "learning_rate": 3.085755479526442e-08, + "loss": 1.035, + "step": 9037 + }, + { + "epoch": 0.9505304534160675, + "grad_norm": 2.2611828427142058, + "learning_rate": 3.072693272401756e-08, + "loss": 0.9684, + "step": 9038 + }, + { + "epoch": 0.9506356238579148, + "grad_norm": 2.9501126652696676, + "learning_rate": 3.059658599589926e-08, + "loss": 0.9749, + "step": 9039 + }, + { + "epoch": 0.9507407942997621, + "grad_norm": 2.1468682912165087, + "learning_rate": 3.046651462544487e-08, + "loss": 0.9761, + "step": 9040 + }, + { + "epoch": 0.9508459647416093, + "grad_norm": 2.4497313864823753, + "learning_rate": 3.0336718627158035e-08, + "loss": 0.9701, + "step": 9041 + }, + { + "epoch": 0.9509511351834566, + "grad_norm": 2.153394730671303, + "learning_rate": 3.0207198015512195e-08, + "loss": 0.9441, + "step": 9042 + }, + { + "epoch": 0.951056305625304, + "grad_norm": 3.031401492377512, + 
"learning_rate": 3.007795280494996e-08, + "loss": 1.0247, + "step": 9043 + }, + { + "epoch": 0.9511614760671513, + "grad_norm": 2.5059187840965604, + "learning_rate": 2.994898300988258e-08, + "loss": 0.9365, + "step": 9044 + }, + { + "epoch": 0.9512666465089986, + "grad_norm": 2.4460236039465615, + "learning_rate": 2.9820288644692166e-08, + "loss": 0.9831, + "step": 9045 + }, + { + "epoch": 0.9513718169508459, + "grad_norm": 2.608884100112733, + "learning_rate": 2.969186972372806e-08, + "loss": 1.0081, + "step": 9046 + }, + { + "epoch": 0.9514769873926933, + "grad_norm": 2.7708175506380286, + "learning_rate": 2.9563726261310767e-08, + "loss": 0.9921, + "step": 9047 + }, + { + "epoch": 0.9515821578345406, + "grad_norm": 2.9910305301759186, + "learning_rate": 2.9435858271728845e-08, + "loss": 0.9821, + "step": 9048 + }, + { + "epoch": 0.9516873282763879, + "grad_norm": 2.9234495857772553, + "learning_rate": 2.9308265769240894e-08, + "loss": 1.0157, + "step": 9049 + }, + { + "epoch": 0.9517924987182352, + "grad_norm": 1.8321103596533066, + "learning_rate": 2.9180948768074424e-08, + "loss": 0.9784, + "step": 9050 + }, + { + "epoch": 0.9518976691600826, + "grad_norm": 2.357717738638271, + "learning_rate": 2.905390728242585e-08, + "loss": 0.9425, + "step": 9051 + }, + { + "epoch": 0.9520028396019299, + "grad_norm": 3.189128618030638, + "learning_rate": 2.8927141326461903e-08, + "loss": 1.0047, + "step": 9052 + }, + { + "epoch": 0.9521080100437772, + "grad_norm": 2.256843607403256, + "learning_rate": 2.8800650914317385e-08, + "loss": 0.946, + "step": 9053 + }, + { + "epoch": 0.9522131804856245, + "grad_norm": 3.062457453580989, + "learning_rate": 2.867443606009768e-08, + "loss": 1.0034, + "step": 9054 + }, + { + "epoch": 0.9523183509274719, + "grad_norm": 3.165772898246199, + "learning_rate": 2.854849677787569e-08, + "loss": 1.0181, + "step": 9055 + }, + { + "epoch": 0.9524235213693192, + "grad_norm": 2.173606329897896, + "learning_rate": 2.8422833081695466e-08, + "loss": 1.0107, + "step": 9056 + }, + { + "epoch": 0.9525286918111665, + "grad_norm": 2.1255256334631976, + "learning_rate": 2.8297444985569412e-08, + "loss": 1.0023, + "step": 9057 + }, + { + "epoch": 0.9526338622530138, + "grad_norm": 2.077684783487426, + "learning_rate": 2.8172332503479116e-08, + "loss": 0.9621, + "step": 9058 + }, + { + "epoch": 0.9527390326948612, + "grad_norm": 2.1934129457768834, + "learning_rate": 2.8047495649375366e-08, + "loss": 0.9669, + "step": 9059 + }, + { + "epoch": 0.9528442031367085, + "grad_norm": 2.8101529667642753, + "learning_rate": 2.7922934437178695e-08, + "loss": 0.9975, + "step": 9060 + }, + { + "epoch": 0.9529493735785557, + "grad_norm": 1.9383323625550897, + "learning_rate": 2.7798648880778545e-08, + "loss": 0.9921, + "step": 9061 + }, + { + "epoch": 0.953054544020403, + "grad_norm": 1.8036336585586927, + "learning_rate": 2.767463899403383e-08, + "loss": 0.9686, + "step": 9062 + }, + { + "epoch": 0.9531597144622503, + "grad_norm": 2.8407059895479696, + "learning_rate": 2.755090479077266e-08, + "loss": 0.9795, + "step": 9063 + }, + { + "epoch": 0.9532648849040977, + "grad_norm": 2.758918511717141, + "learning_rate": 2.7427446284792324e-08, + "loss": 0.9974, + "step": 9064 + }, + { + "epoch": 0.953370055345945, + "grad_norm": 3.586171387728783, + "learning_rate": 2.730426348985904e-08, + "loss": 0.9778, + "step": 9065 + }, + { + "epoch": 0.9534752257877923, + "grad_norm": 2.4256569743524983, + "learning_rate": 2.7181356419709313e-08, + "loss": 0.9593, + "step": 9066 + }, + { + "epoch": 
0.9535803962296396, + "grad_norm": 2.3740228072151393, + "learning_rate": 2.7058725088047466e-08, + "loss": 0.9844, + "step": 9067 + }, + { + "epoch": 0.953685566671487, + "grad_norm": 2.483387574850686, + "learning_rate": 2.6936369508548664e-08, + "loss": 0.9624, + "step": 9068 + }, + { + "epoch": 0.9537907371133343, + "grad_norm": 3.1395779842210954, + "learning_rate": 2.681428969485589e-08, + "loss": 1.0125, + "step": 9069 + }, + { + "epoch": 0.9538959075551816, + "grad_norm": 3.2864445331020185, + "learning_rate": 2.6692485660582133e-08, + "loss": 0.9238, + "step": 9070 + }, + { + "epoch": 0.9540010779970289, + "grad_norm": 2.251033657502406, + "learning_rate": 2.6570957419309595e-08, + "loss": 0.9286, + "step": 9071 + }, + { + "epoch": 0.9541062484388763, + "grad_norm": 2.9488738742860527, + "learning_rate": 2.6449704984589652e-08, + "loss": 1.0007, + "step": 9072 + }, + { + "epoch": 0.9542114188807236, + "grad_norm": 2.3323515832683945, + "learning_rate": 2.632872836994288e-08, + "loss": 1.0012, + "step": 9073 + }, + { + "epoch": 0.9543165893225709, + "grad_norm": 3.032634694644143, + "learning_rate": 2.6208027588858765e-08, + "loss": 0.9769, + "step": 9074 + }, + { + "epoch": 0.9544217597644182, + "grad_norm": 1.9718142374783398, + "learning_rate": 2.6087602654797097e-08, + "loss": 0.9451, + "step": 9075 + }, + { + "epoch": 0.9545269302062656, + "grad_norm": 2.261250300289189, + "learning_rate": 2.5967453581185187e-08, + "loss": 1.0058, + "step": 9076 + }, + { + "epoch": 0.9546321006481129, + "grad_norm": 2.202583197447805, + "learning_rate": 2.5847580381421768e-08, + "loss": 0.9913, + "step": 9077 + }, + { + "epoch": 0.9547372710899602, + "grad_norm": 3.0052761549204203, + "learning_rate": 2.5727983068872532e-08, + "loss": 0.979, + "step": 9078 + }, + { + "epoch": 0.9548424415318075, + "grad_norm": 2.235135902420106, + "learning_rate": 2.560866165687431e-08, + "loss": 0.9828, + "step": 9079 + }, + { + "epoch": 0.9549476119736549, + "grad_norm": 1.8821520458231678, + "learning_rate": 2.548961615873202e-08, + "loss": 0.9579, + "step": 9080 + }, + { + "epoch": 0.9550527824155022, + "grad_norm": 2.1799190509368263, + "learning_rate": 2.5370846587720044e-08, + "loss": 0.9611, + "step": 9081 + }, + { + "epoch": 0.9551579528573494, + "grad_norm": 2.2231469103632016, + "learning_rate": 2.5252352957082505e-08, + "loss": 0.9672, + "step": 9082 + }, + { + "epoch": 0.9552631232991967, + "grad_norm": 2.48946023734881, + "learning_rate": 2.513413528003189e-08, + "loss": 1.0058, + "step": 9083 + }, + { + "epoch": 0.955368293741044, + "grad_norm": 2.9722438755007383, + "learning_rate": 2.5016193569750712e-08, + "loss": 0.9313, + "step": 9084 + }, + { + "epoch": 0.9554734641828914, + "grad_norm": 2.5433251733042903, + "learning_rate": 2.4898527839390118e-08, + "loss": 1.0113, + "step": 9085 + }, + { + "epoch": 0.9555786346247387, + "grad_norm": 1.991419615066436, + "learning_rate": 2.4781138102071278e-08, + "loss": 0.9612, + "step": 9086 + }, + { + "epoch": 0.955683805066586, + "grad_norm": 2.4260785139851464, + "learning_rate": 2.4664024370883444e-08, + "loss": 0.9726, + "step": 9087 + }, + { + "epoch": 0.9557889755084333, + "grad_norm": 2.623866077425562, + "learning_rate": 2.454718665888589e-08, + "loss": 0.9465, + "step": 9088 + }, + { + "epoch": 0.9558941459502807, + "grad_norm": 2.6503139564233282, + "learning_rate": 2.4430624979107365e-08, + "loss": 1.0072, + "step": 9089 + }, + { + "epoch": 0.955999316392128, + "grad_norm": 4.298737564898903, + "learning_rate": 2.431433934454497e-08, + 
"loss": 0.9838, + "step": 9090 + }, + { + "epoch": 0.9561044868339753, + "grad_norm": 2.166453454009922, + "learning_rate": 2.4198329768165552e-08, + "loss": 0.9526, + "step": 9091 + }, + { + "epoch": 0.9562096572758226, + "grad_norm": 2.262782344611168, + "learning_rate": 2.4082596262904877e-08, + "loss": 0.909, + "step": 9092 + }, + { + "epoch": 0.95631482771767, + "grad_norm": 2.4483911651903343, + "learning_rate": 2.396713884166818e-08, + "loss": 0.9938, + "step": 9093 + }, + { + "epoch": 0.9564199981595173, + "grad_norm": 3.1826776450887686, + "learning_rate": 2.385195751733044e-08, + "loss": 0.9652, + "step": 9094 + }, + { + "epoch": 0.9565251686013646, + "grad_norm": 2.0215236542665114, + "learning_rate": 2.3737052302734432e-08, + "loss": 0.9577, + "step": 9095 + }, + { + "epoch": 0.9566303390432119, + "grad_norm": 2.988986889944041, + "learning_rate": 2.36224232106938e-08, + "loss": 0.9887, + "step": 9096 + }, + { + "epoch": 0.9567355094850593, + "grad_norm": 2.795418561696307, + "learning_rate": 2.3508070253989712e-08, + "loss": 0.9974, + "step": 9097 + }, + { + "epoch": 0.9568406799269066, + "grad_norm": 2.7766248044059876, + "learning_rate": 2.3393993445374187e-08, + "loss": 1.005, + "step": 9098 + }, + { + "epoch": 0.9569458503687539, + "grad_norm": 2.559873723889261, + "learning_rate": 2.3280192797567046e-08, + "loss": 0.9853, + "step": 9099 + }, + { + "epoch": 0.9570510208106012, + "grad_norm": 1.838309819852872, + "learning_rate": 2.31666683232587e-08, + "loss": 0.9774, + "step": 9100 + }, + { + "epoch": 0.9571561912524486, + "grad_norm": 2.3283682967266404, + "learning_rate": 2.3053420035107077e-08, + "loss": 0.9614, + "step": 9101 + }, + { + "epoch": 0.9572613616942958, + "grad_norm": 2.0531537931400696, + "learning_rate": 2.294044794574096e-08, + "loss": 0.996, + "step": 9102 + }, + { + "epoch": 0.9573665321361431, + "grad_norm": 2.738491776846182, + "learning_rate": 2.2827752067757224e-08, + "loss": 0.9989, + "step": 9103 + }, + { + "epoch": 0.9574717025779904, + "grad_norm": 2.8541624802569086, + "learning_rate": 2.271533241372248e-08, + "loss": 0.9925, + "step": 9104 + }, + { + "epoch": 0.9575768730198377, + "grad_norm": 2.561463355615611, + "learning_rate": 2.260318899617281e-08, + "loss": 0.995, + "step": 9105 + }, + { + "epoch": 0.9576820434616851, + "grad_norm": 2.4009062245111807, + "learning_rate": 2.24913218276121e-08, + "loss": 1.0175, + "step": 9106 + }, + { + "epoch": 0.9577872139035324, + "grad_norm": 1.9085583128594759, + "learning_rate": 2.2379730920515096e-08, + "loss": 0.966, + "step": 9107 + }, + { + "epoch": 0.9578923843453797, + "grad_norm": 2.3035781917937923, + "learning_rate": 2.2268416287325178e-08, + "loss": 0.9906, + "step": 9108 + }, + { + "epoch": 0.957997554787227, + "grad_norm": 2.9253676588000084, + "learning_rate": 2.215737794045436e-08, + "loss": 0.9913, + "step": 9109 + }, + { + "epoch": 0.9581027252290744, + "grad_norm": 2.251760542497009, + "learning_rate": 2.2046615892284685e-08, + "loss": 0.9787, + "step": 9110 + }, + { + "epoch": 0.9582078956709217, + "grad_norm": 2.6136064885904142, + "learning_rate": 2.193613015516627e-08, + "loss": 1.0012, + "step": 9111 + }, + { + "epoch": 0.958313066112769, + "grad_norm": 2.378051255621682, + "learning_rate": 2.1825920741420092e-08, + "loss": 0.9869, + "step": 9112 + }, + { + "epoch": 0.9584182365546163, + "grad_norm": 2.420924559361127, + "learning_rate": 2.171598766333466e-08, + "loss": 0.9531, + "step": 9113 + }, + { + "epoch": 0.9585234069964637, + "grad_norm": 2.1094674393987125, + 
"learning_rate": 2.1606330933168496e-08, + "loss": 0.9684, + "step": 9114 + }, + { + "epoch": 0.958628577438311, + "grad_norm": 1.7218971943565433, + "learning_rate": 2.149695056314932e-08, + "loss": 0.9608, + "step": 9115 + }, + { + "epoch": 0.9587337478801583, + "grad_norm": 2.6145718462197967, + "learning_rate": 2.1387846565474047e-08, + "loss": 0.9348, + "step": 9116 + }, + { + "epoch": 0.9588389183220056, + "grad_norm": 2.7778015979594595, + "learning_rate": 2.1279018952308218e-08, + "loss": 1.015, + "step": 9117 + }, + { + "epoch": 0.958944088763853, + "grad_norm": 3.616813798965242, + "learning_rate": 2.1170467735787124e-08, + "loss": 0.9358, + "step": 9118 + }, + { + "epoch": 0.9590492592057003, + "grad_norm": 3.0938864234470684, + "learning_rate": 2.1062192928015536e-08, + "loss": 0.9831, + "step": 9119 + }, + { + "epoch": 0.9591544296475476, + "grad_norm": 2.5032862302480696, + "learning_rate": 2.095419454106573e-08, + "loss": 1.0055, + "step": 9120 + }, + { + "epoch": 0.9592596000893949, + "grad_norm": 2.4155226421641234, + "learning_rate": 2.084647258698169e-08, + "loss": 0.9836, + "step": 9121 + }, + { + "epoch": 0.9593647705312421, + "grad_norm": 2.2456972979046212, + "learning_rate": 2.073902707777464e-08, + "loss": 1.0023, + "step": 9122 + }, + { + "epoch": 0.9594699409730895, + "grad_norm": 3.1418236010714846, + "learning_rate": 2.0631858025425554e-08, + "loss": 0.9944, + "step": 9123 + }, + { + "epoch": 0.9595751114149368, + "grad_norm": 2.512431786493916, + "learning_rate": 2.052496544188487e-08, + "loss": 0.9862, + "step": 9124 + }, + { + "epoch": 0.9596802818567841, + "grad_norm": 2.4317108069473514, + "learning_rate": 2.0418349339071385e-08, + "loss": 0.9816, + "step": 9125 + }, + { + "epoch": 0.9597854522986314, + "grad_norm": 2.5061823875565654, + "learning_rate": 2.031200972887448e-08, + "loss": 0.9703, + "step": 9126 + }, + { + "epoch": 0.9598906227404788, + "grad_norm": 1.8971400885405667, + "learning_rate": 2.0205946623151063e-08, + "loss": 0.9792, + "step": 9127 + }, + { + "epoch": 0.9599957931823261, + "grad_norm": 2.8512234421263516, + "learning_rate": 2.010016003372861e-08, + "loss": 0.9609, + "step": 9128 + }, + { + "epoch": 0.9601009636241734, + "grad_norm": 1.8398551494029969, + "learning_rate": 1.9994649972402415e-08, + "loss": 0.9747, + "step": 9129 + }, + { + "epoch": 0.9602061340660207, + "grad_norm": 2.4990349994310415, + "learning_rate": 1.9889416450938335e-08, + "loss": 0.9787, + "step": 9130 + }, + { + "epoch": 0.9603113045078681, + "grad_norm": 1.9550532334702702, + "learning_rate": 1.9784459481070607e-08, + "loss": 0.9957, + "step": 9131 + }, + { + "epoch": 0.9604164749497154, + "grad_norm": 3.2601942510164252, + "learning_rate": 1.9679779074502636e-08, + "loss": 0.9309, + "step": 9132 + }, + { + "epoch": 0.9605216453915627, + "grad_norm": 2.862772288610713, + "learning_rate": 1.9575375242907035e-08, + "loss": 1.0278, + "step": 9133 + }, + { + "epoch": 0.96062681583341, + "grad_norm": 2.6535531520322495, + "learning_rate": 1.9471247997925324e-08, + "loss": 0.9815, + "step": 9134 + }, + { + "epoch": 0.9607319862752574, + "grad_norm": 2.6376660990295164, + "learning_rate": 1.9367397351169326e-08, + "loss": 0.9858, + "step": 9135 + }, + { + "epoch": 0.9608371567171047, + "grad_norm": 3.208400601816702, + "learning_rate": 1.9263823314218667e-08, + "loss": 0.9999, + "step": 9136 + }, + { + "epoch": 0.960942327158952, + "grad_norm": 3.117055000265947, + "learning_rate": 1.9160525898622716e-08, + "loss": 1.0176, + "step": 9137 + }, + { + "epoch": 
0.9610474976007993, + "grad_norm": 2.539642788145546, + "learning_rate": 1.9057505115900043e-08, + "loss": 0.9976, + "step": 9138 + }, + { + "epoch": 0.9611526680426467, + "grad_norm": 2.622034495146119, + "learning_rate": 1.8954760977538122e-08, + "loss": 0.9342, + "step": 9139 + }, + { + "epoch": 0.961257838484494, + "grad_norm": 2.0976876781379796, + "learning_rate": 1.88522934949939e-08, + "loss": 0.939, + "step": 9140 + }, + { + "epoch": 0.9613630089263413, + "grad_norm": 3.6252618212734244, + "learning_rate": 1.875010267969296e-08, + "loss": 1.0144, + "step": 9141 + }, + { + "epoch": 0.9614681793681886, + "grad_norm": 2.2004361692134093, + "learning_rate": 1.864818854303091e-08, + "loss": 0.9742, + "step": 9142 + }, + { + "epoch": 0.9615733498100358, + "grad_norm": 2.5659276984677013, + "learning_rate": 1.8546551096371157e-08, + "loss": 0.994, + "step": 9143 + }, + { + "epoch": 0.9616785202518832, + "grad_norm": 2.2606940570606304, + "learning_rate": 1.844519035104797e-08, + "loss": 0.9785, + "step": 9144 + }, + { + "epoch": 0.9617836906937305, + "grad_norm": 3.1995639013446353, + "learning_rate": 1.834410631836342e-08, + "loss": 1.0375, + "step": 9145 + }, + { + "epoch": 0.9618888611355778, + "grad_norm": 2.8120072161352883, + "learning_rate": 1.8243299009589044e-08, + "loss": 0.9765, + "step": 9146 + }, + { + "epoch": 0.9619940315774251, + "grad_norm": 2.9702132072664504, + "learning_rate": 1.814276843596585e-08, + "loss": 0.984, + "step": 9147 + }, + { + "epoch": 0.9620992020192725, + "grad_norm": 2.36435367996225, + "learning_rate": 1.8042514608703765e-08, + "loss": 0.9905, + "step": 9148 + }, + { + "epoch": 0.9622043724611198, + "grad_norm": 2.4402173766387647, + "learning_rate": 1.7942537538981618e-08, + "loss": 0.9907, + "step": 9149 + }, + { + "epoch": 0.9623095429029671, + "grad_norm": 2.3039341033452048, + "learning_rate": 1.7842837237947997e-08, + "loss": 0.9937, + "step": 9150 + }, + { + "epoch": 0.9624147133448144, + "grad_norm": 2.5814133426313397, + "learning_rate": 1.7743413716720394e-08, + "loss": 0.9995, + "step": 9151 + }, + { + "epoch": 0.9625198837866618, + "grad_norm": 3.0310519987593323, + "learning_rate": 1.764426698638466e-08, + "loss": 0.9418, + "step": 9152 + }, + { + "epoch": 0.9626250542285091, + "grad_norm": 2.414115329844662, + "learning_rate": 1.7545397057996683e-08, + "loss": 1.0066, + "step": 9153 + }, + { + "epoch": 0.9627302246703564, + "grad_norm": 2.5905538989812484, + "learning_rate": 1.7446803942581524e-08, + "loss": 0.9571, + "step": 9154 + }, + { + "epoch": 0.9628353951122037, + "grad_norm": 2.4397822721298588, + "learning_rate": 1.7348487651132895e-08, + "loss": 0.9677, + "step": 9155 + }, + { + "epoch": 0.962940565554051, + "grad_norm": 2.369844437844643, + "learning_rate": 1.725044819461369e-08, + "loss": 0.9505, + "step": 9156 + }, + { + "epoch": 0.9630457359958984, + "grad_norm": 2.4593061093551927, + "learning_rate": 1.7152685583955995e-08, + "loss": 0.9638, + "step": 9157 + }, + { + "epoch": 0.9631509064377457, + "grad_norm": 2.058831249668448, + "learning_rate": 1.7055199830061653e-08, + "loss": 0.9734, + "step": 9158 + }, + { + "epoch": 0.963256076879593, + "grad_norm": 2.5481002332248703, + "learning_rate": 1.6957990943800574e-08, + "loss": 0.9717, + "step": 9159 + }, + { + "epoch": 0.9633612473214404, + "grad_norm": 3.1600633484938503, + "learning_rate": 1.686105893601242e-08, + "loss": 1.0042, + "step": 9160 + }, + { + "epoch": 0.9634664177632877, + "grad_norm": 3.3311087274523823, + "learning_rate": 1.6764403817506047e-08, + 
"loss": 1.0101, + "step": 9161 + }, + { + "epoch": 0.963571588205135, + "grad_norm": 1.9545499141129272, + "learning_rate": 1.6668025599058945e-08, + "loss": 0.9977, + "step": 9162 + }, + { + "epoch": 0.9636767586469822, + "grad_norm": 2.787786032166771, + "learning_rate": 1.6571924291418072e-08, + "loss": 0.9881, + "step": 9163 + }, + { + "epoch": 0.9637819290888295, + "grad_norm": 2.7339024071717035, + "learning_rate": 1.6476099905299857e-08, + "loss": 0.9863, + "step": 9164 + }, + { + "epoch": 0.9638870995306769, + "grad_norm": 2.3241276868769507, + "learning_rate": 1.6380552451389088e-08, + "loss": 0.985, + "step": 9165 + }, + { + "epoch": 0.9639922699725242, + "grad_norm": 2.615775704959889, + "learning_rate": 1.6285281940340016e-08, + "loss": 1.0107, + "step": 9166 + }, + { + "epoch": 0.9640974404143715, + "grad_norm": 2.61782809887508, + "learning_rate": 1.6190288382776363e-08, + "loss": 0.9895, + "step": 9167 + }, + { + "epoch": 0.9642026108562188, + "grad_norm": 2.9697250302024987, + "learning_rate": 1.60955717892905e-08, + "loss": 0.9869, + "step": 9168 + }, + { + "epoch": 0.9643077812980662, + "grad_norm": 2.1659305791507024, + "learning_rate": 1.6001132170443968e-08, + "loss": 0.9508, + "step": 9169 + }, + { + "epoch": 0.9644129517399135, + "grad_norm": 2.76396965745972, + "learning_rate": 1.5906969536767513e-08, + "loss": 0.9662, + "step": 9170 + }, + { + "epoch": 0.9645181221817608, + "grad_norm": 2.1674061651202434, + "learning_rate": 1.5813083898760793e-08, + "loss": 0.961, + "step": 9171 + }, + { + "epoch": 0.9646232926236081, + "grad_norm": 3.3828957028419944, + "learning_rate": 1.571947526689349e-08, + "loss": 0.9774, + "step": 9172 + }, + { + "epoch": 0.9647284630654555, + "grad_norm": 2.9496229688779834, + "learning_rate": 1.5626143651603087e-08, + "loss": 1.0287, + "step": 9173 + }, + { + "epoch": 0.9648336335073028, + "grad_norm": 2.576235051854255, + "learning_rate": 1.553308906329709e-08, + "loss": 1.0086, + "step": 9174 + }, + { + "epoch": 0.9649388039491501, + "grad_norm": 2.6024764773615705, + "learning_rate": 1.5440311512351646e-08, + "loss": 0.9895, + "step": 9175 + }, + { + "epoch": 0.9650439743909974, + "grad_norm": 2.7226540157491175, + "learning_rate": 1.534781100911209e-08, + "loss": 0.9951, + "step": 9176 + }, + { + "epoch": 0.9651491448328448, + "grad_norm": 2.4235683696464925, + "learning_rate": 1.5255587563893227e-08, + "loss": 1.0094, + "step": 9177 + }, + { + "epoch": 0.9652543152746921, + "grad_norm": 1.8039244408062456, + "learning_rate": 1.5163641186978216e-08, + "loss": 0.9846, + "step": 9178 + }, + { + "epoch": 0.9653594857165394, + "grad_norm": 2.208004892548754, + "learning_rate": 1.507197188862053e-08, + "loss": 0.9821, + "step": 9179 + }, + { + "epoch": 0.9654646561583867, + "grad_norm": 2.686724949615566, + "learning_rate": 1.4980579679041153e-08, + "loss": 0.9814, + "step": 9180 + }, + { + "epoch": 0.965569826600234, + "grad_norm": 2.218279129899888, + "learning_rate": 1.4889464568431656e-08, + "loss": 0.9947, + "step": 9181 + }, + { + "epoch": 0.9656749970420814, + "grad_norm": 2.619200221829575, + "learning_rate": 1.4798626566951968e-08, + "loss": 0.9968, + "step": 9182 + }, + { + "epoch": 0.9657801674839286, + "grad_norm": 2.226366283605976, + "learning_rate": 1.4708065684730932e-08, + "loss": 0.9942, + "step": 9183 + }, + { + "epoch": 0.9658853379257759, + "grad_norm": 1.8711672309528635, + "learning_rate": 1.4617781931867137e-08, + "loss": 0.9866, + "step": 9184 + }, + { + "epoch": 0.9659905083676232, + "grad_norm": 
1.7591848744904266, + "learning_rate": 1.4527775318427806e-08, + "loss": 0.9644, + "step": 9185 + }, + { + "epoch": 0.9660956788094706, + "grad_norm": 2.3772749891154312, + "learning_rate": 1.4438045854449357e-08, + "loss": 0.9752, + "step": 9186 + }, + { + "epoch": 0.9662008492513179, + "grad_norm": 2.79994419425291, + "learning_rate": 1.4348593549937118e-08, + "loss": 1.0253, + "step": 9187 + }, + { + "epoch": 0.9663060196931652, + "grad_norm": 2.9430470064229963, + "learning_rate": 1.4259418414866166e-08, + "loss": 0.9725, + "step": 9188 + }, + { + "epoch": 0.9664111901350125, + "grad_norm": 2.844447657650461, + "learning_rate": 1.417052045917966e-08, + "loss": 1.0171, + "step": 9189 + }, + { + "epoch": 0.9665163605768599, + "grad_norm": 3.2049646354495067, + "learning_rate": 1.4081899692791058e-08, + "loss": 0.9617, + "step": 9190 + }, + { + "epoch": 0.9666215310187072, + "grad_norm": 2.6622301953211487, + "learning_rate": 1.399355612558162e-08, + "loss": 0.9837, + "step": 9191 + }, + { + "epoch": 0.9667267014605545, + "grad_norm": 2.122985949724431, + "learning_rate": 1.3905489767402913e-08, + "loss": 0.9636, + "step": 9192 + }, + { + "epoch": 0.9668318719024018, + "grad_norm": 2.6769674621149533, + "learning_rate": 1.3817700628074582e-08, + "loss": 0.9456, + "step": 9193 + }, + { + "epoch": 0.9669370423442492, + "grad_norm": 2.365588178179128, + "learning_rate": 1.3730188717386016e-08, + "loss": 0.9796, + "step": 9194 + }, + { + "epoch": 0.9670422127860965, + "grad_norm": 1.8396720663564547, + "learning_rate": 1.3642954045095525e-08, + "loss": 0.981, + "step": 9195 + }, + { + "epoch": 0.9671473832279438, + "grad_norm": 2.3943971715102736, + "learning_rate": 1.3555996620930323e-08, + "loss": 1.0078, + "step": 9196 + }, + { + "epoch": 0.9672525536697911, + "grad_norm": 2.419224654846345, + "learning_rate": 1.3469316454586823e-08, + "loss": 0.9814, + "step": 9197 + }, + { + "epoch": 0.9673577241116385, + "grad_norm": 2.4530714452657625, + "learning_rate": 1.3382913555730626e-08, + "loss": 0.9688, + "step": 9198 + }, + { + "epoch": 0.9674628945534858, + "grad_norm": 2.2680814906299824, + "learning_rate": 1.3296787933996246e-08, + "loss": 0.9347, + "step": 9199 + }, + { + "epoch": 0.9675680649953331, + "grad_norm": 2.189230585914418, + "learning_rate": 1.3210939598987394e-08, + "loss": 0.9303, + "step": 9200 + }, + { + "epoch": 0.9676732354371804, + "grad_norm": 2.3582843375690214, + "learning_rate": 1.3125368560276686e-08, + "loss": 1.0175, + "step": 9201 + }, + { + "epoch": 0.9677784058790277, + "grad_norm": 2.268975158784598, + "learning_rate": 1.3040074827406491e-08, + "loss": 0.9531, + "step": 9202 + }, + { + "epoch": 0.9678835763208751, + "grad_norm": 2.5739483459832098, + "learning_rate": 1.2955058409886978e-08, + "loss": 0.9481, + "step": 9203 + }, + { + "epoch": 0.9679887467627223, + "grad_norm": 1.8224473421389638, + "learning_rate": 1.2870319317198621e-08, + "loss": 0.9507, + "step": 9204 + }, + { + "epoch": 0.9680939172045696, + "grad_norm": 2.8730850996320743, + "learning_rate": 1.2785857558790526e-08, + "loss": 0.9602, + "step": 9205 + }, + { + "epoch": 0.9681990876464169, + "grad_norm": 2.724396836974365, + "learning_rate": 1.270167314408044e-08, + "loss": 0.9638, + "step": 9206 + }, + { + "epoch": 0.9683042580882643, + "grad_norm": 2.1481024531914046, + "learning_rate": 1.261776608245585e-08, + "loss": 1.0108, + "step": 9207 + }, + { + "epoch": 0.9684094285301116, + "grad_norm": 2.528448588522008, + "learning_rate": 1.2534136383272888e-08, + "loss": 0.9794, + "step": 9208 
+ }, + { + "epoch": 0.9685145989719589, + "grad_norm": 3.49084002666298, + "learning_rate": 1.2450784055857145e-08, + "loss": 0.9712, + "step": 9209 + }, + { + "epoch": 0.9686197694138062, + "grad_norm": 2.619678363223223, + "learning_rate": 1.2367709109503134e-08, + "loss": 0.9621, + "step": 9210 + }, + { + "epoch": 0.9687249398556536, + "grad_norm": 2.8477945371212465, + "learning_rate": 1.2284911553474e-08, + "loss": 0.9999, + "step": 9211 + }, + { + "epoch": 0.9688301102975009, + "grad_norm": 2.5699605691953225, + "learning_rate": 1.2202391397002355e-08, + "loss": 0.9942, + "step": 9212 + }, + { + "epoch": 0.9689352807393482, + "grad_norm": 2.96718335182804, + "learning_rate": 1.2120148649290008e-08, + "loss": 0.9763, + "step": 9213 + }, + { + "epoch": 0.9690404511811955, + "grad_norm": 2.7754708820964806, + "learning_rate": 1.2038183319507957e-08, + "loss": 0.9626, + "step": 9214 + }, + { + "epoch": 0.9691456216230429, + "grad_norm": 2.3177867748746466, + "learning_rate": 1.1956495416795277e-08, + "loss": 1.0024, + "step": 9215 + }, + { + "epoch": 0.9692507920648902, + "grad_norm": 2.593173481686718, + "learning_rate": 1.1875084950261351e-08, + "loss": 1.0028, + "step": 9216 + }, + { + "epoch": 0.9693559625067375, + "grad_norm": 2.9154573832597452, + "learning_rate": 1.1793951928983639e-08, + "loss": 0.9496, + "step": 9217 + }, + { + "epoch": 0.9694611329485848, + "grad_norm": 2.4979489626751343, + "learning_rate": 1.1713096362009346e-08, + "loss": 0.9504, + "step": 9218 + }, + { + "epoch": 0.9695663033904321, + "grad_norm": 1.9672731194839046, + "learning_rate": 1.1632518258354875e-08, + "loss": 0.9085, + "step": 9219 + }, + { + "epoch": 0.9696714738322795, + "grad_norm": 2.481803674239574, + "learning_rate": 1.1552217627004426e-08, + "loss": 0.966, + "step": 9220 + }, + { + "epoch": 0.9697766442741268, + "grad_norm": 2.6552953440994282, + "learning_rate": 1.1472194476913057e-08, + "loss": 1.0244, + "step": 9221 + }, + { + "epoch": 0.9698818147159741, + "grad_norm": 2.7197091300670135, + "learning_rate": 1.1392448817003354e-08, + "loss": 0.9968, + "step": 9222 + }, + { + "epoch": 0.9699869851578214, + "grad_norm": 2.067177962113043, + "learning_rate": 1.1312980656167927e-08, + "loss": 0.9947, + "step": 9223 + }, + { + "epoch": 0.9700921555996687, + "grad_norm": 2.347224718073338, + "learning_rate": 1.1233790003267741e-08, + "loss": 0.9577, + "step": 9224 + }, + { + "epoch": 0.970197326041516, + "grad_norm": 5.618522310449893, + "learning_rate": 1.115487686713379e-08, + "loss": 1.0061, + "step": 9225 + }, + { + "epoch": 0.9703024964833633, + "grad_norm": 1.7054244073511104, + "learning_rate": 1.107624125656459e-08, + "loss": 0.9655, + "step": 9226 + }, + { + "epoch": 0.9704076669252106, + "grad_norm": 2.0088677460681734, + "learning_rate": 1.0997883180329515e-08, + "loss": 0.985, + "step": 9227 + }, + { + "epoch": 0.970512837367058, + "grad_norm": 3.0153539597861982, + "learning_rate": 1.0919802647165467e-08, + "loss": 0.9946, + "step": 9228 + }, + { + "epoch": 0.9706180078089053, + "grad_norm": 3.353254395720258, + "learning_rate": 1.0841999665779368e-08, + "loss": 1.0188, + "step": 9229 + }, + { + "epoch": 0.9707231782507526, + "grad_norm": 2.081090994792908, + "learning_rate": 1.0764474244846778e-08, + "loss": 0.9783, + "step": 9230 + }, + { + "epoch": 0.9708283486925999, + "grad_norm": 3.243396799607356, + "learning_rate": 1.0687226393012173e-08, + "loss": 0.9995, + "step": 9231 + }, + { + "epoch": 0.9709335191344473, + "grad_norm": 1.8699534084309162, + "learning_rate": 
1.0610256118889772e-08, + "loss": 0.9539, + "step": 9232 + }, + { + "epoch": 0.9710386895762946, + "grad_norm": 2.480474240008781, + "learning_rate": 1.0533563431062154e-08, + "loss": 0.9863, + "step": 9233 + }, + { + "epoch": 0.9711438600181419, + "grad_norm": 2.002217087521726, + "learning_rate": 1.0457148338080813e-08, + "loss": 0.9804, + "step": 9234 + }, + { + "epoch": 0.9712490304599892, + "grad_norm": 1.5904141400379188, + "learning_rate": 1.0381010848466988e-08, + "loss": 0.9646, + "step": 9235 + }, + { + "epoch": 0.9713542009018366, + "grad_norm": 2.4366903231145454, + "learning_rate": 1.0305150970710276e-08, + "loss": 0.9668, + "step": 9236 + }, + { + "epoch": 0.9714593713436839, + "grad_norm": 2.822796227094747, + "learning_rate": 1.02295687132703e-08, + "loss": 1.0271, + "step": 9237 + }, + { + "epoch": 0.9715645417855312, + "grad_norm": 1.9912340131203807, + "learning_rate": 1.0154264084574206e-08, + "loss": 0.9428, + "step": 9238 + }, + { + "epoch": 0.9716697122273785, + "grad_norm": 1.955104338884243, + "learning_rate": 1.0079237093019722e-08, + "loss": 0.9747, + "step": 9239 + }, + { + "epoch": 0.9717748826692258, + "grad_norm": 3.2050011615950793, + "learning_rate": 1.0004487746972657e-08, + "loss": 0.9594, + "step": 9240 + }, + { + "epoch": 0.9718800531110732, + "grad_norm": 2.526175674743241, + "learning_rate": 9.930016054768011e-09, + "loss": 0.9817, + "step": 9241 + }, + { + "epoch": 0.9719852235529205, + "grad_norm": 2.425815712365738, + "learning_rate": 9.85582202471025e-09, + "loss": 1.0223, + "step": 9242 + }, + { + "epoch": 0.9720903939947678, + "grad_norm": 2.1478474630955557, + "learning_rate": 9.7819056650722e-09, + "loss": 0.9611, + "step": 9243 + }, + { + "epoch": 0.972195564436615, + "grad_norm": 2.63035991433718, + "learning_rate": 9.708266984096714e-09, + "loss": 0.9549, + "step": 9244 + }, + { + "epoch": 0.9723007348784624, + "grad_norm": 2.787607325293226, + "learning_rate": 9.634905989994448e-09, + "loss": 0.9642, + "step": 9245 + }, + { + "epoch": 0.9724059053203097, + "grad_norm": 2.7011576256397056, + "learning_rate": 9.561822690946076e-09, + "loss": 0.935, + "step": 9246 + }, + { + "epoch": 0.972511075762157, + "grad_norm": 3.3865296605768545, + "learning_rate": 9.489017095100916e-09, + "loss": 0.9845, + "step": 9247 + }, + { + "epoch": 0.9726162462040043, + "grad_norm": 2.141639825480503, + "learning_rate": 9.416489210577473e-09, + "loss": 0.9714, + "step": 9248 + }, + { + "epoch": 0.9727214166458517, + "grad_norm": 2.381480065424657, + "learning_rate": 9.344239045462888e-09, + "loss": 1.0016, + "step": 9249 + }, + { + "epoch": 0.972826587087699, + "grad_norm": 3.061957347212088, + "learning_rate": 9.272266607813774e-09, + "loss": 1.0217, + "step": 9250 + }, + { + "epoch": 0.9729317575295463, + "grad_norm": 2.5614131075584416, + "learning_rate": 9.200571905655376e-09, + "loss": 0.9773, + "step": 9251 + }, + { + "epoch": 0.9730369279713936, + "grad_norm": 2.6614512876627745, + "learning_rate": 9.129154946982687e-09, + "loss": 1.009, + "step": 9252 + }, + { + "epoch": 0.973142098413241, + "grad_norm": 2.5591923309907956, + "learning_rate": 9.058015739758786e-09, + "loss": 0.9382, + "step": 9253 + }, + { + "epoch": 0.9732472688550883, + "grad_norm": 3.4612772871088255, + "learning_rate": 8.98715429191649e-09, + "loss": 0.9797, + "step": 9254 + }, + { + "epoch": 0.9733524392969356, + "grad_norm": 2.161508443645297, + "learning_rate": 8.916570611357534e-09, + "loss": 1.0028, + "step": 9255 + }, + { + "epoch": 0.9734576097387829, + "grad_norm": 
2.777809434537492, + "learning_rate": 8.84626470595229e-09, + "loss": 0.9781, + "step": 9256 + }, + { + "epoch": 0.9735627801806302, + "grad_norm": 3.256651493127484, + "learning_rate": 8.776236583540321e-09, + "loss": 0.9788, + "step": 9257 + }, + { + "epoch": 0.9736679506224776, + "grad_norm": 3.137061562152753, + "learning_rate": 8.706486251930657e-09, + "loss": 0.9969, + "step": 9258 + }, + { + "epoch": 0.9737731210643249, + "grad_norm": 1.6215241145993706, + "learning_rate": 8.637013718900689e-09, + "loss": 0.9675, + "step": 9259 + }, + { + "epoch": 0.9738782915061722, + "grad_norm": 2.1994977784123075, + "learning_rate": 8.567818992197274e-09, + "loss": 0.9876, + "step": 9260 + }, + { + "epoch": 0.9739834619480195, + "grad_norm": 2.4037382184819585, + "learning_rate": 8.498902079536186e-09, + "loss": 0.9774, + "step": 9261 + }, + { + "epoch": 0.9740886323898669, + "grad_norm": 2.824206149642182, + "learning_rate": 8.430262988602389e-09, + "loss": 1.0053, + "step": 9262 + }, + { + "epoch": 0.9741938028317142, + "grad_norm": 2.9847969505243213, + "learning_rate": 8.361901727049204e-09, + "loss": 0.9833, + "step": 9263 + }, + { + "epoch": 0.9742989732735615, + "grad_norm": 2.358889620889671, + "learning_rate": 8.293818302499701e-09, + "loss": 0.9803, + "step": 9264 + }, + { + "epoch": 0.9744041437154087, + "grad_norm": 2.5381745630996044, + "learning_rate": 8.226012722545863e-09, + "loss": 0.9937, + "step": 9265 + }, + { + "epoch": 0.9745093141572561, + "grad_norm": 2.6358498110575583, + "learning_rate": 8.158484994748306e-09, + "loss": 0.9967, + "step": 9266 + }, + { + "epoch": 0.9746144845991034, + "grad_norm": 3.2362925404197846, + "learning_rate": 8.09123512663712e-09, + "loss": 1.0093, + "step": 9267 + }, + { + "epoch": 0.9747196550409507, + "grad_norm": 2.1160461048243304, + "learning_rate": 8.024263125710751e-09, + "loss": 1.0194, + "step": 9268 + }, + { + "epoch": 0.974824825482798, + "grad_norm": 2.1609581262482456, + "learning_rate": 7.95756899943767e-09, + "loss": 0.9649, + "step": 9269 + }, + { + "epoch": 0.9749299959246454, + "grad_norm": 3.1491039134025276, + "learning_rate": 7.891152755254427e-09, + "loss": 0.9911, + "step": 9270 + }, + { + "epoch": 0.9750351663664927, + "grad_norm": 2.8428124781388378, + "learning_rate": 7.825014400567044e-09, + "loss": 0.9895, + "step": 9271 + }, + { + "epoch": 0.97514033680834, + "grad_norm": 2.2055413871514307, + "learning_rate": 7.759153942750174e-09, + "loss": 0.9915, + "step": 9272 + }, + { + "epoch": 0.9752455072501873, + "grad_norm": 2.1726712220112545, + "learning_rate": 7.693571389148224e-09, + "loss": 0.9897, + "step": 9273 + }, + { + "epoch": 0.9753506776920347, + "grad_norm": 2.720353151960456, + "learning_rate": 7.628266747074231e-09, + "loss": 0.9897, + "step": 9274 + }, + { + "epoch": 0.975455848133882, + "grad_norm": 2.7591952094504424, + "learning_rate": 7.563240023809593e-09, + "loss": 0.9615, + "step": 9275 + }, + { + "epoch": 0.9755610185757293, + "grad_norm": 2.737597454130178, + "learning_rate": 7.498491226605731e-09, + "loss": 0.9927, + "step": 9276 + }, + { + "epoch": 0.9756661890175766, + "grad_norm": 2.5948670808268184, + "learning_rate": 7.434020362682703e-09, + "loss": 0.9945, + "step": 9277 + }, + { + "epoch": 0.975771359459424, + "grad_norm": 2.784078394241727, + "learning_rate": 7.369827439228927e-09, + "loss": 0.96, + "step": 9278 + }, + { + "epoch": 0.9758765299012713, + "grad_norm": 3.0703548564961394, + "learning_rate": 7.30591246340312e-09, + "loss": 0.9259, + "step": 9279 + }, + { + "epoch": 
0.9759817003431186, + "grad_norm": 2.8232566736257385, + "learning_rate": 7.242275442332081e-09, + "loss": 0.9477, + "step": 9280 + }, + { + "epoch": 0.9760868707849659, + "grad_norm": 2.0272196755850214, + "learning_rate": 7.178916383111523e-09, + "loss": 0.9936, + "step": 9281 + }, + { + "epoch": 0.9761920412268132, + "grad_norm": 2.292451442175712, + "learning_rate": 7.1158352928066276e-09, + "loss": 0.9719, + "step": 9282 + }, + { + "epoch": 0.9762972116686606, + "grad_norm": 2.830967058321669, + "learning_rate": 7.053032178451769e-09, + "loss": 0.9359, + "step": 9283 + }, + { + "epoch": 0.9764023821105079, + "grad_norm": 2.5809673977523286, + "learning_rate": 6.990507047049677e-09, + "loss": 0.9662, + "step": 9284 + }, + { + "epoch": 0.9765075525523551, + "grad_norm": 2.6394164418653476, + "learning_rate": 6.928259905572277e-09, + "loss": 1.0017, + "step": 9285 + }, + { + "epoch": 0.9766127229942024, + "grad_norm": 2.486791493687076, + "learning_rate": 6.866290760960959e-09, + "loss": 1.0181, + "step": 9286 + }, + { + "epoch": 0.9767178934360498, + "grad_norm": 2.2338709539918225, + "learning_rate": 6.804599620125196e-09, + "loss": 0.9647, + "step": 9287 + }, + { + "epoch": 0.9768230638778971, + "grad_norm": 2.1691387566361082, + "learning_rate": 6.743186489944764e-09, + "loss": 0.9912, + "step": 9288 + }, + { + "epoch": 0.9769282343197444, + "grad_norm": 3.245572611882046, + "learning_rate": 6.68205137726724e-09, + "loss": 0.9692, + "step": 9289 + }, + { + "epoch": 0.9770334047615917, + "grad_norm": 3.04733299754532, + "learning_rate": 6.621194288909671e-09, + "loss": 0.9791, + "step": 9290 + }, + { + "epoch": 0.977138575203439, + "grad_norm": 1.8878390554596858, + "learning_rate": 6.560615231658296e-09, + "loss": 0.9996, + "step": 9291 + }, + { + "epoch": 0.9772437456452864, + "grad_norm": 2.2627442103553017, + "learning_rate": 6.500314212267989e-09, + "loss": 1.0112, + "step": 9292 + }, + { + "epoch": 0.9773489160871337, + "grad_norm": 2.218078785102874, + "learning_rate": 6.440291237462815e-09, + "loss": 0.9641, + "step": 9293 + }, + { + "epoch": 0.977454086528981, + "grad_norm": 1.8552740152450433, + "learning_rate": 6.380546313935754e-09, + "loss": 0.9732, + "step": 9294 + }, + { + "epoch": 0.9775592569708283, + "grad_norm": 2.4608837759222735, + "learning_rate": 6.321079448348977e-09, + "loss": 0.9696, + "step": 9295 + }, + { + "epoch": 0.9776644274126757, + "grad_norm": 2.552592842090153, + "learning_rate": 6.261890647333568e-09, + "loss": 0.9417, + "step": 9296 + }, + { + "epoch": 0.977769597854523, + "grad_norm": 2.009057625011869, + "learning_rate": 6.202979917489249e-09, + "loss": 0.9384, + "step": 9297 + }, + { + "epoch": 0.9778747682963703, + "grad_norm": 1.9808092212897492, + "learning_rate": 6.144347265384931e-09, + "loss": 0.9631, + "step": 9298 + }, + { + "epoch": 0.9779799387382176, + "grad_norm": 3.045529761172713, + "learning_rate": 6.085992697559273e-09, + "loss": 0.9743, + "step": 9299 + }, + { + "epoch": 0.978085109180065, + "grad_norm": 3.059783975457296, + "learning_rate": 6.027916220518459e-09, + "loss": 0.9701, + "step": 9300 + }, + { + "epoch": 0.9781902796219123, + "grad_norm": 3.0504326853392993, + "learning_rate": 5.970117840738976e-09, + "loss": 0.9683, + "step": 9301 + }, + { + "epoch": 0.9782954500637596, + "grad_norm": 2.7901710880220065, + "learning_rate": 5.912597564665667e-09, + "loss": 1.0016, + "step": 9302 + }, + { + "epoch": 0.9784006205056069, + "grad_norm": 2.4240633625968497, + "learning_rate": 5.85535539871257e-09, + "loss": 1.0195, + 
"step": 9303 + }, + { + "epoch": 0.9785057909474543, + "grad_norm": 2.4233089687632283, + "learning_rate": 5.798391349262356e-09, + "loss": 0.9825, + "step": 9304 + }, + { + "epoch": 0.9786109613893015, + "grad_norm": 1.8446993064170987, + "learning_rate": 5.741705422667443e-09, + "loss": 1.0112, + "step": 9305 + }, + { + "epoch": 0.9787161318311488, + "grad_norm": 2.2841403028774048, + "learning_rate": 5.685297625248054e-09, + "loss": 0.9792, + "step": 9306 + }, + { + "epoch": 0.9788213022729961, + "grad_norm": 2.755338437254536, + "learning_rate": 5.62916796329499e-09, + "loss": 0.9768, + "step": 9307 + }, + { + "epoch": 0.9789264727148435, + "grad_norm": 2.7700700654544432, + "learning_rate": 5.573316443066301e-09, + "loss": 0.9787, + "step": 9308 + }, + { + "epoch": 0.9790316431566908, + "grad_norm": 2.8359479362123223, + "learning_rate": 5.517743070790061e-09, + "loss": 0.9891, + "step": 9309 + }, + { + "epoch": 0.9791368135985381, + "grad_norm": 2.7613280805123583, + "learning_rate": 5.462447852663532e-09, + "loss": 0.9876, + "step": 9310 + }, + { + "epoch": 0.9792419840403854, + "grad_norm": 2.4322910758293825, + "learning_rate": 5.407430794852342e-09, + "loss": 1.0221, + "step": 9311 + }, + { + "epoch": 0.9793471544822328, + "grad_norm": 1.9160133184659012, + "learning_rate": 5.352691903491303e-09, + "loss": 1.0014, + "step": 9312 + }, + { + "epoch": 0.9794523249240801, + "grad_norm": 2.3481712518499207, + "learning_rate": 5.2982311846841436e-09, + "loss": 0.9383, + "step": 9313 + }, + { + "epoch": 0.9795574953659274, + "grad_norm": 2.588135744749517, + "learning_rate": 5.2440486445037855e-09, + "loss": 0.9673, + "step": 9314 + }, + { + "epoch": 0.9796626658077747, + "grad_norm": 3.1082126273430664, + "learning_rate": 5.190144288991783e-09, + "loss": 1.0052, + "step": 9315 + }, + { + "epoch": 0.979767836249622, + "grad_norm": 2.1785968088651773, + "learning_rate": 5.136518124159162e-09, + "loss": 0.9681, + "step": 9316 + }, + { + "epoch": 0.9798730066914694, + "grad_norm": 2.5729066271800622, + "learning_rate": 5.0831701559855835e-09, + "loss": 1.0088, + "step": 9317 + }, + { + "epoch": 0.9799781771333167, + "grad_norm": 1.9530621342015528, + "learning_rate": 5.030100390419623e-09, + "loss": 0.9577, + "step": 9318 + }, + { + "epoch": 0.980083347575164, + "grad_norm": 2.267823738676506, + "learning_rate": 4.977308833379324e-09, + "loss": 1.018, + "step": 9319 + }, + { + "epoch": 0.9801885180170113, + "grad_norm": 2.2816893245073584, + "learning_rate": 4.924795490750811e-09, + "loss": 0.9672, + "step": 9320 + }, + { + "epoch": 0.9802936884588587, + "grad_norm": 1.9259466407396693, + "learning_rate": 4.872560368390233e-09, + "loss": 0.9453, + "step": 9321 + }, + { + "epoch": 0.980398858900706, + "grad_norm": 1.7974805667836606, + "learning_rate": 4.820603472121821e-09, + "loss": 0.9947, + "step": 9322 + }, + { + "epoch": 0.9805040293425533, + "grad_norm": 2.3791236198376646, + "learning_rate": 4.768924807739273e-09, + "loss": 0.967, + "step": 9323 + }, + { + "epoch": 0.9806091997844006, + "grad_norm": 2.4419036752163743, + "learning_rate": 4.717524381005478e-09, + "loss": 0.9996, + "step": 9324 + }, + { + "epoch": 0.980714370226248, + "grad_norm": 2.7332285105011347, + "learning_rate": 4.666402197651687e-09, + "loss": 0.9636, + "step": 9325 + }, + { + "epoch": 0.9808195406680952, + "grad_norm": 1.8101579312807496, + "learning_rate": 4.615558263378617e-09, + "loss": 1.0082, + "step": 9326 + }, + { + "epoch": 0.9809247111099425, + "grad_norm": 3.2434727994808683, + "learning_rate": 
4.5649925838553435e-09, + "loss": 1.0134, + "step": 9327 + }, + { + "epoch": 0.9810298815517898, + "grad_norm": 2.391788541909677, + "learning_rate": 4.5147051647206895e-09, + "loss": 1.0044, + "step": 9328 + }, + { + "epoch": 0.9811350519936372, + "grad_norm": 2.25620752561833, + "learning_rate": 4.464696011582115e-09, + "loss": 0.9879, + "step": 9329 + }, + { + "epoch": 0.9812402224354845, + "grad_norm": 2.4842392384829406, + "learning_rate": 4.414965130015991e-09, + "loss": 1.0062, + "step": 9330 + }, + { + "epoch": 0.9813453928773318, + "grad_norm": 2.2203661202851284, + "learning_rate": 4.365512525567605e-09, + "loss": 0.9546, + "step": 9331 + }, + { + "epoch": 0.9814505633191791, + "grad_norm": 2.711772293584609, + "learning_rate": 4.316338203751158e-09, + "loss": 0.9978, + "step": 9332 + }, + { + "epoch": 0.9815557337610264, + "grad_norm": 2.7700162537251076, + "learning_rate": 4.267442170050318e-09, + "loss": 0.9552, + "step": 9333 + }, + { + "epoch": 0.9816609042028738, + "grad_norm": 2.2577456211849176, + "learning_rate": 4.2188244299171126e-09, + "loss": 0.9859, + "step": 9334 + }, + { + "epoch": 0.9817660746447211, + "grad_norm": 2.5568997931283657, + "learning_rate": 4.170484988773038e-09, + "loss": 0.9332, + "step": 9335 + }, + { + "epoch": 0.9818712450865684, + "grad_norm": 1.9925306382045842, + "learning_rate": 4.122423852007951e-09, + "loss": 0.9782, + "step": 9336 + }, + { + "epoch": 0.9819764155284157, + "grad_norm": 2.921254419402326, + "learning_rate": 4.0746410249814515e-09, + "loss": 0.981, + "step": 9337 + }, + { + "epoch": 0.9820815859702631, + "grad_norm": 2.267912103577595, + "learning_rate": 4.027136513021501e-09, + "loss": 0.9974, + "step": 9338 + }, + { + "epoch": 0.9821867564121104, + "grad_norm": 4.541966161576452, + "learning_rate": 3.979910321425251e-09, + "loss": 1.0016, + "step": 9339 + }, + { + "epoch": 0.9822919268539577, + "grad_norm": 3.2761474483878206, + "learning_rate": 3.932962455458489e-09, + "loss": 0.9835, + "step": 9340 + }, + { + "epoch": 0.982397097295805, + "grad_norm": 2.228104089247583, + "learning_rate": 3.886292920356749e-09, + "loss": 1.0091, + "step": 9341 + }, + { + "epoch": 0.9825022677376524, + "grad_norm": 5.889922296145182, + "learning_rate": 3.839901721323925e-09, + "loss": 0.9941, + "step": 9342 + }, + { + "epoch": 0.9826074381794997, + "grad_norm": 1.9237694450938765, + "learning_rate": 3.793788863532822e-09, + "loss": 0.9619, + "step": 9343 + }, + { + "epoch": 0.982712608621347, + "grad_norm": 2.5296195092756477, + "learning_rate": 3.747954352125438e-09, + "loss": 0.958, + "step": 9344 + }, + { + "epoch": 0.9828177790631943, + "grad_norm": 2.864791152152932, + "learning_rate": 3.702398192212686e-09, + "loss": 0.9767, + "step": 9345 + }, + { + "epoch": 0.9829229495050416, + "grad_norm": 2.4375684665326727, + "learning_rate": 3.6571203888746685e-09, + "loss": 0.9733, + "step": 9346 + }, + { + "epoch": 0.9830281199468889, + "grad_norm": 2.0742623744580695, + "learning_rate": 3.6121209471595697e-09, + "loss": 1.0078, + "step": 9347 + }, + { + "epoch": 0.9831332903887362, + "grad_norm": 2.4063986427828614, + "learning_rate": 3.567399872085875e-09, + "loss": 0.998, + "step": 9348 + }, + { + "epoch": 0.9832384608305835, + "grad_norm": 2.383631755468102, + "learning_rate": 3.522957168640151e-09, + "loss": 0.9822, + "step": 9349 + }, + { + "epoch": 0.9833436312724309, + "grad_norm": 2.393452033459606, + "learning_rate": 3.4787928417778783e-09, + "loss": 0.9491, + "step": 9350 + }, + { + "epoch": 0.9834488017142782, + "grad_norm": 
2.1318023663228094, + "learning_rate": 3.4349068964240063e-09, + "loss": 1.0221, + "step": 9351 + }, + { + "epoch": 0.9835539721561255, + "grad_norm": 2.126532018705832, + "learning_rate": 3.391299337471843e-09, + "loss": 1.0536, + "step": 9352 + }, + { + "epoch": 0.9836591425979728, + "grad_norm": 2.4227691688428643, + "learning_rate": 3.3479701697841647e-09, + "loss": 0.9667, + "step": 9353 + }, + { + "epoch": 0.9837643130398201, + "grad_norm": 2.472893734369122, + "learning_rate": 3.304919398192663e-09, + "loss": 0.961, + "step": 9354 + }, + { + "epoch": 0.9838694834816675, + "grad_norm": 2.646191353766477, + "learning_rate": 3.2621470274973867e-09, + "loss": 1.0272, + "step": 9355 + }, + { + "epoch": 0.9839746539235148, + "grad_norm": 3.2371826662759817, + "learning_rate": 3.219653062468131e-09, + "loss": 0.9586, + "step": 9356 + }, + { + "epoch": 0.9840798243653621, + "grad_norm": 2.6252562394371273, + "learning_rate": 3.1774375078433284e-09, + "loss": 1.0218, + "step": 9357 + }, + { + "epoch": 0.9841849948072094, + "grad_norm": 2.883626529975225, + "learning_rate": 3.1355003683303243e-09, + "loss": 1.0251, + "step": 9358 + }, + { + "epoch": 0.9842901652490568, + "grad_norm": 2.101061184091713, + "learning_rate": 3.0938416486051004e-09, + "loss": 0.9871, + "step": 9359 + }, + { + "epoch": 0.9843953356909041, + "grad_norm": 2.825052483693705, + "learning_rate": 3.0524613533133853e-09, + "loss": 1.0467, + "step": 9360 + }, + { + "epoch": 0.9845005061327514, + "grad_norm": 2.1917761304189898, + "learning_rate": 3.011359487068988e-09, + "loss": 0.9531, + "step": 9361 + }, + { + "epoch": 0.9846056765745987, + "grad_norm": 2.087827643664742, + "learning_rate": 2.970536054455464e-09, + "loss": 0.9927, + "step": 9362 + }, + { + "epoch": 0.9847108470164461, + "grad_norm": 2.849510889632073, + "learning_rate": 2.9299910600247285e-09, + "loss": 0.9576, + "step": 9363 + }, + { + "epoch": 0.9848160174582934, + "grad_norm": 2.3562660790190324, + "learning_rate": 2.8897245082978863e-09, + "loss": 0.9732, + "step": 9364 + }, + { + "epoch": 0.9849211879001407, + "grad_norm": 2.3002496067135803, + "learning_rate": 2.849736403764958e-09, + "loss": 0.9836, + "step": 9365 + }, + { + "epoch": 0.985026358341988, + "grad_norm": 1.726852714144181, + "learning_rate": 2.810026750885153e-09, + "loss": 0.9782, + "step": 9366 + }, + { + "epoch": 0.9851315287838353, + "grad_norm": 2.796815170845847, + "learning_rate": 2.770595554086042e-09, + "loss": 0.9365, + "step": 9367 + }, + { + "epoch": 0.9852366992256826, + "grad_norm": 2.6158136518772763, + "learning_rate": 2.7314428177646622e-09, + "loss": 0.9473, + "step": 9368 + }, + { + "epoch": 0.9853418696675299, + "grad_norm": 2.2921022351525555, + "learning_rate": 2.6925685462869664e-09, + "loss": 0.9774, + "step": 9369 + }, + { + "epoch": 0.9854470401093772, + "grad_norm": 1.8665257532281978, + "learning_rate": 2.6539727439875427e-09, + "loss": 0.9579, + "step": 9370 + }, + { + "epoch": 0.9855522105512246, + "grad_norm": 2.0803248457969206, + "learning_rate": 2.6156554151704484e-09, + "loss": 0.9765, + "step": 9371 + }, + { + "epoch": 0.9856573809930719, + "grad_norm": 3.028285191313156, + "learning_rate": 2.5776165641080986e-09, + "loss": 0.9999, + "step": 9372 + }, + { + "epoch": 0.9857625514349192, + "grad_norm": 3.576679364872881, + "learning_rate": 2.539856195041823e-09, + "loss": 0.977, + "step": 9373 + }, + { + "epoch": 0.9858677218767665, + "grad_norm": 2.4305778370700826, + "learning_rate": 2.5023743121826983e-09, + "loss": 1.0111, + "step": 9374 + }, + { 
+ "epoch": 0.9859728923186138, + "grad_norm": 2.47884229852146, + "learning_rate": 2.465170919710158e-09, + "loss": 0.9983, + "step": 9375 + }, + { + "epoch": 0.9860780627604612, + "grad_norm": 2.7349159526364994, + "learning_rate": 2.428246021772551e-09, + "loss": 0.991, + "step": 9376 + }, + { + "epoch": 0.9861832332023085, + "grad_norm": 2.637560167507425, + "learning_rate": 2.391599622487417e-09, + "loss": 0.9667, + "step": 9377 + }, + { + "epoch": 0.9862884036441558, + "grad_norm": 1.9300449879076964, + "learning_rate": 2.3552317259409317e-09, + "loss": 1.0041, + "step": 9378 + }, + { + "epoch": 0.9863935740860031, + "grad_norm": 2.6765740954561363, + "learning_rate": 2.3191423361884626e-09, + "loss": 1.0011, + "step": 9379 + }, + { + "epoch": 0.9864987445278505, + "grad_norm": 2.616883579256923, + "learning_rate": 2.28333145725429e-09, + "loss": 1.0176, + "step": 9380 + }, + { + "epoch": 0.9866039149696978, + "grad_norm": 1.8573269651299802, + "learning_rate": 2.247799093131886e-09, + "loss": 0.969, + "step": 9381 + }, + { + "epoch": 0.9867090854115451, + "grad_norm": 2.60301904370189, + "learning_rate": 2.2125452477828047e-09, + "loss": 0.9769, + "step": 9382 + }, + { + "epoch": 0.9868142558533924, + "grad_norm": 1.9682961895311704, + "learning_rate": 2.1775699251383455e-09, + "loss": 0.9771, + "step": 9383 + }, + { + "epoch": 0.9869194262952398, + "grad_norm": 3.191223466664238, + "learning_rate": 2.1428731290987215e-09, + "loss": 1.046, + "step": 9384 + }, + { + "epoch": 0.9870245967370871, + "grad_norm": 2.892416445308266, + "learning_rate": 2.1084548635327828e-09, + "loss": 0.9734, + "step": 9385 + }, + { + "epoch": 0.9871297671789344, + "grad_norm": 3.095226990473517, + "learning_rate": 2.0743151322785703e-09, + "loss": 0.9719, + "step": 9386 + }, + { + "epoch": 0.9872349376207816, + "grad_norm": 2.7327344515972545, + "learning_rate": 2.0404539391427614e-09, + "loss": 0.9671, + "step": 9387 + }, + { + "epoch": 0.987340108062629, + "grad_norm": 2.9017278864296263, + "learning_rate": 2.0068712879009468e-09, + "loss": 1.0231, + "step": 9388 + }, + { + "epoch": 0.9874452785044763, + "grad_norm": 3.392301107777531, + "learning_rate": 1.973567182298186e-09, + "loss": 0.9928, + "step": 9389 + }, + { + "epoch": 0.9875504489463236, + "grad_norm": 2.3644487612910856, + "learning_rate": 1.9405416260481757e-09, + "loss": 0.9683, + "step": 9390 + }, + { + "epoch": 0.9876556193881709, + "grad_norm": 2.6559023017178194, + "learning_rate": 1.907794622833248e-09, + "loss": 0.9537, + "step": 9391 + }, + { + "epoch": 0.9877607898300182, + "grad_norm": 2.6251794536087494, + "learning_rate": 1.875326176304926e-09, + "loss": 1.0005, + "step": 9392 + }, + { + "epoch": 0.9878659602718656, + "grad_norm": 2.1131946840820253, + "learning_rate": 1.8431362900839244e-09, + "loss": 0.9866, + "step": 9393 + }, + { + "epoch": 0.9879711307137129, + "grad_norm": 3.060384152042341, + "learning_rate": 1.8112249677598726e-09, + "loss": 1.0004, + "step": 9394 + }, + { + "epoch": 0.9880763011555602, + "grad_norm": 2.7092939576558366, + "learning_rate": 1.7795922128904796e-09, + "loss": 0.9733, + "step": 9395 + }, + { + "epoch": 0.9881814715974075, + "grad_norm": 3.15026244560097, + "learning_rate": 1.7482380290034795e-09, + "loss": 0.9553, + "step": 9396 + }, + { + "epoch": 0.9882866420392549, + "grad_norm": 2.3143889813143637, + "learning_rate": 1.7171624195952418e-09, + "loss": 0.9775, + "step": 9397 + }, + { + "epoch": 0.9883918124811022, + "grad_norm": 1.749676378937021, + "learning_rate": 1.6863653881307728e-09, 
+ "loss": 1.0136, + "step": 9398 + }, + { + "epoch": 0.9884969829229495, + "grad_norm": 2.4561058588577636, + "learning_rate": 1.6558469380439923e-09, + "loss": 0.9778, + "step": 9399 + }, + { + "epoch": 0.9886021533647968, + "grad_norm": 2.869598988661229, + "learning_rate": 1.6256070727380114e-09, + "loss": 0.9961, + "step": 9400 + }, + { + "epoch": 0.9887073238066442, + "grad_norm": 1.3519472395346488, + "learning_rate": 1.5956457955848547e-09, + "loss": 0.9565, + "step": 9401 + }, + { + "epoch": 0.9888124942484915, + "grad_norm": 2.550759243468463, + "learning_rate": 1.5659631099257389e-09, + "loss": 0.9986, + "step": 9402 + }, + { + "epoch": 0.9889176646903388, + "grad_norm": 2.2780299064704668, + "learning_rate": 1.5365590190699608e-09, + "loss": 0.9818, + "step": 9403 + }, + { + "epoch": 0.9890228351321861, + "grad_norm": 2.5188559087529003, + "learning_rate": 1.507433526296842e-09, + "loss": 0.9911, + "step": 9404 + }, + { + "epoch": 0.9891280055740335, + "grad_norm": 2.188706310697837, + "learning_rate": 1.4785866348537848e-09, + "loss": 0.9693, + "step": 9405 + }, + { + "epoch": 0.9892331760158808, + "grad_norm": 1.809924536593071, + "learning_rate": 1.4500183479573825e-09, + "loss": 0.9738, + "step": 9406 + }, + { + "epoch": 0.989338346457728, + "grad_norm": 3.093352924429147, + "learning_rate": 1.4217286687936982e-09, + "loss": 1.0348, + "step": 9407 + }, + { + "epoch": 0.9894435168995753, + "grad_norm": 2.7413192203962407, + "learning_rate": 1.3937176005165975e-09, + "loss": 1.0094, + "step": 9408 + }, + { + "epoch": 0.9895486873414227, + "grad_norm": 1.9967362712782808, + "learning_rate": 1.3659851462499708e-09, + "loss": 0.9604, + "step": 9409 + }, + { + "epoch": 0.98965385778327, + "grad_norm": 3.023690617394417, + "learning_rate": 1.3385313090857888e-09, + "loss": 0.9499, + "step": 9410 + }, + { + "epoch": 0.9897590282251173, + "grad_norm": 3.0257951189785106, + "learning_rate": 1.3113560920860468e-09, + "loss": 0.9351, + "step": 9411 + }, + { + "epoch": 0.9898641986669646, + "grad_norm": 2.413833812119721, + "learning_rate": 1.284459498280266e-09, + "loss": 1.0105, + "step": 9412 + }, + { + "epoch": 0.989969369108812, + "grad_norm": 2.037001468545561, + "learning_rate": 1.257841530668269e-09, + "loss": 0.9782, + "step": 9413 + }, + { + "epoch": 0.9900745395506593, + "grad_norm": 1.6511342738858803, + "learning_rate": 1.2315021922176817e-09, + "loss": 0.9412, + "step": 9414 + }, + { + "epoch": 0.9901797099925066, + "grad_norm": 2.8902872997925804, + "learning_rate": 1.2054414858655995e-09, + "loss": 0.9708, + "step": 9415 + }, + { + "epoch": 0.9902848804343539, + "grad_norm": 4.027061186967938, + "learning_rate": 1.1796594145183081e-09, + "loss": 1.0044, + "step": 9416 + }, + { + "epoch": 0.9903900508762012, + "grad_norm": 2.0376936451035483, + "learning_rate": 1.1541559810504532e-09, + "loss": 0.9808, + "step": 9417 + }, + { + "epoch": 0.9904952213180486, + "grad_norm": 2.5382357729640037, + "learning_rate": 1.1289311883058707e-09, + "loss": 0.9738, + "step": 9418 + }, + { + "epoch": 0.9906003917598959, + "grad_norm": 1.7785026493838247, + "learning_rate": 1.1039850390973107e-09, + "loss": 0.9819, + "step": 9419 + }, + { + "epoch": 0.9907055622017432, + "grad_norm": 2.6469976920281453, + "learning_rate": 1.0793175362067143e-09, + "loss": 0.9929, + "step": 9420 + }, + { + "epoch": 0.9908107326435905, + "grad_norm": 2.0316413833430387, + "learning_rate": 1.054928682384382e-09, + "loss": 0.9638, + "step": 9421 + }, + { + "epoch": 0.9909159030854379, + "grad_norm": 
2.6241976883897, + "learning_rate": 1.0308184803498046e-09, + "loss": 0.9819, + "step": 9422 + }, + { + "epoch": 0.9910210735272852, + "grad_norm": 2.229629028463766, + "learning_rate": 1.0069869327919424e-09, + "loss": 0.9947, + "step": 9423 + }, + { + "epoch": 0.9911262439691325, + "grad_norm": 2.788186432126167, + "learning_rate": 9.834340423678368e-10, + "loss": 0.9522, + "step": 9424 + }, + { + "epoch": 0.9912314144109798, + "grad_norm": 2.698863781368917, + "learning_rate": 9.601598117037202e-10, + "loss": 0.9553, + "step": 9425 + }, + { + "epoch": 0.9913365848528272, + "grad_norm": 2.3066739529849736, + "learning_rate": 9.371642433952943e-10, + "loss": 0.9726, + "step": 9426 + }, + { + "epoch": 0.9914417552946745, + "grad_norm": 2.7776673360748387, + "learning_rate": 9.144473400060639e-10, + "loss": 1.005, + "step": 9427 + }, + { + "epoch": 0.9915469257365217, + "grad_norm": 2.033776373402885, + "learning_rate": 8.920091040698353e-10, + "loss": 0.9883, + "step": 9428 + }, + { + "epoch": 0.991652096178369, + "grad_norm": 2.9626974448308045, + "learning_rate": 8.698495380882188e-10, + "loss": 1.0091, + "step": 9429 + }, + { + "epoch": 0.9917572666202163, + "grad_norm": 2.412674687043095, + "learning_rate": 8.479686445322932e-10, + "loss": 0.9696, + "step": 9430 + }, + { + "epoch": 0.9918624370620637, + "grad_norm": 2.280018692073066, + "learning_rate": 8.263664258420512e-10, + "loss": 0.952, + "step": 9431 + }, + { + "epoch": 0.991967607503911, + "grad_norm": 2.536989287684638, + "learning_rate": 8.050428844261215e-10, + "loss": 1.0399, + "step": 9432 + }, + { + "epoch": 0.9920727779457583, + "grad_norm": 2.4868353766120803, + "learning_rate": 7.839980226623245e-10, + "loss": 0.9822, + "step": 9433 + }, + { + "epoch": 0.9921779483876056, + "grad_norm": 2.2345599750935303, + "learning_rate": 7.632318428976715e-10, + "loss": 0.9763, + "step": 9434 + }, + { + "epoch": 0.992283118829453, + "grad_norm": 2.7320164282700006, + "learning_rate": 7.427443474469776e-10, + "loss": 0.9614, + "step": 9435 + }, + { + "epoch": 0.9923882892713003, + "grad_norm": 2.693307353444875, + "learning_rate": 7.225355385953592e-10, + "loss": 0.9834, + "step": 9436 + }, + { + "epoch": 0.9924934597131476, + "grad_norm": 2.7375269267620266, + "learning_rate": 7.02605418596014e-10, + "loss": 1.0151, + "step": 9437 + }, + { + "epoch": 0.9925986301549949, + "grad_norm": 1.9154701081163854, + "learning_rate": 6.829539896716087e-10, + "loss": 0.9614, + "step": 9438 + }, + { + "epoch": 0.9927038005968423, + "grad_norm": 2.7318506358078727, + "learning_rate": 6.635812540131681e-10, + "loss": 0.9716, + "step": 9439 + }, + { + "epoch": 0.9928089710386896, + "grad_norm": 2.35587299097919, + "learning_rate": 6.444872137806313e-10, + "loss": 0.9932, + "step": 9440 + }, + { + "epoch": 0.9929141414805369, + "grad_norm": 2.7986557506186425, + "learning_rate": 6.256718711036835e-10, + "loss": 0.9715, + "step": 9441 + }, + { + "epoch": 0.9930193119223842, + "grad_norm": 2.958571293660485, + "learning_rate": 6.071352280800912e-10, + "loss": 0.988, + "step": 9442 + }, + { + "epoch": 0.9931244823642316, + "grad_norm": 3.0504234064777522, + "learning_rate": 5.888772867770897e-10, + "loss": 0.9765, + "step": 9443 + }, + { + "epoch": 0.9932296528060789, + "grad_norm": 2.2184835377963266, + "learning_rate": 5.708980492302729e-10, + "loss": 0.9987, + "step": 9444 + }, + { + "epoch": 0.9933348232479262, + "grad_norm": 1.9829209945604436, + "learning_rate": 5.531975174444259e-10, + "loss": 0.9188, + "step": 9445 + }, + { + "epoch": 
0.9934399936897735, + "grad_norm": 2.833298967270071, + "learning_rate": 5.357756933935254e-10, + "loss": 1.0, + "step": 9446 + }, + { + "epoch": 0.9935451641316209, + "grad_norm": 2.299238400231606, + "learning_rate": 5.186325790199065e-10, + "loss": 0.9838, + "step": 9447 + }, + { + "epoch": 0.9936503345734681, + "grad_norm": 2.1636055939503707, + "learning_rate": 5.017681762356507e-10, + "loss": 0.9973, + "step": 9448 + }, + { + "epoch": 0.9937555050153154, + "grad_norm": 2.607403264070489, + "learning_rate": 4.851824869211985e-10, + "loss": 1.0123, + "step": 9449 + }, + { + "epoch": 0.9938606754571627, + "grad_norm": 2.921227734664202, + "learning_rate": 4.688755129256262e-10, + "loss": 1.0149, + "step": 9450 + }, + { + "epoch": 0.99396584589901, + "grad_norm": 2.106489666904462, + "learning_rate": 4.5284725606747903e-10, + "loss": 0.9528, + "step": 9451 + }, + { + "epoch": 0.9940710163408574, + "grad_norm": 2.2948653311351146, + "learning_rate": 4.3709771813393864e-10, + "loss": 0.9819, + "step": 9452 + }, + { + "epoch": 0.9941761867827047, + "grad_norm": 2.5397350328464, + "learning_rate": 4.216269008813778e-10, + "loss": 1.0424, + "step": 9453 + }, + { + "epoch": 0.994281357224552, + "grad_norm": 2.6859348273107098, + "learning_rate": 4.0643480603480554e-10, + "loss": 0.9152, + "step": 9454 + }, + { + "epoch": 0.9943865276663993, + "grad_norm": 2.305716938487438, + "learning_rate": 3.9152143528842226e-10, + "loss": 0.9636, + "step": 9455 + }, + { + "epoch": 0.9944916981082467, + "grad_norm": 2.317768982172829, + "learning_rate": 3.768867903047868e-10, + "loss": 0.9895, + "step": 9456 + }, + { + "epoch": 0.994596868550094, + "grad_norm": 1.9307647936153862, + "learning_rate": 3.625308727162047e-10, + "loss": 0.9515, + "step": 9457 + }, + { + "epoch": 0.9947020389919413, + "grad_norm": 2.9486289730118234, + "learning_rate": 3.4845368412306235e-10, + "loss": 0.9608, + "step": 9458 + }, + { + "epoch": 0.9948072094337886, + "grad_norm": 2.821432702813321, + "learning_rate": 3.346552260954927e-10, + "loss": 1.0182, + "step": 9459 + }, + { + "epoch": 0.994912379875636, + "grad_norm": 2.0682478422118526, + "learning_rate": 3.2113550017198734e-10, + "loss": 0.9518, + "step": 9460 + }, + { + "epoch": 0.9950175503174833, + "grad_norm": 2.4469821699306205, + "learning_rate": 3.0789450785995155e-10, + "loss": 1.0303, + "step": 9461 + }, + { + "epoch": 0.9951227207593306, + "grad_norm": 2.318744276916468, + "learning_rate": 2.94932250635982e-10, + "loss": 0.9428, + "step": 9462 + }, + { + "epoch": 0.9952278912011779, + "grad_norm": 1.9099911442636568, + "learning_rate": 2.8224872994558895e-10, + "loss": 0.9649, + "step": 9463 + }, + { + "epoch": 0.9953330616430253, + "grad_norm": 2.244161598411751, + "learning_rate": 2.6984394720291904e-10, + "loss": 0.9282, + "step": 9464 + }, + { + "epoch": 0.9954382320848726, + "grad_norm": 2.0228783659285217, + "learning_rate": 2.5771790379103267e-10, + "loss": 1.0379, + "step": 9465 + }, + { + "epoch": 0.9955434025267199, + "grad_norm": 2.7663617676626044, + "learning_rate": 2.45870601062459e-10, + "loss": 0.9732, + "step": 9466 + }, + { + "epoch": 0.9956485729685672, + "grad_norm": 2.783584442780796, + "learning_rate": 2.3430204033808577e-10, + "loss": 0.981, + "step": 9467 + }, + { + "epoch": 0.9957537434104144, + "grad_norm": 2.2784586979725168, + "learning_rate": 2.2301222290771473e-10, + "loss": 0.9731, + "step": 9468 + }, + { + "epoch": 0.9958589138522618, + "grad_norm": 2.474615759545288, + "learning_rate": 2.1200115003061628e-10, + "loss": 0.9718, + 
"step": 9469 + }, + { + "epoch": 0.9959640842941091, + "grad_norm": 2.7375505869626546, + "learning_rate": 2.012688229344195e-10, + "loss": 0.9472, + "step": 9470 + }, + { + "epoch": 0.9960692547359564, + "grad_norm": 3.181469509796547, + "learning_rate": 1.9081524281566732e-10, + "loss": 0.9785, + "step": 9471 + }, + { + "epoch": 0.9961744251778037, + "grad_norm": 3.8615566778110924, + "learning_rate": 1.8064041084037143e-10, + "loss": 0.9676, + "step": 9472 + }, + { + "epoch": 0.9962795956196511, + "grad_norm": 3.381624272619375, + "learning_rate": 1.707443281429022e-10, + "loss": 1.0194, + "step": 9473 + }, + { + "epoch": 0.9963847660614984, + "grad_norm": 3.1413976288468852, + "learning_rate": 1.6112699582682135e-10, + "loss": 1.0142, + "step": 9474 + }, + { + "epoch": 0.9964899365033457, + "grad_norm": 2.6439768248020177, + "learning_rate": 1.517884149646043e-10, + "loss": 0.9825, + "step": 9475 + }, + { + "epoch": 0.996595106945193, + "grad_norm": 2.47459179696731, + "learning_rate": 1.4272858659736265e-10, + "loss": 0.9735, + "step": 9476 + }, + { + "epoch": 0.9967002773870404, + "grad_norm": 1.6572520116151759, + "learning_rate": 1.33947511735677e-10, + "loss": 0.967, + "step": 9477 + }, + { + "epoch": 0.9968054478288877, + "grad_norm": 2.1650133863954624, + "learning_rate": 1.2544519135820887e-10, + "loss": 0.9935, + "step": 9478 + }, + { + "epoch": 0.996910618270735, + "grad_norm": 2.6395311326751396, + "learning_rate": 1.1722162641336633e-10, + "loss": 0.9942, + "step": 9479 + }, + { + "epoch": 0.9970157887125823, + "grad_norm": 2.3369728830368817, + "learning_rate": 1.0927681781819354e-10, + "loss": 0.9496, + "step": 9480 + }, + { + "epoch": 0.9971209591544297, + "grad_norm": 2.9099781993657454, + "learning_rate": 1.0161076645837098e-10, + "loss": 0.9617, + "step": 9481 + }, + { + "epoch": 0.997226129596277, + "grad_norm": 1.9322457713584655, + "learning_rate": 9.422347318877034e-11, + "loss": 1.0036, + "step": 9482 + }, + { + "epoch": 0.9973313000381243, + "grad_norm": 3.2635257365367516, + "learning_rate": 8.711493883317712e-11, + "loss": 0.9886, + "step": 9483 + }, + { + "epoch": 0.9974364704799716, + "grad_norm": 2.220418457360787, + "learning_rate": 8.028516418456811e-11, + "loss": 0.9676, + "step": 9484 + }, + { + "epoch": 0.997541640921819, + "grad_norm": 2.3115695199846384, + "learning_rate": 7.37341500040012e-11, + "loss": 0.9517, + "step": 9485 + }, + { + "epoch": 0.9976468113636663, + "grad_norm": 2.013983684435446, + "learning_rate": 6.746189702228067e-11, + "loss": 0.9492, + "step": 9486 + }, + { + "epoch": 0.9977519818055136, + "grad_norm": 2.816746556589395, + "learning_rate": 6.14684059385695e-11, + "loss": 0.9404, + "step": 9487 + }, + { + "epoch": 0.9978571522473609, + "grad_norm": 2.0503969665852066, + "learning_rate": 5.575367742149951e-11, + "loss": 0.9634, + "step": 9488 + }, + { + "epoch": 0.9979623226892081, + "grad_norm": 1.724401911640861, + "learning_rate": 5.0317712108061135e-11, + "loss": 0.9739, + "step": 9489 + }, + { + "epoch": 0.9980674931310555, + "grad_norm": 2.1346980648205705, + "learning_rate": 4.5160510604436156e-11, + "loss": 0.9721, + "step": 9490 + }, + { + "epoch": 0.9981726635729028, + "grad_norm": 2.090678339510078, + "learning_rate": 4.028207348572011e-11, + "loss": 0.9147, + "step": 9491 + }, + { + "epoch": 0.9982778340147501, + "grad_norm": 2.392059438797523, + "learning_rate": 3.568240129619982e-11, + "loss": 1.0114, + "step": 9492 + }, + { + "epoch": 0.9983830044565974, + "grad_norm": 2.506048349048106, + "learning_rate": 
3.1361494548243224e-11, + "loss": 0.9004, + "step": 9493 + }, + { + "epoch": 0.9984881748984448, + "grad_norm": 2.7270228934874545, + "learning_rate": 2.7319353723964657e-11, + "loss": 0.9964, + "step": 9494 + }, + { + "epoch": 0.9985933453402921, + "grad_norm": 2.3101031627179864, + "learning_rate": 2.355597927411468e-11, + "loss": 0.9873, + "step": 9495 + }, + { + "epoch": 0.9986985157821394, + "grad_norm": 1.8549205557474275, + "learning_rate": 2.0071371618080037e-11, + "loss": 1.0167, + "step": 9496 + }, + { + "epoch": 0.9988036862239867, + "grad_norm": 2.4247474783252194, + "learning_rate": 1.686553114471634e-11, + "loss": 1.0092, + "step": 9497 + }, + { + "epoch": 0.9989088566658341, + "grad_norm": 2.5711740916551227, + "learning_rate": 1.3938458211515404e-11, + "loss": 0.996, + "step": 9498 + }, + { + "epoch": 0.9990140271076814, + "grad_norm": 2.6142784134493415, + "learning_rate": 1.1290153144605243e-11, + "loss": 0.936, + "step": 9499 + }, + { + "epoch": 0.9991191975495287, + "grad_norm": 2.4638656000993873, + "learning_rate": 8.920616239582735e-12, + "loss": 0.9754, + "step": 9500 + }, + { + "epoch": 0.999224367991376, + "grad_norm": 2.5496610952882843, + "learning_rate": 6.829847760403408e-12, + "loss": 0.9836, + "step": 9501 + }, + { + "epoch": 0.9993295384332234, + "grad_norm": 2.000881646427515, + "learning_rate": 5.017847940214093e-12, + "loss": 0.988, + "step": 9502 + }, + { + "epoch": 0.9994347088750707, + "grad_norm": 2.4163049388267765, + "learning_rate": 3.4846169813529394e-12, + "loss": 0.9964, + "step": 9503 + }, + { + "epoch": 0.999539879316918, + "grad_norm": 2.5790803945058864, + "learning_rate": 2.230155054516736e-12, + "loss": 0.9758, + "step": 9504 + }, + { + "epoch": 0.9996450497587653, + "grad_norm": 2.555299051963278, + "learning_rate": 1.2544622998711398e-12, + "loss": 1.0216, + "step": 9505 + }, + { + "epoch": 0.9997502202006127, + "grad_norm": 2.506038816699829, + "learning_rate": 5.575388259404512e-13, + "loss": 0.9635, + "step": 9506 + }, + { + "epoch": 0.99985539064246, + "grad_norm": 2.9286325043640606, + "learning_rate": 1.3938471044028234e-13, + "loss": 0.9874, + "step": 9507 + }, + { + "epoch": 0.9999605610843073, + "grad_norm": 1.8297745470230433, + "learning_rate": 0.0, + "loss": 0.948, + "step": 9508 + }, + { + "epoch": 0.9999605610843073, + "step": 9508, + "total_flos": 7610494285643776.0, + "train_loss": 1.0062551115383658, + "train_runtime": 85567.0625, + "train_samples_per_second": 7.112, + "train_steps_per_second": 0.111 + } + ], + "logging_steps": 1.0, + "max_steps": 9508, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7610494285643776.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}