diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57006 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8139, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012286521685710776, + "grad_norm": 5.513866090258891, + "learning_rate": 3.685503685503685e-08, + "loss": 0.9282, + "step": 1 + }, + { + "epoch": 0.0002457304337142155, + "grad_norm": 4.9554528632421695, + "learning_rate": 7.37100737100737e-08, + "loss": 0.9808, + "step": 2 + }, + { + "epoch": 0.00036859565057132326, + "grad_norm": 4.462062461574065, + "learning_rate": 1.1056511056511058e-07, + "loss": 0.9555, + "step": 3 + }, + { + "epoch": 0.000491460867428431, + "grad_norm": 4.656553225564516, + "learning_rate": 1.474201474201474e-07, + "loss": 0.923, + "step": 4 + }, + { + "epoch": 0.0006143260842855387, + "grad_norm": 4.3910806878746405, + "learning_rate": 1.8427518427518426e-07, + "loss": 0.9445, + "step": 5 + }, + { + "epoch": 0.0007371913011426465, + "grad_norm": 5.394970800217896, + "learning_rate": 2.2113022113022115e-07, + "loss": 1.0469, + "step": 6 + }, + { + "epoch": 0.0008600565179997543, + "grad_norm": 5.357960943161164, + "learning_rate": 2.57985257985258e-07, + "loss": 0.9159, + "step": 7 + }, + { + "epoch": 0.000982921734856862, + "grad_norm": 5.023035223958168, + "learning_rate": 2.948402948402948e-07, + "loss": 0.9649, + "step": 8 + }, + { + "epoch": 0.0011057869517139699, + "grad_norm": 5.6997600638412225, + "learning_rate": 3.3169533169533167e-07, + "loss": 0.9858, + "step": 9 + }, + { + "epoch": 0.0012286521685710775, + "grad_norm": 4.822320421005481, + "learning_rate": 3.685503685503685e-07, + "loss": 0.9486, + "step": 10 + }, + { + "epoch": 0.0013515173854281852, + "grad_norm": 6.200074438855475, + "learning_rate": 4.0540540540540546e-07, + "loss": 1.083, + "step": 11 + }, + { + "epoch": 0.001474382602285293, + "grad_norm": 5.925483423081176, + "learning_rate": 4.422604422604423e-07, + "loss": 0.9326, + "step": 12 + }, + { + "epoch": 0.0015972478191424008, + "grad_norm": 5.36625544305861, + "learning_rate": 4.791154791154791e-07, + "loss": 1.0109, + "step": 13 + }, + { + "epoch": 0.0017201130359995086, + "grad_norm": 4.321108070196379, + "learning_rate": 5.15970515970516e-07, + "loss": 0.8756, + "step": 14 + }, + { + "epoch": 0.0018429782528566164, + "grad_norm": 4.939830306048706, + "learning_rate": 5.528255528255528e-07, + "loss": 1.0655, + "step": 15 + }, + { + "epoch": 0.001965843469713724, + "grad_norm": 4.799782704105224, + "learning_rate": 5.896805896805896e-07, + "loss": 1.093, + "step": 16 + }, + { + "epoch": 0.0020887086865708318, + "grad_norm": 4.308163412038154, + "learning_rate": 6.265356265356265e-07, + "loss": 0.9188, + "step": 17 + }, + { + "epoch": 0.0022115739034279398, + "grad_norm": 5.033423055391164, + "learning_rate": 6.633906633906633e-07, + "loss": 0.9692, + "step": 18 + }, + { + "epoch": 0.0023344391202850473, + "grad_norm": 4.393822453744325, + "learning_rate": 7.002457002457002e-07, + "loss": 1.0138, + "step": 19 + }, + { + "epoch": 0.002457304337142155, + "grad_norm": 4.2097510698903, + "learning_rate": 7.37100737100737e-07, + "loss": 0.9279, + "step": 20 + }, + { + "epoch": 0.002580169553999263, + "grad_norm": 3.9117326521215965, + "learning_rate": 7.73955773955774e-07, + "loss": 1.0043, + "step": 21 + }, + { + "epoch": 0.0027030347708563705, + 
"grad_norm": 4.3969351698596295, + "learning_rate": 8.108108108108109e-07, + "loss": 0.9832, + "step": 22 + }, + { + "epoch": 0.0028258999877134785, + "grad_norm": 4.003253123142959, + "learning_rate": 8.476658476658478e-07, + "loss": 0.9429, + "step": 23 + }, + { + "epoch": 0.002948765204570586, + "grad_norm": 3.4494151704322524, + "learning_rate": 8.845208845208846e-07, + "loss": 0.9876, + "step": 24 + }, + { + "epoch": 0.0030716304214276936, + "grad_norm": 3.6501311378231653, + "learning_rate": 9.213759213759215e-07, + "loss": 0.867, + "step": 25 + }, + { + "epoch": 0.0031944956382848016, + "grad_norm": 3.3634474475404894, + "learning_rate": 9.582309582309582e-07, + "loss": 0.8617, + "step": 26 + }, + { + "epoch": 0.003317360855141909, + "grad_norm": 4.117736000713901, + "learning_rate": 9.95085995085995e-07, + "loss": 1.0234, + "step": 27 + }, + { + "epoch": 0.003440226071999017, + "grad_norm": 3.5647953413926996, + "learning_rate": 1.031941031941032e-06, + "loss": 0.9908, + "step": 28 + }, + { + "epoch": 0.003563091288856125, + "grad_norm": 3.866749586946385, + "learning_rate": 1.0687960687960688e-06, + "loss": 0.9723, + "step": 29 + }, + { + "epoch": 0.003685956505713233, + "grad_norm": 4.06338403512261, + "learning_rate": 1.1056511056511056e-06, + "loss": 0.9832, + "step": 30 + }, + { + "epoch": 0.0038088217225703404, + "grad_norm": 4.605165542662463, + "learning_rate": 1.1425061425061425e-06, + "loss": 0.9528, + "step": 31 + }, + { + "epoch": 0.003931686939427448, + "grad_norm": 3.4567611655777437, + "learning_rate": 1.1793611793611793e-06, + "loss": 0.8588, + "step": 32 + }, + { + "epoch": 0.004054552156284556, + "grad_norm": 3.4951438002510287, + "learning_rate": 1.2162162162162162e-06, + "loss": 0.9919, + "step": 33 + }, + { + "epoch": 0.0041774173731416635, + "grad_norm": 3.035179925207047, + "learning_rate": 1.253071253071253e-06, + "loss": 0.9158, + "step": 34 + }, + { + "epoch": 0.004300282589998771, + "grad_norm": 3.071876797860922, + "learning_rate": 1.28992628992629e-06, + "loss": 0.9149, + "step": 35 + }, + { + "epoch": 0.0044231478068558795, + "grad_norm": 4.111005566745022, + "learning_rate": 1.3267813267813267e-06, + "loss": 0.95, + "step": 36 + }, + { + "epoch": 0.004546013023712987, + "grad_norm": 3.8168902975192087, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.871, + "step": 37 + }, + { + "epoch": 0.004668878240570095, + "grad_norm": 3.8555521925282537, + "learning_rate": 1.4004914004914004e-06, + "loss": 0.9557, + "step": 38 + }, + { + "epoch": 0.004791743457427202, + "grad_norm": 2.939536985558811, + "learning_rate": 1.4373464373464373e-06, + "loss": 0.8727, + "step": 39 + }, + { + "epoch": 0.00491460867428431, + "grad_norm": 3.034058598134351, + "learning_rate": 1.474201474201474e-06, + "loss": 0.9908, + "step": 40 + }, + { + "epoch": 0.005037473891141418, + "grad_norm": 2.845652611291133, + "learning_rate": 1.5110565110565112e-06, + "loss": 0.8475, + "step": 41 + }, + { + "epoch": 0.005160339107998526, + "grad_norm": 2.2463314144132607, + "learning_rate": 1.547911547911548e-06, + "loss": 0.8375, + "step": 42 + }, + { + "epoch": 0.005283204324855633, + "grad_norm": 2.8150265768403635, + "learning_rate": 1.584766584766585e-06, + "loss": 0.8842, + "step": 43 + }, + { + "epoch": 0.005406069541712741, + "grad_norm": 3.400436755110308, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.9731, + "step": 44 + }, + { + "epoch": 0.0055289347585698485, + "grad_norm": 2.5875092640144626, + "learning_rate": 1.6584766584766586e-06, + "loss": 0.8952, + "step": 45 
+ }, + { + "epoch": 0.005651799975426957, + "grad_norm": 2.9967583649325187, + "learning_rate": 1.6953316953316955e-06, + "loss": 0.8758, + "step": 46 + }, + { + "epoch": 0.0057746651922840645, + "grad_norm": 3.726508092699981, + "learning_rate": 1.7321867321867323e-06, + "loss": 0.8588, + "step": 47 + }, + { + "epoch": 0.005897530409141172, + "grad_norm": 2.4185906853044563, + "learning_rate": 1.7690417690417692e-06, + "loss": 0.9433, + "step": 48 + }, + { + "epoch": 0.00602039562599828, + "grad_norm": 2.1300107100086225, + "learning_rate": 1.805896805896806e-06, + "loss": 0.8694, + "step": 49 + }, + { + "epoch": 0.006143260842855387, + "grad_norm": 1.9725055247468293, + "learning_rate": 1.842751842751843e-06, + "loss": 0.9715, + "step": 50 + }, + { + "epoch": 0.006266126059712496, + "grad_norm": 2.424151075834017, + "learning_rate": 1.8796068796068799e-06, + "loss": 0.8716, + "step": 51 + }, + { + "epoch": 0.006388991276569603, + "grad_norm": 2.47301723561335, + "learning_rate": 1.9164619164619164e-06, + "loss": 0.8766, + "step": 52 + }, + { + "epoch": 0.006511856493426711, + "grad_norm": 2.306221114798729, + "learning_rate": 1.9533169533169534e-06, + "loss": 0.9108, + "step": 53 + }, + { + "epoch": 0.006634721710283818, + "grad_norm": 2.7998718379596172, + "learning_rate": 1.99017199017199e-06, + "loss": 0.8301, + "step": 54 + }, + { + "epoch": 0.006757586927140926, + "grad_norm": 1.9316957290525076, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.8245, + "step": 55 + }, + { + "epoch": 0.006880452143998034, + "grad_norm": 2.3225785839846447, + "learning_rate": 2.063882063882064e-06, + "loss": 0.7795, + "step": 56 + }, + { + "epoch": 0.007003317360855142, + "grad_norm": 2.1506352887629903, + "learning_rate": 2.1007371007371007e-06, + "loss": 0.8121, + "step": 57 + }, + { + "epoch": 0.00712618257771225, + "grad_norm": 2.2414978938850094, + "learning_rate": 2.1375921375921377e-06, + "loss": 0.8411, + "step": 58 + }, + { + "epoch": 0.007249047794569357, + "grad_norm": 2.217372090144132, + "learning_rate": 2.1744471744471746e-06, + "loss": 0.8757, + "step": 59 + }, + { + "epoch": 0.007371913011426466, + "grad_norm": 2.2817429591369165, + "learning_rate": 2.211302211302211e-06, + "loss": 0.7842, + "step": 60 + }, + { + "epoch": 0.007494778228283573, + "grad_norm": 2.5386942795509246, + "learning_rate": 2.248157248157248e-06, + "loss": 0.9066, + "step": 61 + }, + { + "epoch": 0.007617643445140681, + "grad_norm": 2.324567330178371, + "learning_rate": 2.285012285012285e-06, + "loss": 0.8358, + "step": 62 + }, + { + "epoch": 0.007740508661997788, + "grad_norm": 2.4843663767121793, + "learning_rate": 2.321867321867322e-06, + "loss": 0.8161, + "step": 63 + }, + { + "epoch": 0.007863373878854897, + "grad_norm": 2.2400952081124554, + "learning_rate": 2.3587223587223586e-06, + "loss": 0.8063, + "step": 64 + }, + { + "epoch": 0.007986239095712003, + "grad_norm": 2.4470283289037416, + "learning_rate": 2.3955773955773955e-06, + "loss": 0.8059, + "step": 65 + }, + { + "epoch": 0.008109104312569112, + "grad_norm": 2.0603183792683124, + "learning_rate": 2.4324324324324325e-06, + "loss": 0.8019, + "step": 66 + }, + { + "epoch": 0.008231969529426219, + "grad_norm": 2.2266532809288075, + "learning_rate": 2.4692874692874694e-06, + "loss": 0.7921, + "step": 67 + }, + { + "epoch": 0.008354834746283327, + "grad_norm": 2.1533768123746198, + "learning_rate": 2.506142506142506e-06, + "loss": 0.7185, + "step": 68 + }, + { + "epoch": 0.008477699963140435, + "grad_norm": 2.2697446351636352, + "learning_rate": 
2.542997542997543e-06, + "loss": 0.8926, + "step": 69 + }, + { + "epoch": 0.008600565179997542, + "grad_norm": 1.9547412434546492, + "learning_rate": 2.57985257985258e-06, + "loss": 0.7148, + "step": 70 + }, + { + "epoch": 0.00872343039685465, + "grad_norm": 1.9371145887095345, + "learning_rate": 2.616707616707617e-06, + "loss": 0.7846, + "step": 71 + }, + { + "epoch": 0.008846295613711759, + "grad_norm": 2.537280666949752, + "learning_rate": 2.6535626535626533e-06, + "loss": 0.7594, + "step": 72 + }, + { + "epoch": 0.008969160830568866, + "grad_norm": 2.0134445444127267, + "learning_rate": 2.6904176904176907e-06, + "loss": 0.8376, + "step": 73 + }, + { + "epoch": 0.009092026047425974, + "grad_norm": 2.375681185480166, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.8542, + "step": 74 + }, + { + "epoch": 0.009214891264283081, + "grad_norm": 2.2284015663099996, + "learning_rate": 2.764127764127764e-06, + "loss": 0.7682, + "step": 75 + }, + { + "epoch": 0.00933775648114019, + "grad_norm": 2.1223572049006183, + "learning_rate": 2.8009828009828007e-06, + "loss": 0.8151, + "step": 76 + }, + { + "epoch": 0.009460621697997298, + "grad_norm": 1.9498992886735789, + "learning_rate": 2.837837837837838e-06, + "loss": 0.8871, + "step": 77 + }, + { + "epoch": 0.009583486914854404, + "grad_norm": 1.761698645327371, + "learning_rate": 2.8746928746928746e-06, + "loss": 0.7584, + "step": 78 + }, + { + "epoch": 0.009706352131711513, + "grad_norm": 2.124914597987514, + "learning_rate": 2.9115479115479116e-06, + "loss": 0.8169, + "step": 79 + }, + { + "epoch": 0.00982921734856862, + "grad_norm": 2.0919909269488186, + "learning_rate": 2.948402948402948e-06, + "loss": 0.6499, + "step": 80 + }, + { + "epoch": 0.009952082565425728, + "grad_norm": 2.099324623037599, + "learning_rate": 2.9852579852579855e-06, + "loss": 0.6526, + "step": 81 + }, + { + "epoch": 0.010074947782282836, + "grad_norm": 1.92075738433415, + "learning_rate": 3.0221130221130224e-06, + "loss": 0.9446, + "step": 82 + }, + { + "epoch": 0.010197812999139943, + "grad_norm": 2.4867794775340966, + "learning_rate": 3.058968058968059e-06, + "loss": 0.744, + "step": 83 + }, + { + "epoch": 0.010320678215997052, + "grad_norm": 2.151973857031448, + "learning_rate": 3.095823095823096e-06, + "loss": 0.7708, + "step": 84 + }, + { + "epoch": 0.010443543432854158, + "grad_norm": 2.2078944058232524, + "learning_rate": 3.132678132678133e-06, + "loss": 0.7984, + "step": 85 + }, + { + "epoch": 0.010566408649711267, + "grad_norm": 2.112322210243022, + "learning_rate": 3.16953316953317e-06, + "loss": 0.9349, + "step": 86 + }, + { + "epoch": 0.010689273866568375, + "grad_norm": 1.8966018599814107, + "learning_rate": 3.2063882063882063e-06, + "loss": 0.9199, + "step": 87 + }, + { + "epoch": 0.010812139083425482, + "grad_norm": 2.3004075590576076, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.8567, + "step": 88 + }, + { + "epoch": 0.01093500430028259, + "grad_norm": 2.0541443243309727, + "learning_rate": 3.2800982800982802e-06, + "loss": 0.8415, + "step": 89 + }, + { + "epoch": 0.011057869517139697, + "grad_norm": 1.7361116289703336, + "learning_rate": 3.316953316953317e-06, + "loss": 0.7144, + "step": 90 + }, + { + "epoch": 0.011180734733996806, + "grad_norm": 1.989445636753907, + "learning_rate": 3.3538083538083537e-06, + "loss": 0.691, + "step": 91 + }, + { + "epoch": 0.011303599950853914, + "grad_norm": 1.9607075864202708, + "learning_rate": 3.390663390663391e-06, + "loss": 0.709, + "step": 92 + }, + { + "epoch": 0.01142646516771102, + "grad_norm": 
1.9180637242155694, + "learning_rate": 3.4275184275184276e-06, + "loss": 0.8674, + "step": 93 + }, + { + "epoch": 0.011549330384568129, + "grad_norm": 2.176961099785198, + "learning_rate": 3.4643734643734646e-06, + "loss": 0.7773, + "step": 94 + }, + { + "epoch": 0.011672195601425236, + "grad_norm": 1.8711385480214902, + "learning_rate": 3.501228501228501e-06, + "loss": 0.8228, + "step": 95 + }, + { + "epoch": 0.011795060818282344, + "grad_norm": 2.7093646366375186, + "learning_rate": 3.5380835380835385e-06, + "loss": 0.8478, + "step": 96 + }, + { + "epoch": 0.011917926035139453, + "grad_norm": 1.9521329019033107, + "learning_rate": 3.574938574938575e-06, + "loss": 0.8348, + "step": 97 + }, + { + "epoch": 0.01204079125199656, + "grad_norm": 2.7549800721483524, + "learning_rate": 3.611793611793612e-06, + "loss": 0.7688, + "step": 98 + }, + { + "epoch": 0.012163656468853668, + "grad_norm": 2.123924485994918, + "learning_rate": 3.648648648648649e-06, + "loss": 0.7667, + "step": 99 + }, + { + "epoch": 0.012286521685710775, + "grad_norm": 1.7943724330319255, + "learning_rate": 3.685503685503686e-06, + "loss": 0.7188, + "step": 100 + }, + { + "epoch": 0.012409386902567883, + "grad_norm": 1.7062001924272912, + "learning_rate": 3.7223587223587224e-06, + "loss": 0.7607, + "step": 101 + }, + { + "epoch": 0.012532252119424991, + "grad_norm": 2.096044564947959, + "learning_rate": 3.7592137592137598e-06, + "loss": 0.7247, + "step": 102 + }, + { + "epoch": 0.012655117336282098, + "grad_norm": 1.9271775214159095, + "learning_rate": 3.796068796068796e-06, + "loss": 0.7519, + "step": 103 + }, + { + "epoch": 0.012777982553139207, + "grad_norm": 1.755264391118941, + "learning_rate": 3.832923832923833e-06, + "loss": 0.8325, + "step": 104 + }, + { + "epoch": 0.012900847769996313, + "grad_norm": 1.9989465666278592, + "learning_rate": 3.869778869778871e-06, + "loss": 0.689, + "step": 105 + }, + { + "epoch": 0.013023712986853422, + "grad_norm": 2.0000267967175565, + "learning_rate": 3.906633906633907e-06, + "loss": 0.6583, + "step": 106 + }, + { + "epoch": 0.01314657820371053, + "grad_norm": 1.9893807661366398, + "learning_rate": 3.943488943488944e-06, + "loss": 0.7455, + "step": 107 + }, + { + "epoch": 0.013269443420567637, + "grad_norm": 2.3835493881692096, + "learning_rate": 3.98034398034398e-06, + "loss": 0.7274, + "step": 108 + }, + { + "epoch": 0.013392308637424745, + "grad_norm": 1.7164620683578935, + "learning_rate": 4.0171990171990176e-06, + "loss": 0.7556, + "step": 109 + }, + { + "epoch": 0.013515173854281852, + "grad_norm": 2.2908311887870694, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.835, + "step": 110 + }, + { + "epoch": 0.01363803907113896, + "grad_norm": 2.039544661558442, + "learning_rate": 4.090909090909091e-06, + "loss": 0.8806, + "step": 111 + }, + { + "epoch": 0.013760904287996069, + "grad_norm": 1.9367676780592882, + "learning_rate": 4.127764127764128e-06, + "loss": 0.8225, + "step": 112 + }, + { + "epoch": 0.013883769504853176, + "grad_norm": 2.1901025979950868, + "learning_rate": 4.164619164619165e-06, + "loss": 0.827, + "step": 113 + }, + { + "epoch": 0.014006634721710284, + "grad_norm": 1.9334515380670565, + "learning_rate": 4.2014742014742015e-06, + "loss": 0.7683, + "step": 114 + }, + { + "epoch": 0.014129499938567392, + "grad_norm": 1.7940119923486164, + "learning_rate": 4.2383292383292384e-06, + "loss": 0.8543, + "step": 115 + }, + { + "epoch": 0.0142523651554245, + "grad_norm": 1.646323482780624, + "learning_rate": 4.275184275184275e-06, + "loss": 0.8729, + "step": 116 
+ }, + { + "epoch": 0.014375230372281608, + "grad_norm": 1.9306814941290429, + "learning_rate": 4.312039312039312e-06, + "loss": 0.7837, + "step": 117 + }, + { + "epoch": 0.014498095589138714, + "grad_norm": 1.6747792227115856, + "learning_rate": 4.348894348894349e-06, + "loss": 0.7657, + "step": 118 + }, + { + "epoch": 0.014620960805995823, + "grad_norm": 2.5116743414061493, + "learning_rate": 4.385749385749385e-06, + "loss": 0.7912, + "step": 119 + }, + { + "epoch": 0.014743826022852931, + "grad_norm": 2.2543920257744796, + "learning_rate": 4.422604422604422e-06, + "loss": 0.7871, + "step": 120 + }, + { + "epoch": 0.014866691239710038, + "grad_norm": 1.9945810171170197, + "learning_rate": 4.45945945945946e-06, + "loss": 0.7342, + "step": 121 + }, + { + "epoch": 0.014989556456567146, + "grad_norm": 2.2204859893567943, + "learning_rate": 4.496314496314496e-06, + "loss": 0.8786, + "step": 122 + }, + { + "epoch": 0.015112421673424253, + "grad_norm": 2.002919416848398, + "learning_rate": 4.533169533169533e-06, + "loss": 0.7228, + "step": 123 + }, + { + "epoch": 0.015235286890281361, + "grad_norm": 1.868216079800274, + "learning_rate": 4.57002457002457e-06, + "loss": 0.742, + "step": 124 + }, + { + "epoch": 0.01535815210713847, + "grad_norm": 1.8005633082126062, + "learning_rate": 4.606879606879607e-06, + "loss": 0.8712, + "step": 125 + }, + { + "epoch": 0.015481017323995577, + "grad_norm": 1.8166686764177278, + "learning_rate": 4.643734643734644e-06, + "loss": 0.9068, + "step": 126 + }, + { + "epoch": 0.015603882540852685, + "grad_norm": 2.0515222597254805, + "learning_rate": 4.680589680589681e-06, + "loss": 0.8018, + "step": 127 + }, + { + "epoch": 0.015726747757709793, + "grad_norm": 2.279636341044896, + "learning_rate": 4.717444717444717e-06, + "loss": 0.7718, + "step": 128 + }, + { + "epoch": 0.0158496129745669, + "grad_norm": 1.8378117347640532, + "learning_rate": 4.754299754299755e-06, + "loss": 0.6841, + "step": 129 + }, + { + "epoch": 0.015972478191424007, + "grad_norm": 2.218430722039516, + "learning_rate": 4.791154791154791e-06, + "loss": 0.8657, + "step": 130 + }, + { + "epoch": 0.016095343408281115, + "grad_norm": 1.916287061479979, + "learning_rate": 4.828009828009828e-06, + "loss": 0.6776, + "step": 131 + }, + { + "epoch": 0.016218208625138224, + "grad_norm": 1.7620338730219163, + "learning_rate": 4.864864864864865e-06, + "loss": 0.7414, + "step": 132 + }, + { + "epoch": 0.016341073841995332, + "grad_norm": 1.7543931040893224, + "learning_rate": 4.901719901719902e-06, + "loss": 0.7906, + "step": 133 + }, + { + "epoch": 0.016463939058852437, + "grad_norm": 1.5496112007083318, + "learning_rate": 4.938574938574939e-06, + "loss": 0.8203, + "step": 134 + }, + { + "epoch": 0.016586804275709546, + "grad_norm": 1.7438331967702376, + "learning_rate": 4.975429975429976e-06, + "loss": 0.8933, + "step": 135 + }, + { + "epoch": 0.016709669492566654, + "grad_norm": 1.8605660015672187, + "learning_rate": 5.012285012285012e-06, + "loss": 0.7005, + "step": 136 + }, + { + "epoch": 0.016832534709423762, + "grad_norm": 2.2229713433830343, + "learning_rate": 5.04914004914005e-06, + "loss": 0.7777, + "step": 137 + }, + { + "epoch": 0.01695539992628087, + "grad_norm": 1.9301500618954412, + "learning_rate": 5.085995085995086e-06, + "loss": 0.7667, + "step": 138 + }, + { + "epoch": 0.01707826514313798, + "grad_norm": 1.9473626434903029, + "learning_rate": 5.122850122850123e-06, + "loss": 0.7669, + "step": 139 + }, + { + "epoch": 0.017201130359995084, + "grad_norm": 1.9268872807220647, + "learning_rate": 
5.15970515970516e-06, + "loss": 0.8654, + "step": 140 + }, + { + "epoch": 0.017323995576852193, + "grad_norm": 2.0841169041870757, + "learning_rate": 5.196560196560197e-06, + "loss": 0.8258, + "step": 141 + }, + { + "epoch": 0.0174468607937093, + "grad_norm": 1.8341245823766783, + "learning_rate": 5.233415233415234e-06, + "loss": 0.9055, + "step": 142 + }, + { + "epoch": 0.01756972601056641, + "grad_norm": 1.988196597099631, + "learning_rate": 5.2702702702702705e-06, + "loss": 0.7418, + "step": 143 + }, + { + "epoch": 0.017692591227423518, + "grad_norm": 2.163121152614218, + "learning_rate": 5.307125307125307e-06, + "loss": 0.9437, + "step": 144 + }, + { + "epoch": 0.017815456444280623, + "grad_norm": 2.2416333125988923, + "learning_rate": 5.3439803439803444e-06, + "loss": 0.7348, + "step": 145 + }, + { + "epoch": 0.01793832166113773, + "grad_norm": 2.1561442382301426, + "learning_rate": 5.380835380835381e-06, + "loss": 0.7212, + "step": 146 + }, + { + "epoch": 0.01806118687799484, + "grad_norm": 2.2981656479629393, + "learning_rate": 5.4176904176904175e-06, + "loss": 0.7829, + "step": 147 + }, + { + "epoch": 0.01818405209485195, + "grad_norm": 2.0617711334227113, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.7847, + "step": 148 + }, + { + "epoch": 0.018306917311709057, + "grad_norm": 1.7819338394255597, + "learning_rate": 5.491400491400491e-06, + "loss": 0.6362, + "step": 149 + }, + { + "epoch": 0.018429782528566162, + "grad_norm": 1.8454195227172059, + "learning_rate": 5.528255528255528e-06, + "loss": 0.7743, + "step": 150 + }, + { + "epoch": 0.01855264774542327, + "grad_norm": 1.9648565931926607, + "learning_rate": 5.565110565110565e-06, + "loss": 0.6764, + "step": 151 + }, + { + "epoch": 0.01867551296228038, + "grad_norm": 2.0118392830540253, + "learning_rate": 5.601965601965601e-06, + "loss": 0.6951, + "step": 152 + }, + { + "epoch": 0.018798378179137487, + "grad_norm": 1.803135424324761, + "learning_rate": 5.638820638820639e-06, + "loss": 0.818, + "step": 153 + }, + { + "epoch": 0.018921243395994596, + "grad_norm": 1.9795101473740746, + "learning_rate": 5.675675675675676e-06, + "loss": 0.8511, + "step": 154 + }, + { + "epoch": 0.0190441086128517, + "grad_norm": 1.7322842434464771, + "learning_rate": 5.712530712530712e-06, + "loss": 0.7128, + "step": 155 + }, + { + "epoch": 0.01916697382970881, + "grad_norm": 1.9785431181483861, + "learning_rate": 5.749385749385749e-06, + "loss": 0.732, + "step": 156 + }, + { + "epoch": 0.019289839046565917, + "grad_norm": 2.193855981127751, + "learning_rate": 5.786240786240787e-06, + "loss": 0.8811, + "step": 157 + }, + { + "epoch": 0.019412704263423026, + "grad_norm": 2.1286480959222667, + "learning_rate": 5.823095823095823e-06, + "loss": 0.7895, + "step": 158 + }, + { + "epoch": 0.019535569480280134, + "grad_norm": 2.1859847721366954, + "learning_rate": 5.85995085995086e-06, + "loss": 0.8027, + "step": 159 + }, + { + "epoch": 0.01965843469713724, + "grad_norm": 1.9324074142237098, + "learning_rate": 5.896805896805896e-06, + "loss": 0.7395, + "step": 160 + }, + { + "epoch": 0.019781299913994348, + "grad_norm": 1.800633659071209, + "learning_rate": 5.933660933660934e-06, + "loss": 0.7415, + "step": 161 + }, + { + "epoch": 0.019904165130851456, + "grad_norm": 2.643996006904747, + "learning_rate": 5.970515970515971e-06, + "loss": 0.8222, + "step": 162 + }, + { + "epoch": 0.020027030347708565, + "grad_norm": 1.967104685269245, + "learning_rate": 6.007371007371007e-06, + "loss": 0.6736, + "step": 163 + }, + { + "epoch": 0.020149895564565673, + 
"grad_norm": 2.3088778995166312, + "learning_rate": 6.044226044226045e-06, + "loss": 0.7391, + "step": 164 + }, + { + "epoch": 0.020272760781422778, + "grad_norm": 1.8578933501590726, + "learning_rate": 6.081081081081082e-06, + "loss": 0.6349, + "step": 165 + }, + { + "epoch": 0.020395625998279886, + "grad_norm": 1.5977353959892007, + "learning_rate": 6.117936117936118e-06, + "loss": 0.856, + "step": 166 + }, + { + "epoch": 0.020518491215136995, + "grad_norm": 2.41916844601463, + "learning_rate": 6.154791154791155e-06, + "loss": 0.7527, + "step": 167 + }, + { + "epoch": 0.020641356431994103, + "grad_norm": 2.0022315718215435, + "learning_rate": 6.191646191646192e-06, + "loss": 0.7005, + "step": 168 + }, + { + "epoch": 0.02076422164885121, + "grad_norm": 1.9040703311352212, + "learning_rate": 6.228501228501229e-06, + "loss": 0.6852, + "step": 169 + }, + { + "epoch": 0.020887086865708317, + "grad_norm": 2.176320105062913, + "learning_rate": 6.265356265356266e-06, + "loss": 0.8448, + "step": 170 + }, + { + "epoch": 0.021009952082565425, + "grad_norm": 1.9266691571891912, + "learning_rate": 6.302211302211302e-06, + "loss": 0.7955, + "step": 171 + }, + { + "epoch": 0.021132817299422534, + "grad_norm": 1.8114027476423795, + "learning_rate": 6.33906633906634e-06, + "loss": 0.7434, + "step": 172 + }, + { + "epoch": 0.021255682516279642, + "grad_norm": 1.9515491664797833, + "learning_rate": 6.3759213759213766e-06, + "loss": 0.7566, + "step": 173 + }, + { + "epoch": 0.02137854773313675, + "grad_norm": 2.025956196474469, + "learning_rate": 6.412776412776413e-06, + "loss": 0.9448, + "step": 174 + }, + { + "epoch": 0.021501412949993855, + "grad_norm": 1.7634395032917143, + "learning_rate": 6.44963144963145e-06, + "loss": 0.7636, + "step": 175 + }, + { + "epoch": 0.021624278166850964, + "grad_norm": 2.070407439272014, + "learning_rate": 6.486486486486487e-06, + "loss": 0.8912, + "step": 176 + }, + { + "epoch": 0.021747143383708072, + "grad_norm": 2.2960900402022246, + "learning_rate": 6.5233415233415235e-06, + "loss": 0.7653, + "step": 177 + }, + { + "epoch": 0.02187000860056518, + "grad_norm": 2.1754850194688036, + "learning_rate": 6.5601965601965605e-06, + "loss": 0.9228, + "step": 178 + }, + { + "epoch": 0.02199287381742229, + "grad_norm": 2.0374135782794585, + "learning_rate": 6.5970515970515966e-06, + "loss": 0.6933, + "step": 179 + }, + { + "epoch": 0.022115739034279394, + "grad_norm": 2.0661917296130095, + "learning_rate": 6.633906633906634e-06, + "loss": 0.6648, + "step": 180 + }, + { + "epoch": 0.022238604251136503, + "grad_norm": 1.8722756101140505, + "learning_rate": 6.670761670761671e-06, + "loss": 0.7699, + "step": 181 + }, + { + "epoch": 0.02236146946799361, + "grad_norm": 2.1776684326905933, + "learning_rate": 6.707616707616707e-06, + "loss": 0.7335, + "step": 182 + }, + { + "epoch": 0.02248433468485072, + "grad_norm": 1.6129926525625968, + "learning_rate": 6.744471744471744e-06, + "loss": 0.7357, + "step": 183 + }, + { + "epoch": 0.022607199901707828, + "grad_norm": 2.1277756112782567, + "learning_rate": 6.781326781326782e-06, + "loss": 0.7046, + "step": 184 + }, + { + "epoch": 0.022730065118564933, + "grad_norm": 2.3391912703074724, + "learning_rate": 6.818181818181818e-06, + "loss": 0.7216, + "step": 185 + }, + { + "epoch": 0.02285293033542204, + "grad_norm": 2.1511290166475003, + "learning_rate": 6.855036855036855e-06, + "loss": 0.8211, + "step": 186 + }, + { + "epoch": 0.02297579555227915, + "grad_norm": 1.6719111505737243, + "learning_rate": 6.891891891891892e-06, + "loss": 0.7054, 
+ "step": 187 + }, + { + "epoch": 0.023098660769136258, + "grad_norm": 2.5515281191195154, + "learning_rate": 6.928746928746929e-06, + "loss": 0.8233, + "step": 188 + }, + { + "epoch": 0.023221525985993367, + "grad_norm": 1.900669341574493, + "learning_rate": 6.965601965601966e-06, + "loss": 0.7284, + "step": 189 + }, + { + "epoch": 0.02334439120285047, + "grad_norm": 1.8850453862694156, + "learning_rate": 7.002457002457002e-06, + "loss": 0.7119, + "step": 190 + }, + { + "epoch": 0.02346725641970758, + "grad_norm": 1.9640177846783462, + "learning_rate": 7.039312039312039e-06, + "loss": 0.6226, + "step": 191 + }, + { + "epoch": 0.02359012163656469, + "grad_norm": 1.78285590490415, + "learning_rate": 7.076167076167077e-06, + "loss": 0.7527, + "step": 192 + }, + { + "epoch": 0.023712986853421797, + "grad_norm": 1.909904047542188, + "learning_rate": 7.113022113022113e-06, + "loss": 0.6273, + "step": 193 + }, + { + "epoch": 0.023835852070278905, + "grad_norm": 1.5774860471993846, + "learning_rate": 7.14987714987715e-06, + "loss": 0.7097, + "step": 194 + }, + { + "epoch": 0.02395871728713601, + "grad_norm": 1.7822894593923615, + "learning_rate": 7.186732186732187e-06, + "loss": 0.7927, + "step": 195 + }, + { + "epoch": 0.02408158250399312, + "grad_norm": 1.9467034435413708, + "learning_rate": 7.223587223587224e-06, + "loss": 0.8011, + "step": 196 + }, + { + "epoch": 0.024204447720850227, + "grad_norm": 2.198717952759409, + "learning_rate": 7.260442260442261e-06, + "loss": 0.772, + "step": 197 + }, + { + "epoch": 0.024327312937707336, + "grad_norm": 2.183689896839222, + "learning_rate": 7.297297297297298e-06, + "loss": 0.783, + "step": 198 + }, + { + "epoch": 0.024450178154564444, + "grad_norm": 1.8304193568098985, + "learning_rate": 7.334152334152334e-06, + "loss": 0.6804, + "step": 199 + }, + { + "epoch": 0.02457304337142155, + "grad_norm": 2.1180414532189467, + "learning_rate": 7.371007371007372e-06, + "loss": 0.6441, + "step": 200 + }, + { + "epoch": 0.024695908588278657, + "grad_norm": 2.22447091842928, + "learning_rate": 7.407862407862408e-06, + "loss": 0.8026, + "step": 201 + }, + { + "epoch": 0.024818773805135766, + "grad_norm": 1.9875736683941647, + "learning_rate": 7.444717444717445e-06, + "loss": 0.7405, + "step": 202 + }, + { + "epoch": 0.024941639021992874, + "grad_norm": 1.988612777537409, + "learning_rate": 7.481572481572482e-06, + "loss": 0.7623, + "step": 203 + }, + { + "epoch": 0.025064504238849983, + "grad_norm": 1.983215440853281, + "learning_rate": 7.5184275184275195e-06, + "loss": 0.7147, + "step": 204 + }, + { + "epoch": 0.025187369455707088, + "grad_norm": 2.0591468153698536, + "learning_rate": 7.555282555282556e-06, + "loss": 0.8068, + "step": 205 + }, + { + "epoch": 0.025310234672564196, + "grad_norm": 2.424275565078959, + "learning_rate": 7.592137592137592e-06, + "loss": 0.6661, + "step": 206 + }, + { + "epoch": 0.025433099889421305, + "grad_norm": 1.8556122946270948, + "learning_rate": 7.6289926289926295e-06, + "loss": 0.7364, + "step": 207 + }, + { + "epoch": 0.025555965106278413, + "grad_norm": 2.0226528495706426, + "learning_rate": 7.665847665847666e-06, + "loss": 0.7476, + "step": 208 + }, + { + "epoch": 0.02567883032313552, + "grad_norm": 2.0839462629230394, + "learning_rate": 7.702702702702703e-06, + "loss": 0.7402, + "step": 209 + }, + { + "epoch": 0.025801695539992626, + "grad_norm": 1.7157059567245307, + "learning_rate": 7.739557739557741e-06, + "loss": 0.6697, + "step": 210 + }, + { + "epoch": 0.025924560756849735, + "grad_norm": 2.221003192137898, + 
"learning_rate": 7.776412776412776e-06, + "loss": 0.7494, + "step": 211 + }, + { + "epoch": 0.026047425973706843, + "grad_norm": 2.0068010262632767, + "learning_rate": 7.813267813267813e-06, + "loss": 0.6485, + "step": 212 + }, + { + "epoch": 0.026170291190563952, + "grad_norm": 1.9835837288566343, + "learning_rate": 7.85012285012285e-06, + "loss": 0.7184, + "step": 213 + }, + { + "epoch": 0.02629315640742106, + "grad_norm": 1.81943468759955, + "learning_rate": 7.886977886977887e-06, + "loss": 0.7361, + "step": 214 + }, + { + "epoch": 0.026416021624278165, + "grad_norm": 2.196400858375408, + "learning_rate": 7.923832923832924e-06, + "loss": 0.7529, + "step": 215 + }, + { + "epoch": 0.026538886841135274, + "grad_norm": 2.278266713871535, + "learning_rate": 7.96068796068796e-06, + "loss": 0.6606, + "step": 216 + }, + { + "epoch": 0.026661752057992382, + "grad_norm": 2.7429671536295697, + "learning_rate": 7.997542997542998e-06, + "loss": 0.7663, + "step": 217 + }, + { + "epoch": 0.02678461727484949, + "grad_norm": 1.948658312851503, + "learning_rate": 8.034398034398035e-06, + "loss": 0.6791, + "step": 218 + }, + { + "epoch": 0.0269074824917066, + "grad_norm": 2.1016293365734637, + "learning_rate": 8.07125307125307e-06, + "loss": 0.6664, + "step": 219 + }, + { + "epoch": 0.027030347708563704, + "grad_norm": 2.165590672020245, + "learning_rate": 8.108108108108109e-06, + "loss": 0.8209, + "step": 220 + }, + { + "epoch": 0.027153212925420812, + "grad_norm": 1.8489369609520914, + "learning_rate": 8.144963144963144e-06, + "loss": 0.7198, + "step": 221 + }, + { + "epoch": 0.02727607814227792, + "grad_norm": 1.8035798718548126, + "learning_rate": 8.181818181818181e-06, + "loss": 0.7851, + "step": 222 + }, + { + "epoch": 0.02739894335913503, + "grad_norm": 1.9709884836797689, + "learning_rate": 8.21867321867322e-06, + "loss": 0.7337, + "step": 223 + }, + { + "epoch": 0.027521808575992138, + "grad_norm": 2.0840307607391977, + "learning_rate": 8.255528255528255e-06, + "loss": 0.7496, + "step": 224 + }, + { + "epoch": 0.027644673792849243, + "grad_norm": 2.355604284748557, + "learning_rate": 8.292383292383292e-06, + "loss": 0.6245, + "step": 225 + }, + { + "epoch": 0.02776753900970635, + "grad_norm": 2.0079503059460353, + "learning_rate": 8.32923832923833e-06, + "loss": 0.7352, + "step": 226 + }, + { + "epoch": 0.02789040422656346, + "grad_norm": 2.0985545780787294, + "learning_rate": 8.366093366093366e-06, + "loss": 0.6448, + "step": 227 + }, + { + "epoch": 0.028013269443420568, + "grad_norm": 1.7189616249257684, + "learning_rate": 8.402948402948403e-06, + "loss": 0.6683, + "step": 228 + }, + { + "epoch": 0.028136134660277676, + "grad_norm": 2.010324383403565, + "learning_rate": 8.43980343980344e-06, + "loss": 0.6502, + "step": 229 + }, + { + "epoch": 0.028258999877134785, + "grad_norm": 2.093256992408434, + "learning_rate": 8.476658476658477e-06, + "loss": 0.8893, + "step": 230 + }, + { + "epoch": 0.02838186509399189, + "grad_norm": 1.9484291029299183, + "learning_rate": 8.513513513513514e-06, + "loss": 0.7864, + "step": 231 + }, + { + "epoch": 0.028504730310849, + "grad_norm": 2.071568161786432, + "learning_rate": 8.55036855036855e-06, + "loss": 0.7149, + "step": 232 + }, + { + "epoch": 0.028627595527706107, + "grad_norm": 2.0271669833556056, + "learning_rate": 8.587223587223588e-06, + "loss": 0.6132, + "step": 233 + }, + { + "epoch": 0.028750460744563215, + "grad_norm": 1.9102956251761058, + "learning_rate": 8.624078624078625e-06, + "loss": 0.712, + "step": 234 + }, + { + "epoch": 
0.028873325961420324, + "grad_norm": 1.6778675992552967, + "learning_rate": 8.66093366093366e-06, + "loss": 0.7827, + "step": 235 + }, + { + "epoch": 0.02899619117827743, + "grad_norm": 2.0948610150941485, + "learning_rate": 8.697788697788699e-06, + "loss": 0.7089, + "step": 236 + }, + { + "epoch": 0.029119056395134537, + "grad_norm": 1.8346975899689884, + "learning_rate": 8.734643734643734e-06, + "loss": 0.727, + "step": 237 + }, + { + "epoch": 0.029241921611991645, + "grad_norm": 1.8247128431766366, + "learning_rate": 8.77149877149877e-06, + "loss": 0.6293, + "step": 238 + }, + { + "epoch": 0.029364786828848754, + "grad_norm": 1.7470054252402831, + "learning_rate": 8.80835380835381e-06, + "loss": 0.6507, + "step": 239 + }, + { + "epoch": 0.029487652045705862, + "grad_norm": 2.1064269646877927, + "learning_rate": 8.845208845208845e-06, + "loss": 0.707, + "step": 240 + }, + { + "epoch": 0.029610517262562967, + "grad_norm": 1.859638512263217, + "learning_rate": 8.882063882063882e-06, + "loss": 0.7611, + "step": 241 + }, + { + "epoch": 0.029733382479420076, + "grad_norm": 1.9257602100834517, + "learning_rate": 8.91891891891892e-06, + "loss": 0.8156, + "step": 242 + }, + { + "epoch": 0.029856247696277184, + "grad_norm": 2.026208368297337, + "learning_rate": 8.955773955773956e-06, + "loss": 0.7262, + "step": 243 + }, + { + "epoch": 0.029979112913134293, + "grad_norm": 1.9594383548407612, + "learning_rate": 8.992628992628992e-06, + "loss": 0.7067, + "step": 244 + }, + { + "epoch": 0.0301019781299914, + "grad_norm": 2.048343815790902, + "learning_rate": 9.02948402948403e-06, + "loss": 0.7218, + "step": 245 + }, + { + "epoch": 0.030224843346848506, + "grad_norm": 2.312213736657857, + "learning_rate": 9.066339066339066e-06, + "loss": 0.7519, + "step": 246 + }, + { + "epoch": 0.030347708563705614, + "grad_norm": 2.1483915077471387, + "learning_rate": 9.103194103194103e-06, + "loss": 0.6721, + "step": 247 + }, + { + "epoch": 0.030470573780562723, + "grad_norm": 1.7973362119561997, + "learning_rate": 9.14004914004914e-06, + "loss": 0.6936, + "step": 248 + }, + { + "epoch": 0.03059343899741983, + "grad_norm": 1.810351343394265, + "learning_rate": 9.176904176904177e-06, + "loss": 0.6041, + "step": 249 + }, + { + "epoch": 0.03071630421427694, + "grad_norm": 2.110068365426576, + "learning_rate": 9.213759213759214e-06, + "loss": 0.6788, + "step": 250 + }, + { + "epoch": 0.030839169431134045, + "grad_norm": 2.122011315946288, + "learning_rate": 9.250614250614251e-06, + "loss": 0.6693, + "step": 251 + }, + { + "epoch": 0.030962034647991153, + "grad_norm": 1.9509953443662824, + "learning_rate": 9.287469287469288e-06, + "loss": 0.8107, + "step": 252 + }, + { + "epoch": 0.03108489986484826, + "grad_norm": 2.1759954795663727, + "learning_rate": 9.324324324324325e-06, + "loss": 0.6855, + "step": 253 + }, + { + "epoch": 0.03120776508170537, + "grad_norm": 1.968246155991691, + "learning_rate": 9.361179361179362e-06, + "loss": 0.6639, + "step": 254 + }, + { + "epoch": 0.03133063029856248, + "grad_norm": 1.9567072014668756, + "learning_rate": 9.398034398034399e-06, + "loss": 0.735, + "step": 255 + }, + { + "epoch": 0.03145349551541959, + "grad_norm": 1.902831366116418, + "learning_rate": 9.434889434889434e-06, + "loss": 0.6908, + "step": 256 + }, + { + "epoch": 0.031576360732276695, + "grad_norm": 2.289894777988885, + "learning_rate": 9.471744471744471e-06, + "loss": 0.8442, + "step": 257 + }, + { + "epoch": 0.0316992259491338, + "grad_norm": 2.500635044381565, + "learning_rate": 9.50859950859951e-06, + "loss": 
0.7555, + "step": 258 + }, + { + "epoch": 0.031822091165990905, + "grad_norm": 1.8382057052032, + "learning_rate": 9.545454545454545e-06, + "loss": 0.7777, + "step": 259 + }, + { + "epoch": 0.031944956382848014, + "grad_norm": 1.6402173157463737, + "learning_rate": 9.582309582309582e-06, + "loss": 0.7489, + "step": 260 + }, + { + "epoch": 0.03206782159970512, + "grad_norm": 1.9348524563006024, + "learning_rate": 9.61916461916462e-06, + "loss": 0.6943, + "step": 261 + }, + { + "epoch": 0.03219068681656223, + "grad_norm": 2.47299864257624, + "learning_rate": 9.656019656019656e-06, + "loss": 0.6839, + "step": 262 + }, + { + "epoch": 0.03231355203341934, + "grad_norm": 2.1400603176111836, + "learning_rate": 9.692874692874693e-06, + "loss": 0.8084, + "step": 263 + }, + { + "epoch": 0.03243641725027645, + "grad_norm": 1.9102694201056012, + "learning_rate": 9.72972972972973e-06, + "loss": 0.7101, + "step": 264 + }, + { + "epoch": 0.032559282467133556, + "grad_norm": 1.8135314598207293, + "learning_rate": 9.766584766584767e-06, + "loss": 0.8603, + "step": 265 + }, + { + "epoch": 0.032682147683990664, + "grad_norm": 1.780865286156098, + "learning_rate": 9.803439803439804e-06, + "loss": 0.7228, + "step": 266 + }, + { + "epoch": 0.03280501290084777, + "grad_norm": 2.0720944619287947, + "learning_rate": 9.84029484029484e-06, + "loss": 0.6952, + "step": 267 + }, + { + "epoch": 0.032927878117704874, + "grad_norm": 2.2665325688511726, + "learning_rate": 9.877149877149878e-06, + "loss": 0.864, + "step": 268 + }, + { + "epoch": 0.03305074333456198, + "grad_norm": 1.8809441165375158, + "learning_rate": 9.914004914004915e-06, + "loss": 0.7674, + "step": 269 + }, + { + "epoch": 0.03317360855141909, + "grad_norm": 1.7822626965815573, + "learning_rate": 9.950859950859952e-06, + "loss": 0.7876, + "step": 270 + }, + { + "epoch": 0.0332964737682762, + "grad_norm": 2.1108938104666266, + "learning_rate": 9.987714987714989e-06, + "loss": 0.7215, + "step": 271 + }, + { + "epoch": 0.03341933898513331, + "grad_norm": 1.9966211299175396, + "learning_rate": 1.0024570024570024e-05, + "loss": 0.7674, + "step": 272 + }, + { + "epoch": 0.033542204201990417, + "grad_norm": 2.2680348042610348, + "learning_rate": 1.0061425061425062e-05, + "loss": 0.8111, + "step": 273 + }, + { + "epoch": 0.033665069418847525, + "grad_norm": 1.9324203050010555, + "learning_rate": 1.00982800982801e-05, + "loss": 0.6647, + "step": 274 + }, + { + "epoch": 0.03378793463570463, + "grad_norm": 2.155792456819831, + "learning_rate": 1.0135135135135135e-05, + "loss": 0.6672, + "step": 275 + }, + { + "epoch": 0.03391079985256174, + "grad_norm": 2.232518991826889, + "learning_rate": 1.0171990171990172e-05, + "loss": 0.7007, + "step": 276 + }, + { + "epoch": 0.03403366506941885, + "grad_norm": 2.316018402937038, + "learning_rate": 1.020884520884521e-05, + "loss": 0.7158, + "step": 277 + }, + { + "epoch": 0.03415653028627596, + "grad_norm": 2.10917506496764, + "learning_rate": 1.0245700245700245e-05, + "loss": 0.7441, + "step": 278 + }, + { + "epoch": 0.03427939550313306, + "grad_norm": 2.121907339930492, + "learning_rate": 1.0282555282555282e-05, + "loss": 0.7577, + "step": 279 + }, + { + "epoch": 0.03440226071999017, + "grad_norm": 1.963810644603433, + "learning_rate": 1.031941031941032e-05, + "loss": 0.7163, + "step": 280 + }, + { + "epoch": 0.03452512593684728, + "grad_norm": 1.9330640317734664, + "learning_rate": 1.0356265356265356e-05, + "loss": 0.7213, + "step": 281 + }, + { + "epoch": 0.034647991153704386, + "grad_norm": 2.516811453904035, + 
"learning_rate": 1.0393120393120393e-05, + "loss": 0.8406, + "step": 282 + }, + { + "epoch": 0.034770856370561494, + "grad_norm": 1.9277517959623651, + "learning_rate": 1.042997542997543e-05, + "loss": 0.7377, + "step": 283 + }, + { + "epoch": 0.0348937215874186, + "grad_norm": 2.0099179877124667, + "learning_rate": 1.0466830466830467e-05, + "loss": 0.7708, + "step": 284 + }, + { + "epoch": 0.03501658680427571, + "grad_norm": 8.185672833271616, + "learning_rate": 1.0503685503685504e-05, + "loss": 0.7543, + "step": 285 + }, + { + "epoch": 0.03513945202113282, + "grad_norm": 2.2633720920972746, + "learning_rate": 1.0540540540540541e-05, + "loss": 0.7156, + "step": 286 + }, + { + "epoch": 0.03526231723798993, + "grad_norm": 1.8016495469574831, + "learning_rate": 1.0577395577395578e-05, + "loss": 0.7324, + "step": 287 + }, + { + "epoch": 0.035385182454847036, + "grad_norm": 1.955712121244304, + "learning_rate": 1.0614250614250613e-05, + "loss": 0.7083, + "step": 288 + }, + { + "epoch": 0.03550804767170414, + "grad_norm": 2.043471854335718, + "learning_rate": 1.0651105651105652e-05, + "loss": 0.7764, + "step": 289 + }, + { + "epoch": 0.035630912888561246, + "grad_norm": 2.0961118593081283, + "learning_rate": 1.0687960687960689e-05, + "loss": 0.6612, + "step": 290 + }, + { + "epoch": 0.035753778105418355, + "grad_norm": 2.1177177146109747, + "learning_rate": 1.0724815724815724e-05, + "loss": 0.6344, + "step": 291 + }, + { + "epoch": 0.03587664332227546, + "grad_norm": 1.9609345218613277, + "learning_rate": 1.0761670761670763e-05, + "loss": 0.7421, + "step": 292 + }, + { + "epoch": 0.03599950853913257, + "grad_norm": 1.7653847079983935, + "learning_rate": 1.07985257985258e-05, + "loss": 0.7348, + "step": 293 + }, + { + "epoch": 0.03612237375598968, + "grad_norm": 2.285989107622271, + "learning_rate": 1.0835380835380835e-05, + "loss": 0.63, + "step": 294 + }, + { + "epoch": 0.03624523897284679, + "grad_norm": 1.8304219610778436, + "learning_rate": 1.0872235872235874e-05, + "loss": 0.6845, + "step": 295 + }, + { + "epoch": 0.0363681041897039, + "grad_norm": 2.060934670201375, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.793, + "step": 296 + }, + { + "epoch": 0.036490969406561005, + "grad_norm": 2.1625614978969936, + "learning_rate": 1.0945945945945946e-05, + "loss": 0.7189, + "step": 297 + }, + { + "epoch": 0.036613834623418114, + "grad_norm": 1.8167775026327464, + "learning_rate": 1.0982800982800983e-05, + "loss": 0.7288, + "step": 298 + }, + { + "epoch": 0.036736699840275215, + "grad_norm": 1.8268824852907397, + "learning_rate": 1.101965601965602e-05, + "loss": 0.7162, + "step": 299 + }, + { + "epoch": 0.036859565057132324, + "grad_norm": 1.8856787707561768, + "learning_rate": 1.1056511056511057e-05, + "loss": 0.7354, + "step": 300 + }, + { + "epoch": 0.03698243027398943, + "grad_norm": 1.6989978896187465, + "learning_rate": 1.1093366093366094e-05, + "loss": 0.6707, + "step": 301 + }, + { + "epoch": 0.03710529549084654, + "grad_norm": 2.1523214233302137, + "learning_rate": 1.113022113022113e-05, + "loss": 0.7256, + "step": 302 + }, + { + "epoch": 0.03722816070770365, + "grad_norm": 1.7887421119223106, + "learning_rate": 1.1167076167076168e-05, + "loss": 0.7572, + "step": 303 + }, + { + "epoch": 0.03735102592456076, + "grad_norm": 1.9200546760873909, + "learning_rate": 1.1203931203931203e-05, + "loss": 0.7215, + "step": 304 + }, + { + "epoch": 0.037473891141417866, + "grad_norm": 2.004864587313459, + "learning_rate": 1.1240786240786241e-05, + "loss": 0.8387, + "step": 305 + }, + { + 
"epoch": 0.037596756358274974, + "grad_norm": 2.3253393869500907, + "learning_rate": 1.1277641277641278e-05, + "loss": 0.6206, + "step": 306 + }, + { + "epoch": 0.03771962157513208, + "grad_norm": 2.116835304787679, + "learning_rate": 1.1314496314496314e-05, + "loss": 0.6894, + "step": 307 + }, + { + "epoch": 0.03784248679198919, + "grad_norm": 2.0780156430341252, + "learning_rate": 1.1351351351351352e-05, + "loss": 0.8229, + "step": 308 + }, + { + "epoch": 0.03796535200884629, + "grad_norm": 1.9603628702683005, + "learning_rate": 1.138820638820639e-05, + "loss": 0.6491, + "step": 309 + }, + { + "epoch": 0.0380882172257034, + "grad_norm": 2.11419079720498, + "learning_rate": 1.1425061425061425e-05, + "loss": 0.7261, + "step": 310 + }, + { + "epoch": 0.03821108244256051, + "grad_norm": 2.1494880122991376, + "learning_rate": 1.1461916461916463e-05, + "loss": 0.7398, + "step": 311 + }, + { + "epoch": 0.03833394765941762, + "grad_norm": 1.8312491638881796, + "learning_rate": 1.1498771498771498e-05, + "loss": 0.7131, + "step": 312 + }, + { + "epoch": 0.038456812876274726, + "grad_norm": 1.8121074511932032, + "learning_rate": 1.1535626535626535e-05, + "loss": 0.5967, + "step": 313 + }, + { + "epoch": 0.038579678093131835, + "grad_norm": 2.262581966738921, + "learning_rate": 1.1572481572481574e-05, + "loss": 0.6846, + "step": 314 + }, + { + "epoch": 0.03870254330998894, + "grad_norm": 1.7831974722387802, + "learning_rate": 1.160933660933661e-05, + "loss": 0.6213, + "step": 315 + }, + { + "epoch": 0.03882540852684605, + "grad_norm": 1.961908226643096, + "learning_rate": 1.1646191646191646e-05, + "loss": 0.6749, + "step": 316 + }, + { + "epoch": 0.03894827374370316, + "grad_norm": 2.0666673725512044, + "learning_rate": 1.1683046683046683e-05, + "loss": 0.7427, + "step": 317 + }, + { + "epoch": 0.03907113896056027, + "grad_norm": 2.134738474981189, + "learning_rate": 1.171990171990172e-05, + "loss": 0.7518, + "step": 318 + }, + { + "epoch": 0.03919400417741737, + "grad_norm": 1.908240612441246, + "learning_rate": 1.1756756756756757e-05, + "loss": 0.7798, + "step": 319 + }, + { + "epoch": 0.03931686939427448, + "grad_norm": 1.9630236970895545, + "learning_rate": 1.1793611793611792e-05, + "loss": 0.7478, + "step": 320 + }, + { + "epoch": 0.03943973461113159, + "grad_norm": 2.20652863885511, + "learning_rate": 1.1830466830466831e-05, + "loss": 0.7791, + "step": 321 + }, + { + "epoch": 0.039562599827988695, + "grad_norm": 1.6540791915803679, + "learning_rate": 1.1867321867321868e-05, + "loss": 0.648, + "step": 322 + }, + { + "epoch": 0.039685465044845804, + "grad_norm": 2.2870473710600394, + "learning_rate": 1.1904176904176903e-05, + "loss": 0.8005, + "step": 323 + }, + { + "epoch": 0.03980833026170291, + "grad_norm": 1.924694801217767, + "learning_rate": 1.1941031941031942e-05, + "loss": 0.7746, + "step": 324 + }, + { + "epoch": 0.03993119547856002, + "grad_norm": 2.693310438219356, + "learning_rate": 1.1977886977886979e-05, + "loss": 0.828, + "step": 325 + }, + { + "epoch": 0.04005406069541713, + "grad_norm": 2.2592708719945174, + "learning_rate": 1.2014742014742014e-05, + "loss": 0.6775, + "step": 326 + }, + { + "epoch": 0.04017692591227424, + "grad_norm": 1.9458228737821566, + "learning_rate": 1.2051597051597053e-05, + "loss": 0.8071, + "step": 327 + }, + { + "epoch": 0.040299791129131346, + "grad_norm": 2.3588528991691162, + "learning_rate": 1.208845208845209e-05, + "loss": 0.7265, + "step": 328 + }, + { + "epoch": 0.04042265634598845, + "grad_norm": 1.7752142204479076, + "learning_rate": 
1.2125307125307125e-05, + "loss": 0.7747, + "step": 329 + }, + { + "epoch": 0.040545521562845556, + "grad_norm": 1.760637762857694, + "learning_rate": 1.2162162162162164e-05, + "loss": 0.6457, + "step": 330 + }, + { + "epoch": 0.040668386779702664, + "grad_norm": 1.897303196191167, + "learning_rate": 1.2199017199017199e-05, + "loss": 0.7169, + "step": 331 + }, + { + "epoch": 0.04079125199655977, + "grad_norm": 2.123365641908746, + "learning_rate": 1.2235872235872236e-05, + "loss": 0.7304, + "step": 332 + }, + { + "epoch": 0.04091411721341688, + "grad_norm": 2.1201082942021316, + "learning_rate": 1.2272727272727274e-05, + "loss": 0.7206, + "step": 333 + }, + { + "epoch": 0.04103698243027399, + "grad_norm": 1.8607948454585348, + "learning_rate": 1.230958230958231e-05, + "loss": 0.7294, + "step": 334 + }, + { + "epoch": 0.0411598476471311, + "grad_norm": 2.0653303507632246, + "learning_rate": 1.2346437346437347e-05, + "loss": 0.8098, + "step": 335 + }, + { + "epoch": 0.04128271286398821, + "grad_norm": 2.039458635676661, + "learning_rate": 1.2383292383292384e-05, + "loss": 0.8171, + "step": 336 + }, + { + "epoch": 0.041405578080845315, + "grad_norm": 2.2263423318760807, + "learning_rate": 1.242014742014742e-05, + "loss": 0.7632, + "step": 337 + }, + { + "epoch": 0.04152844329770242, + "grad_norm": 2.0263655977164428, + "learning_rate": 1.2457002457002457e-05, + "loss": 0.7282, + "step": 338 + }, + { + "epoch": 0.041651308514559525, + "grad_norm": 2.188768178536143, + "learning_rate": 1.2493857493857493e-05, + "loss": 0.6465, + "step": 339 + }, + { + "epoch": 0.04177417373141663, + "grad_norm": 2.3008615084097888, + "learning_rate": 1.2530712530712531e-05, + "loss": 0.8001, + "step": 340 + }, + { + "epoch": 0.04189703894827374, + "grad_norm": 1.8088028647021088, + "learning_rate": 1.2567567567567568e-05, + "loss": 0.772, + "step": 341 + }, + { + "epoch": 0.04201990416513085, + "grad_norm": 2.0507123754388115, + "learning_rate": 1.2604422604422604e-05, + "loss": 0.7904, + "step": 342 + }, + { + "epoch": 0.04214276938198796, + "grad_norm": 2.0910637741314133, + "learning_rate": 1.2641277641277642e-05, + "loss": 0.7083, + "step": 343 + }, + { + "epoch": 0.04226563459884507, + "grad_norm": 2.287615313324481, + "learning_rate": 1.267813267813268e-05, + "loss": 0.7979, + "step": 344 + }, + { + "epoch": 0.042388499815702176, + "grad_norm": 2.41383107129278, + "learning_rate": 1.2714987714987714e-05, + "loss": 0.6396, + "step": 345 + }, + { + "epoch": 0.042511365032559284, + "grad_norm": 2.1888168792339955, + "learning_rate": 1.2751842751842753e-05, + "loss": 0.7382, + "step": 346 + }, + { + "epoch": 0.04263423024941639, + "grad_norm": 2.0593995683781956, + "learning_rate": 1.2788697788697788e-05, + "loss": 0.642, + "step": 347 + }, + { + "epoch": 0.0427570954662735, + "grad_norm": 2.3467540239291114, + "learning_rate": 1.2825552825552825e-05, + "loss": 0.79, + "step": 348 + }, + { + "epoch": 0.0428799606831306, + "grad_norm": 1.8723984130808378, + "learning_rate": 1.2862407862407864e-05, + "loss": 0.8359, + "step": 349 + }, + { + "epoch": 0.04300282589998771, + "grad_norm": 2.0442233859536754, + "learning_rate": 1.28992628992629e-05, + "loss": 0.6335, + "step": 350 + }, + { + "epoch": 0.04312569111684482, + "grad_norm": 2.0757855428971785, + "learning_rate": 1.2936117936117936e-05, + "loss": 0.6809, + "step": 351 + }, + { + "epoch": 0.04324855633370193, + "grad_norm": 1.9886508996214847, + "learning_rate": 1.2972972972972975e-05, + "loss": 0.6729, + "step": 352 + }, + { + "epoch": 
0.043371421550559036, + "grad_norm": 1.8690157255382642, + "learning_rate": 1.300982800982801e-05, + "loss": 0.9208, + "step": 353 + }, + { + "epoch": 0.043494286767416145, + "grad_norm": 2.3663309259170067, + "learning_rate": 1.3046683046683047e-05, + "loss": 0.5853, + "step": 354 + }, + { + "epoch": 0.04361715198427325, + "grad_norm": 2.1603833441367897, + "learning_rate": 1.3083538083538084e-05, + "loss": 0.6757, + "step": 355 + }, + { + "epoch": 0.04374001720113036, + "grad_norm": 2.593079436867142, + "learning_rate": 1.3120393120393121e-05, + "loss": 0.8396, + "step": 356 + }, + { + "epoch": 0.04386288241798747, + "grad_norm": 2.1071705131020746, + "learning_rate": 1.3157248157248158e-05, + "loss": 0.6493, + "step": 357 + }, + { + "epoch": 0.04398574763484458, + "grad_norm": 1.7589144967274257, + "learning_rate": 1.3194103194103193e-05, + "loss": 0.7332, + "step": 358 + }, + { + "epoch": 0.04410861285170168, + "grad_norm": 2.079967163095207, + "learning_rate": 1.3230958230958232e-05, + "loss": 0.6664, + "step": 359 + }, + { + "epoch": 0.04423147806855879, + "grad_norm": 2.4070186884460325, + "learning_rate": 1.3267813267813269e-05, + "loss": 0.8518, + "step": 360 + }, + { + "epoch": 0.0443543432854159, + "grad_norm": 2.2911434092266294, + "learning_rate": 1.3304668304668304e-05, + "loss": 0.8158, + "step": 361 + }, + { + "epoch": 0.044477208502273005, + "grad_norm": 2.0789129806858426, + "learning_rate": 1.3341523341523343e-05, + "loss": 0.6302, + "step": 362 + }, + { + "epoch": 0.044600073719130114, + "grad_norm": 1.8567447276652316, + "learning_rate": 1.3378378378378378e-05, + "loss": 0.8159, + "step": 363 + }, + { + "epoch": 0.04472293893598722, + "grad_norm": 1.9129061577035584, + "learning_rate": 1.3415233415233415e-05, + "loss": 0.6508, + "step": 364 + }, + { + "epoch": 0.04484580415284433, + "grad_norm": 1.9061631707224485, + "learning_rate": 1.3452088452088453e-05, + "loss": 0.8036, + "step": 365 + }, + { + "epoch": 0.04496866936970144, + "grad_norm": 1.9077811837263354, + "learning_rate": 1.3488943488943489e-05, + "loss": 0.7048, + "step": 366 + }, + { + "epoch": 0.04509153458655855, + "grad_norm": 1.8799413471907032, + "learning_rate": 1.3525798525798526e-05, + "loss": 0.6892, + "step": 367 + }, + { + "epoch": 0.045214399803415656, + "grad_norm": 1.9104689599607951, + "learning_rate": 1.3562653562653564e-05, + "loss": 0.6501, + "step": 368 + }, + { + "epoch": 0.045337265020272764, + "grad_norm": 1.7561402484592374, + "learning_rate": 1.35995085995086e-05, + "loss": 0.5894, + "step": 369 + }, + { + "epoch": 0.045460130237129866, + "grad_norm": 2.0214912839976273, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.6932, + "step": 370 + }, + { + "epoch": 0.045582995453986974, + "grad_norm": 2.2514776078535133, + "learning_rate": 1.3673218673218674e-05, + "loss": 0.7486, + "step": 371 + }, + { + "epoch": 0.04570586067084408, + "grad_norm": 1.832245316182979, + "learning_rate": 1.371007371007371e-05, + "loss": 0.7485, + "step": 372 + }, + { + "epoch": 0.04582872588770119, + "grad_norm": 1.865490268319536, + "learning_rate": 1.3746928746928747e-05, + "loss": 0.6679, + "step": 373 + }, + { + "epoch": 0.0459515911045583, + "grad_norm": 1.7623318009343716, + "learning_rate": 1.3783783783783784e-05, + "loss": 0.6638, + "step": 374 + }, + { + "epoch": 0.04607445632141541, + "grad_norm": 2.1447377121825575, + "learning_rate": 1.3820638820638821e-05, + "loss": 0.8204, + "step": 375 + }, + { + "epoch": 0.046197321538272516, + "grad_norm": 1.8155383502416222, + "learning_rate": 
1.3857493857493858e-05, + "loss": 0.5779, + "step": 376 + }, + { + "epoch": 0.046320186755129625, + "grad_norm": 1.8188662077903164, + "learning_rate": 1.3894348894348894e-05, + "loss": 0.745, + "step": 377 + }, + { + "epoch": 0.04644305197198673, + "grad_norm": 1.7094151744991495, + "learning_rate": 1.3931203931203932e-05, + "loss": 0.7748, + "step": 378 + }, + { + "epoch": 0.04656591718884384, + "grad_norm": 2.115932179131684, + "learning_rate": 1.3968058968058967e-05, + "loss": 0.7132, + "step": 379 + }, + { + "epoch": 0.04668878240570094, + "grad_norm": 1.967624065092432, + "learning_rate": 1.4004914004914004e-05, + "loss": 0.6183, + "step": 380 + }, + { + "epoch": 0.04681164762255805, + "grad_norm": 2.7078433565670705, + "learning_rate": 1.4041769041769043e-05, + "loss": 0.6663, + "step": 381 + }, + { + "epoch": 0.04693451283941516, + "grad_norm": 2.383019734241531, + "learning_rate": 1.4078624078624078e-05, + "loss": 0.7197, + "step": 382 + }, + { + "epoch": 0.04705737805627227, + "grad_norm": 1.9527342939061616, + "learning_rate": 1.4115479115479115e-05, + "loss": 0.7659, + "step": 383 + }, + { + "epoch": 0.04718024327312938, + "grad_norm": 1.8797841661896242, + "learning_rate": 1.4152334152334154e-05, + "loss": 0.6423, + "step": 384 + }, + { + "epoch": 0.047303108489986485, + "grad_norm": 2.125027825620537, + "learning_rate": 1.4189189189189189e-05, + "loss": 0.6688, + "step": 385 + }, + { + "epoch": 0.047425973706843594, + "grad_norm": 1.9643590322280289, + "learning_rate": 1.4226044226044226e-05, + "loss": 0.7397, + "step": 386 + }, + { + "epoch": 0.0475488389237007, + "grad_norm": 1.927905866289698, + "learning_rate": 1.4262899262899263e-05, + "loss": 0.7264, + "step": 387 + }, + { + "epoch": 0.04767170414055781, + "grad_norm": 2.0437524067850372, + "learning_rate": 1.42997542997543e-05, + "loss": 0.7254, + "step": 388 + }, + { + "epoch": 0.04779456935741492, + "grad_norm": 2.132315872365481, + "learning_rate": 1.4336609336609337e-05, + "loss": 0.7217, + "step": 389 + }, + { + "epoch": 0.04791743457427202, + "grad_norm": 1.7906031709295303, + "learning_rate": 1.4373464373464374e-05, + "loss": 0.7334, + "step": 390 + }, + { + "epoch": 0.04804029979112913, + "grad_norm": 1.9015139910966756, + "learning_rate": 1.441031941031941e-05, + "loss": 0.7007, + "step": 391 + }, + { + "epoch": 0.04816316500798624, + "grad_norm": 2.5100527179759458, + "learning_rate": 1.4447174447174448e-05, + "loss": 0.7695, + "step": 392 + }, + { + "epoch": 0.048286030224843346, + "grad_norm": 2.5869255787425933, + "learning_rate": 1.4484029484029485e-05, + "loss": 0.696, + "step": 393 + }, + { + "epoch": 0.048408895441700454, + "grad_norm": 1.9841743033252863, + "learning_rate": 1.4520884520884522e-05, + "loss": 0.6753, + "step": 394 + }, + { + "epoch": 0.04853176065855756, + "grad_norm": 2.305836010745212, + "learning_rate": 1.4557739557739557e-05, + "loss": 0.721, + "step": 395 + }, + { + "epoch": 0.04865462587541467, + "grad_norm": 1.8166300558576498, + "learning_rate": 1.4594594594594596e-05, + "loss": 0.5741, + "step": 396 + }, + { + "epoch": 0.04877749109227178, + "grad_norm": 1.9774537134246464, + "learning_rate": 1.4631449631449633e-05, + "loss": 0.7166, + "step": 397 + }, + { + "epoch": 0.04890035630912889, + "grad_norm": 2.1403413408485625, + "learning_rate": 1.4668304668304668e-05, + "loss": 0.6792, + "step": 398 + }, + { + "epoch": 0.049023221525986, + "grad_norm": 1.7427602430512399, + "learning_rate": 1.4705159705159705e-05, + "loss": 0.616, + "step": 399 + }, + { + "epoch": 
0.0491460867428431, + "grad_norm": 1.7606644580173563, + "learning_rate": 1.4742014742014743e-05, + "loss": 0.6295, + "step": 400 + }, + { + "epoch": 0.049268951959700207, + "grad_norm": 2.0917837745880767, + "learning_rate": 1.4778869778869779e-05, + "loss": 0.664, + "step": 401 + }, + { + "epoch": 0.049391817176557315, + "grad_norm": 1.5686558691995127, + "learning_rate": 1.4815724815724816e-05, + "loss": 0.7115, + "step": 402 + }, + { + "epoch": 0.04951468239341442, + "grad_norm": 2.0325770006891517, + "learning_rate": 1.4852579852579853e-05, + "loss": 0.6965, + "step": 403 + }, + { + "epoch": 0.04963754761027153, + "grad_norm": 2.1122779300572403, + "learning_rate": 1.488943488943489e-05, + "loss": 0.7723, + "step": 404 + }, + { + "epoch": 0.04976041282712864, + "grad_norm": 2.024207450354362, + "learning_rate": 1.4926289926289926e-05, + "loss": 0.7147, + "step": 405 + }, + { + "epoch": 0.04988327804398575, + "grad_norm": 1.9402270982175134, + "learning_rate": 1.4963144963144963e-05, + "loss": 0.7117, + "step": 406 + }, + { + "epoch": 0.05000614326084286, + "grad_norm": 2.0760149527073546, + "learning_rate": 1.5e-05, + "loss": 0.6795, + "step": 407 + }, + { + "epoch": 0.050129008477699966, + "grad_norm": 1.7639706254743934, + "learning_rate": 1.5036855036855039e-05, + "loss": 0.6619, + "step": 408 + }, + { + "epoch": 0.050251873694557074, + "grad_norm": 2.01214156299943, + "learning_rate": 1.5073710073710073e-05, + "loss": 0.6675, + "step": 409 + }, + { + "epoch": 0.050374738911414176, + "grad_norm": 2.1951937491314624, + "learning_rate": 1.5110565110565111e-05, + "loss": 0.6106, + "step": 410 + }, + { + "epoch": 0.050497604128271284, + "grad_norm": 2.137537365681502, + "learning_rate": 1.5147420147420148e-05, + "loss": 0.6583, + "step": 411 + }, + { + "epoch": 0.05062046934512839, + "grad_norm": 1.6279763403578074, + "learning_rate": 1.5184275184275183e-05, + "loss": 0.6736, + "step": 412 + }, + { + "epoch": 0.0507433345619855, + "grad_norm": 2.0399234379080937, + "learning_rate": 1.5221130221130222e-05, + "loss": 0.657, + "step": 413 + }, + { + "epoch": 0.05086619977884261, + "grad_norm": 1.762652647762199, + "learning_rate": 1.5257985257985259e-05, + "loss": 0.7263, + "step": 414 + }, + { + "epoch": 0.05098906499569972, + "grad_norm": 2.1585960742636083, + "learning_rate": 1.5294840294840294e-05, + "loss": 0.7437, + "step": 415 + }, + { + "epoch": 0.051111930212556826, + "grad_norm": 2.1321093878006363, + "learning_rate": 1.533169533169533e-05, + "loss": 0.7344, + "step": 416 + }, + { + "epoch": 0.051234795429413935, + "grad_norm": 2.3576870722535332, + "learning_rate": 1.536855036855037e-05, + "loss": 0.6276, + "step": 417 + }, + { + "epoch": 0.05135766064627104, + "grad_norm": 2.045082346913435, + "learning_rate": 1.5405405405405405e-05, + "loss": 0.628, + "step": 418 + }, + { + "epoch": 0.05148052586312815, + "grad_norm": 2.0759652553732977, + "learning_rate": 1.5442260442260442e-05, + "loss": 0.7324, + "step": 419 + }, + { + "epoch": 0.05160339107998525, + "grad_norm": 2.0412784329789875, + "learning_rate": 1.5479115479115482e-05, + "loss": 0.7996, + "step": 420 + }, + { + "epoch": 0.05172625629684236, + "grad_norm": 2.0913478248770536, + "learning_rate": 1.5515970515970516e-05, + "loss": 0.8609, + "step": 421 + }, + { + "epoch": 0.05184912151369947, + "grad_norm": 2.1371852509139626, + "learning_rate": 1.5552825552825553e-05, + "loss": 0.7313, + "step": 422 + }, + { + "epoch": 0.05197198673055658, + "grad_norm": 1.9442557050208458, + "learning_rate": 1.5589680589680593e-05, + 
"loss": 0.7469, + "step": 423 + }, + { + "epoch": 0.05209485194741369, + "grad_norm": 2.0892299790540068, + "learning_rate": 1.5626535626535627e-05, + "loss": 0.8119, + "step": 424 + }, + { + "epoch": 0.052217717164270795, + "grad_norm": 1.894552826160603, + "learning_rate": 1.5663390663390664e-05, + "loss": 0.6871, + "step": 425 + }, + { + "epoch": 0.052340582381127904, + "grad_norm": 1.8760943944084383, + "learning_rate": 1.57002457002457e-05, + "loss": 0.6326, + "step": 426 + }, + { + "epoch": 0.05246344759798501, + "grad_norm": 1.9440464350855404, + "learning_rate": 1.5737100737100738e-05, + "loss": 0.691, + "step": 427 + }, + { + "epoch": 0.05258631281484212, + "grad_norm": 2.3512172709301478, + "learning_rate": 1.5773955773955775e-05, + "loss": 0.7162, + "step": 428 + }, + { + "epoch": 0.05270917803169923, + "grad_norm": 2.2186890420809044, + "learning_rate": 1.5810810810810808e-05, + "loss": 0.6602, + "step": 429 + }, + { + "epoch": 0.05283204324855633, + "grad_norm": 1.7273211263309867, + "learning_rate": 1.584766584766585e-05, + "loss": 0.611, + "step": 430 + }, + { + "epoch": 0.05295490846541344, + "grad_norm": 1.7475577087129792, + "learning_rate": 1.5884520884520886e-05, + "loss": 0.6745, + "step": 431 + }, + { + "epoch": 0.05307777368227055, + "grad_norm": 2.0917120051655895, + "learning_rate": 1.592137592137592e-05, + "loss": 0.7538, + "step": 432 + }, + { + "epoch": 0.053200638899127656, + "grad_norm": 1.9707603548464119, + "learning_rate": 1.595823095823096e-05, + "loss": 0.7631, + "step": 433 + }, + { + "epoch": 0.053323504115984764, + "grad_norm": 1.8886966765714708, + "learning_rate": 1.5995085995085996e-05, + "loss": 0.8089, + "step": 434 + }, + { + "epoch": 0.05344636933284187, + "grad_norm": 2.1497817469515303, + "learning_rate": 1.603194103194103e-05, + "loss": 0.7558, + "step": 435 + }, + { + "epoch": 0.05356923454969898, + "grad_norm": 1.753946917007456, + "learning_rate": 1.606879606879607e-05, + "loss": 0.7652, + "step": 436 + }, + { + "epoch": 0.05369209976655609, + "grad_norm": 2.3976013266198444, + "learning_rate": 1.6105651105651107e-05, + "loss": 0.829, + "step": 437 + }, + { + "epoch": 0.0538149649834132, + "grad_norm": 2.094305826968303, + "learning_rate": 1.614250614250614e-05, + "loss": 0.7637, + "step": 438 + }, + { + "epoch": 0.053937830200270306, + "grad_norm": 1.999097283816911, + "learning_rate": 1.617936117936118e-05, + "loss": 0.6849, + "step": 439 + }, + { + "epoch": 0.05406069541712741, + "grad_norm": 1.9289062579404344, + "learning_rate": 1.6216216216216218e-05, + "loss": 0.6984, + "step": 440 + }, + { + "epoch": 0.054183560633984516, + "grad_norm": 1.9180958313981633, + "learning_rate": 1.625307125307125e-05, + "loss": 0.7515, + "step": 441 + }, + { + "epoch": 0.054306425850841625, + "grad_norm": 2.342090669604325, + "learning_rate": 1.628992628992629e-05, + "loss": 0.9044, + "step": 442 + }, + { + "epoch": 0.05442929106769873, + "grad_norm": 1.9260463456889936, + "learning_rate": 1.632678132678133e-05, + "loss": 0.7339, + "step": 443 + }, + { + "epoch": 0.05455215628455584, + "grad_norm": 2.264591673688307, + "learning_rate": 1.6363636363636363e-05, + "loss": 0.6785, + "step": 444 + }, + { + "epoch": 0.05467502150141295, + "grad_norm": 1.8200201074063862, + "learning_rate": 1.64004914004914e-05, + "loss": 0.6844, + "step": 445 + }, + { + "epoch": 0.05479788671827006, + "grad_norm": 1.9228555398635145, + "learning_rate": 1.643734643734644e-05, + "loss": 0.6536, + "step": 446 + }, + { + "epoch": 0.05492075193512717, + "grad_norm": 
1.9566323134338655, + "learning_rate": 1.6474201474201473e-05, + "loss": 0.6743, + "step": 447 + }, + { + "epoch": 0.055043617151984275, + "grad_norm": 1.951599429963938, + "learning_rate": 1.651105651105651e-05, + "loss": 0.7161, + "step": 448 + }, + { + "epoch": 0.055166482368841384, + "grad_norm": 2.765309959326327, + "learning_rate": 1.654791154791155e-05, + "loss": 0.8251, + "step": 449 + }, + { + "epoch": 0.055289347585698485, + "grad_norm": 2.000189428560847, + "learning_rate": 1.6584766584766584e-05, + "loss": 0.6055, + "step": 450 + }, + { + "epoch": 0.055412212802555594, + "grad_norm": 2.1388224918442487, + "learning_rate": 1.662162162162162e-05, + "loss": 0.6275, + "step": 451 + }, + { + "epoch": 0.0555350780194127, + "grad_norm": 2.1359160568077433, + "learning_rate": 1.665847665847666e-05, + "loss": 0.7841, + "step": 452 + }, + { + "epoch": 0.05565794323626981, + "grad_norm": 1.7137154663549354, + "learning_rate": 1.6695331695331695e-05, + "loss": 0.7752, + "step": 453 + }, + { + "epoch": 0.05578080845312692, + "grad_norm": 2.058366152637004, + "learning_rate": 1.6732186732186732e-05, + "loss": 0.7866, + "step": 454 + }, + { + "epoch": 0.05590367366998403, + "grad_norm": 1.6755049597023677, + "learning_rate": 1.6769041769041772e-05, + "loss": 0.6936, + "step": 455 + }, + { + "epoch": 0.056026538886841136, + "grad_norm": 1.8901714723589986, + "learning_rate": 1.6805896805896806e-05, + "loss": 0.8164, + "step": 456 + }, + { + "epoch": 0.056149404103698244, + "grad_norm": 2.436826233660747, + "learning_rate": 1.6842751842751843e-05, + "loss": 0.8378, + "step": 457 + }, + { + "epoch": 0.05627226932055535, + "grad_norm": 2.041833619280879, + "learning_rate": 1.687960687960688e-05, + "loss": 0.6302, + "step": 458 + }, + { + "epoch": 0.05639513453741246, + "grad_norm": 2.0964824886227826, + "learning_rate": 1.6916461916461917e-05, + "loss": 0.7129, + "step": 459 + }, + { + "epoch": 0.05651799975426957, + "grad_norm": 2.270597388688162, + "learning_rate": 1.6953316953316954e-05, + "loss": 0.7128, + "step": 460 + }, + { + "epoch": 0.05664086497112667, + "grad_norm": 1.8660246822262052, + "learning_rate": 1.699017199017199e-05, + "loss": 0.777, + "step": 461 + }, + { + "epoch": 0.05676373018798378, + "grad_norm": 2.3782733506498053, + "learning_rate": 1.7027027027027028e-05, + "loss": 0.6615, + "step": 462 + }, + { + "epoch": 0.05688659540484089, + "grad_norm": 2.1442928870900735, + "learning_rate": 1.7063882063882065e-05, + "loss": 0.6766, + "step": 463 + }, + { + "epoch": 0.057009460621698, + "grad_norm": 2.071838223594134, + "learning_rate": 1.71007371007371e-05, + "loss": 0.6852, + "step": 464 + }, + { + "epoch": 0.057132325838555105, + "grad_norm": 1.8461221994993846, + "learning_rate": 1.713759213759214e-05, + "loss": 0.7903, + "step": 465 + }, + { + "epoch": 0.05725519105541221, + "grad_norm": 2.087416006304511, + "learning_rate": 1.7174447174447175e-05, + "loss": 0.5906, + "step": 466 + }, + { + "epoch": 0.05737805627226932, + "grad_norm": 1.907362834207086, + "learning_rate": 1.7211302211302212e-05, + "loss": 0.6855, + "step": 467 + }, + { + "epoch": 0.05750092148912643, + "grad_norm": 1.724123373800819, + "learning_rate": 1.724815724815725e-05, + "loss": 0.6707, + "step": 468 + }, + { + "epoch": 0.05762378670598354, + "grad_norm": 2.3223462639745933, + "learning_rate": 1.7285012285012286e-05, + "loss": 0.8163, + "step": 469 + }, + { + "epoch": 0.05774665192284065, + "grad_norm": 2.0821310428379243, + "learning_rate": 1.732186732186732e-05, + "loss": 0.812, + "step": 470 + }, + 
{ + "epoch": 0.05786951713969775, + "grad_norm": 1.9926787966090476, + "learning_rate": 1.735872235872236e-05, + "loss": 0.743, + "step": 471 + }, + { + "epoch": 0.05799238235655486, + "grad_norm": 2.4976507545630655, + "learning_rate": 1.7395577395577397e-05, + "loss": 0.7529, + "step": 472 + }, + { + "epoch": 0.058115247573411966, + "grad_norm": 1.861992054403512, + "learning_rate": 1.743243243243243e-05, + "loss": 0.6764, + "step": 473 + }, + { + "epoch": 0.058238112790269074, + "grad_norm": 1.9025046576544518, + "learning_rate": 1.7469287469287468e-05, + "loss": 0.7604, + "step": 474 + }, + { + "epoch": 0.05836097800712618, + "grad_norm": 1.940748792513558, + "learning_rate": 1.7506142506142508e-05, + "loss": 0.6984, + "step": 475 + }, + { + "epoch": 0.05848384322398329, + "grad_norm": 1.9135095767707437, + "learning_rate": 1.754299754299754e-05, + "loss": 0.7756, + "step": 476 + }, + { + "epoch": 0.0586067084408404, + "grad_norm": 1.7726204957651555, + "learning_rate": 1.757985257985258e-05, + "loss": 0.6183, + "step": 477 + }, + { + "epoch": 0.05872957365769751, + "grad_norm": 1.8367505216838214, + "learning_rate": 1.761670761670762e-05, + "loss": 0.7494, + "step": 478 + }, + { + "epoch": 0.058852438874554616, + "grad_norm": 1.9270152495767996, + "learning_rate": 1.7653562653562652e-05, + "loss": 0.7208, + "step": 479 + }, + { + "epoch": 0.058975304091411725, + "grad_norm": 1.8548260381135355, + "learning_rate": 1.769041769041769e-05, + "loss": 0.651, + "step": 480 + }, + { + "epoch": 0.059098169308268826, + "grad_norm": 1.8625274834460512, + "learning_rate": 1.772727272727273e-05, + "loss": 0.7101, + "step": 481 + }, + { + "epoch": 0.059221034525125935, + "grad_norm": 1.8664744836340494, + "learning_rate": 1.7764127764127763e-05, + "loss": 0.6675, + "step": 482 + }, + { + "epoch": 0.05934389974198304, + "grad_norm": 2.2907973493667972, + "learning_rate": 1.78009828009828e-05, + "loss": 0.6579, + "step": 483 + }, + { + "epoch": 0.05946676495884015, + "grad_norm": 2.2143074581627595, + "learning_rate": 1.783783783783784e-05, + "loss": 0.7164, + "step": 484 + }, + { + "epoch": 0.05958963017569726, + "grad_norm": 2.020856206527743, + "learning_rate": 1.7874692874692874e-05, + "loss": 0.8205, + "step": 485 + }, + { + "epoch": 0.05971249539255437, + "grad_norm": 2.071605012177533, + "learning_rate": 1.791154791154791e-05, + "loss": 0.8685, + "step": 486 + }, + { + "epoch": 0.05983536060941148, + "grad_norm": 2.0374807606539322, + "learning_rate": 1.794840294840295e-05, + "loss": 0.7829, + "step": 487 + }, + { + "epoch": 0.059958225826268585, + "grad_norm": 2.173190255603572, + "learning_rate": 1.7985257985257985e-05, + "loss": 0.6843, + "step": 488 + }, + { + "epoch": 0.060081091043125694, + "grad_norm": 1.8273034307139202, + "learning_rate": 1.8022113022113022e-05, + "loss": 0.5978, + "step": 489 + }, + { + "epoch": 0.0602039562599828, + "grad_norm": 2.3089326187934947, + "learning_rate": 1.805896805896806e-05, + "loss": 0.7468, + "step": 490 + }, + { + "epoch": 0.060326821476839904, + "grad_norm": 2.4203853051172737, + "learning_rate": 1.8095823095823096e-05, + "loss": 0.7678, + "step": 491 + }, + { + "epoch": 0.06044968669369701, + "grad_norm": 2.373258334919009, + "learning_rate": 1.8132678132678133e-05, + "loss": 0.6623, + "step": 492 + }, + { + "epoch": 0.06057255191055412, + "grad_norm": 2.3414591829037317, + "learning_rate": 1.816953316953317e-05, + "loss": 0.873, + "step": 493 + }, + { + "epoch": 0.06069541712741123, + "grad_norm": 2.2740576417694474, + "learning_rate": 
1.8206388206388207e-05, + "loss": 0.7079, + "step": 494 + }, + { + "epoch": 0.06081828234426834, + "grad_norm": 1.9565234393312163, + "learning_rate": 1.8243243243243244e-05, + "loss": 0.7624, + "step": 495 + }, + { + "epoch": 0.060941147561125446, + "grad_norm": 1.911531059391597, + "learning_rate": 1.828009828009828e-05, + "loss": 0.7939, + "step": 496 + }, + { + "epoch": 0.061064012777982554, + "grad_norm": 2.342224818221533, + "learning_rate": 1.8316953316953318e-05, + "loss": 0.7048, + "step": 497 + }, + { + "epoch": 0.06118687799483966, + "grad_norm": 2.171000784560076, + "learning_rate": 1.8353808353808355e-05, + "loss": 0.7169, + "step": 498 + }, + { + "epoch": 0.06130974321169677, + "grad_norm": 1.9958230958011038, + "learning_rate": 1.839066339066339e-05, + "loss": 0.6634, + "step": 499 + }, + { + "epoch": 0.06143260842855388, + "grad_norm": 2.1508795768885918, + "learning_rate": 1.842751842751843e-05, + "loss": 0.7765, + "step": 500 + }, + { + "epoch": 0.06155547364541098, + "grad_norm": 1.7283308921706135, + "learning_rate": 1.8464373464373465e-05, + "loss": 0.6258, + "step": 501 + }, + { + "epoch": 0.06167833886226809, + "grad_norm": 2.128066794176502, + "learning_rate": 1.8501228501228502e-05, + "loss": 0.5821, + "step": 502 + }, + { + "epoch": 0.0618012040791252, + "grad_norm": 2.3709256991990073, + "learning_rate": 1.853808353808354e-05, + "loss": 0.7189, + "step": 503 + }, + { + "epoch": 0.061924069295982306, + "grad_norm": 1.9808177177514754, + "learning_rate": 1.8574938574938576e-05, + "loss": 0.6448, + "step": 504 + }, + { + "epoch": 0.062046934512839415, + "grad_norm": 1.8464843298364157, + "learning_rate": 1.8611793611793613e-05, + "loss": 0.7335, + "step": 505 + }, + { + "epoch": 0.06216979972969652, + "grad_norm": 2.435547905533497, + "learning_rate": 1.864864864864865e-05, + "loss": 0.6962, + "step": 506 + }, + { + "epoch": 0.06229266494655363, + "grad_norm": 2.1373853240562974, + "learning_rate": 1.8685503685503687e-05, + "loss": 0.689, + "step": 507 + }, + { + "epoch": 0.06241553016341074, + "grad_norm": 2.0645155829860347, + "learning_rate": 1.8722358722358724e-05, + "loss": 0.6639, + "step": 508 + }, + { + "epoch": 0.06253839538026784, + "grad_norm": 1.7982090061191802, + "learning_rate": 1.8759213759213758e-05, + "loss": 0.6843, + "step": 509 + }, + { + "epoch": 0.06266126059712496, + "grad_norm": 1.9194479081781124, + "learning_rate": 1.8796068796068798e-05, + "loss": 0.7175, + "step": 510 + }, + { + "epoch": 0.06278412581398206, + "grad_norm": 2.1185243464470807, + "learning_rate": 1.883292383292383e-05, + "loss": 0.6509, + "step": 511 + }, + { + "epoch": 0.06290699103083917, + "grad_norm": 1.997360129445677, + "learning_rate": 1.886977886977887e-05, + "loss": 0.6824, + "step": 512 + }, + { + "epoch": 0.06302985624769628, + "grad_norm": 1.9097457072771375, + "learning_rate": 1.890663390663391e-05, + "loss": 0.7307, + "step": 513 + }, + { + "epoch": 0.06315272146455339, + "grad_norm": 1.7515420351212456, + "learning_rate": 1.8943488943488942e-05, + "loss": 0.7153, + "step": 514 + }, + { + "epoch": 0.06327558668141049, + "grad_norm": 2.1644615273116723, + "learning_rate": 1.898034398034398e-05, + "loss": 0.7089, + "step": 515 + }, + { + "epoch": 0.0633984518982676, + "grad_norm": 2.2608971652179526, + "learning_rate": 1.901719901719902e-05, + "loss": 0.6137, + "step": 516 + }, + { + "epoch": 0.06352131711512471, + "grad_norm": 2.077349988760777, + "learning_rate": 1.9054054054054053e-05, + "loss": 0.662, + "step": 517 + }, + { + "epoch": 0.06364418233198181, + 
"grad_norm": 1.7576926947213254, + "learning_rate": 1.909090909090909e-05, + "loss": 0.8567, + "step": 518 + }, + { + "epoch": 0.06376704754883893, + "grad_norm": 1.9833695154140059, + "learning_rate": 1.912776412776413e-05, + "loss": 0.7427, + "step": 519 + }, + { + "epoch": 0.06388991276569603, + "grad_norm": 2.3638498545522872, + "learning_rate": 1.9164619164619164e-05, + "loss": 0.7018, + "step": 520 + }, + { + "epoch": 0.06401277798255314, + "grad_norm": 2.1441800542079155, + "learning_rate": 1.92014742014742e-05, + "loss": 0.791, + "step": 521 + }, + { + "epoch": 0.06413564319941024, + "grad_norm": 2.1206188217070996, + "learning_rate": 1.923832923832924e-05, + "loss": 0.7062, + "step": 522 + }, + { + "epoch": 0.06425850841626736, + "grad_norm": 2.51121776362489, + "learning_rate": 1.9275184275184275e-05, + "loss": 0.7154, + "step": 523 + }, + { + "epoch": 0.06438137363312446, + "grad_norm": 1.897917368444447, + "learning_rate": 1.9312039312039312e-05, + "loss": 0.754, + "step": 524 + }, + { + "epoch": 0.06450423884998158, + "grad_norm": 1.9121804922393901, + "learning_rate": 1.934889434889435e-05, + "loss": 0.7304, + "step": 525 + }, + { + "epoch": 0.06462710406683868, + "grad_norm": 2.159825258338104, + "learning_rate": 1.9385749385749386e-05, + "loss": 0.7749, + "step": 526 + }, + { + "epoch": 0.06474996928369578, + "grad_norm": 2.19956708736866, + "learning_rate": 1.9422604422604423e-05, + "loss": 0.6687, + "step": 527 + }, + { + "epoch": 0.0648728345005529, + "grad_norm": 1.9694374157804415, + "learning_rate": 1.945945945945946e-05, + "loss": 0.6652, + "step": 528 + }, + { + "epoch": 0.06499569971741, + "grad_norm": 1.853260335430941, + "learning_rate": 1.9496314496314497e-05, + "loss": 0.5907, + "step": 529 + }, + { + "epoch": 0.06511856493426711, + "grad_norm": 1.8819944016719397, + "learning_rate": 1.9533169533169534e-05, + "loss": 0.5258, + "step": 530 + }, + { + "epoch": 0.06524143015112421, + "grad_norm": 1.9802728909732878, + "learning_rate": 1.957002457002457e-05, + "loss": 0.7492, + "step": 531 + }, + { + "epoch": 0.06536429536798133, + "grad_norm": 2.022593593600987, + "learning_rate": 1.9606879606879607e-05, + "loss": 0.6469, + "step": 532 + }, + { + "epoch": 0.06548716058483843, + "grad_norm": 1.7771327141137787, + "learning_rate": 1.9643734643734644e-05, + "loss": 0.7068, + "step": 533 + }, + { + "epoch": 0.06561002580169555, + "grad_norm": 1.852581214762114, + "learning_rate": 1.968058968058968e-05, + "loss": 0.8058, + "step": 534 + }, + { + "epoch": 0.06573289101855265, + "grad_norm": 1.6376101438135378, + "learning_rate": 1.971744471744472e-05, + "loss": 0.6492, + "step": 535 + }, + { + "epoch": 0.06585575623540975, + "grad_norm": 1.9951664331083137, + "learning_rate": 1.9754299754299755e-05, + "loss": 0.6434, + "step": 536 + }, + { + "epoch": 0.06597862145226686, + "grad_norm": 1.9022173503469637, + "learning_rate": 1.9791154791154792e-05, + "loss": 0.6672, + "step": 537 + }, + { + "epoch": 0.06610148666912397, + "grad_norm": 1.903943702161234, + "learning_rate": 1.982800982800983e-05, + "loss": 0.7301, + "step": 538 + }, + { + "epoch": 0.06622435188598108, + "grad_norm": 2.0375471489032475, + "learning_rate": 1.9864864864864866e-05, + "loss": 0.8768, + "step": 539 + }, + { + "epoch": 0.06634721710283818, + "grad_norm": 2.1440402634584235, + "learning_rate": 1.9901719901719903e-05, + "loss": 0.7016, + "step": 540 + }, + { + "epoch": 0.0664700823196953, + "grad_norm": 2.1344561936114506, + "learning_rate": 1.9938574938574937e-05, + "loss": 0.6984, + "step": 541 + 
}, + { + "epoch": 0.0665929475365524, + "grad_norm": 1.9382366230567107, + "learning_rate": 1.9975429975429977e-05, + "loss": 0.6456, + "step": 542 + }, + { + "epoch": 0.06671581275340951, + "grad_norm": 2.008008369586257, + "learning_rate": 2.0012285012285014e-05, + "loss": 0.6301, + "step": 543 + }, + { + "epoch": 0.06683867797026662, + "grad_norm": 2.6740831380919077, + "learning_rate": 2.0049140049140048e-05, + "loss": 0.689, + "step": 544 + }, + { + "epoch": 0.06696154318712373, + "grad_norm": 2.339770686935505, + "learning_rate": 2.0085995085995088e-05, + "loss": 0.8015, + "step": 545 + }, + { + "epoch": 0.06708440840398083, + "grad_norm": 3.202080304866746, + "learning_rate": 2.0122850122850125e-05, + "loss": 0.7726, + "step": 546 + }, + { + "epoch": 0.06720727362083793, + "grad_norm": 1.9620041088047253, + "learning_rate": 2.015970515970516e-05, + "loss": 0.7309, + "step": 547 + }, + { + "epoch": 0.06733013883769505, + "grad_norm": 1.8631079564208417, + "learning_rate": 2.01965601965602e-05, + "loss": 0.6778, + "step": 548 + }, + { + "epoch": 0.06745300405455215, + "grad_norm": 1.654514401179753, + "learning_rate": 2.0233415233415236e-05, + "loss": 0.6988, + "step": 549 + }, + { + "epoch": 0.06757586927140927, + "grad_norm": 2.030227879798917, + "learning_rate": 2.027027027027027e-05, + "loss": 0.6121, + "step": 550 + }, + { + "epoch": 0.06769873448826637, + "grad_norm": 1.9600033086491733, + "learning_rate": 2.030712530712531e-05, + "loss": 0.6675, + "step": 551 + }, + { + "epoch": 0.06782159970512348, + "grad_norm": 2.1051414421299723, + "learning_rate": 2.0343980343980343e-05, + "loss": 0.6527, + "step": 552 + }, + { + "epoch": 0.06794446492198059, + "grad_norm": 2.0667395553548493, + "learning_rate": 2.038083538083538e-05, + "loss": 0.7225, + "step": 553 + }, + { + "epoch": 0.0680673301388377, + "grad_norm": 1.7322573949240014, + "learning_rate": 2.041769041769042e-05, + "loss": 0.701, + "step": 554 + }, + { + "epoch": 0.0681901953556948, + "grad_norm": 2.1567206093813076, + "learning_rate": 2.0454545454545454e-05, + "loss": 0.8044, + "step": 555 + }, + { + "epoch": 0.06831306057255192, + "grad_norm": 2.1658034347403947, + "learning_rate": 2.049140049140049e-05, + "loss": 0.8096, + "step": 556 + }, + { + "epoch": 0.06843592578940902, + "grad_norm": 2.2849138053891163, + "learning_rate": 2.0528255528255528e-05, + "loss": 0.5901, + "step": 557 + }, + { + "epoch": 0.06855879100626612, + "grad_norm": 1.8841753784162607, + "learning_rate": 2.0565110565110565e-05, + "loss": 0.7117, + "step": 558 + }, + { + "epoch": 0.06868165622312324, + "grad_norm": 1.936132153787478, + "learning_rate": 2.0601965601965602e-05, + "loss": 0.6644, + "step": 559 + }, + { + "epoch": 0.06880452143998034, + "grad_norm": 1.9891220416109536, + "learning_rate": 2.063882063882064e-05, + "loss": 0.6325, + "step": 560 + }, + { + "epoch": 0.06892738665683745, + "grad_norm": 1.8962534309087307, + "learning_rate": 2.0675675675675676e-05, + "loss": 0.6545, + "step": 561 + }, + { + "epoch": 0.06905025187369455, + "grad_norm": 2.204048657624188, + "learning_rate": 2.0712530712530713e-05, + "loss": 0.684, + "step": 562 + }, + { + "epoch": 0.06917311709055167, + "grad_norm": 1.9916543130785984, + "learning_rate": 2.074938574938575e-05, + "loss": 0.7059, + "step": 563 + }, + { + "epoch": 0.06929598230740877, + "grad_norm": 2.0544967941324197, + "learning_rate": 2.0786240786240787e-05, + "loss": 0.7746, + "step": 564 + }, + { + "epoch": 0.06941884752426589, + "grad_norm": 1.8693663999584553, + "learning_rate": 
2.0823095823095824e-05, + "loss": 0.7438, + "step": 565 + }, + { + "epoch": 0.06954171274112299, + "grad_norm": 2.1101554923166606, + "learning_rate": 2.085995085995086e-05, + "loss": 0.6561, + "step": 566 + }, + { + "epoch": 0.06966457795798009, + "grad_norm": 2.1169254339568515, + "learning_rate": 2.0896805896805897e-05, + "loss": 0.7129, + "step": 567 + }, + { + "epoch": 0.0697874431748372, + "grad_norm": 1.8470221909127316, + "learning_rate": 2.0933660933660934e-05, + "loss": 0.7379, + "step": 568 + }, + { + "epoch": 0.0699103083916943, + "grad_norm": 1.7109189527029212, + "learning_rate": 2.097051597051597e-05, + "loss": 0.7486, + "step": 569 + }, + { + "epoch": 0.07003317360855142, + "grad_norm": 1.9112601420959396, + "learning_rate": 2.1007371007371008e-05, + "loss": 0.5895, + "step": 570 + }, + { + "epoch": 0.07015603882540852, + "grad_norm": 2.192992153351891, + "learning_rate": 2.1044226044226045e-05, + "loss": 0.667, + "step": 571 + }, + { + "epoch": 0.07027890404226564, + "grad_norm": 1.9278428221726676, + "learning_rate": 2.1081081081081082e-05, + "loss": 0.6867, + "step": 572 + }, + { + "epoch": 0.07040176925912274, + "grad_norm": 1.9372487668278928, + "learning_rate": 2.111793611793612e-05, + "loss": 0.8377, + "step": 573 + }, + { + "epoch": 0.07052463447597986, + "grad_norm": 1.8201532239171367, + "learning_rate": 2.1154791154791156e-05, + "loss": 0.6086, + "step": 574 + }, + { + "epoch": 0.07064749969283696, + "grad_norm": 1.9507494045829414, + "learning_rate": 2.1191646191646193e-05, + "loss": 0.8002, + "step": 575 + }, + { + "epoch": 0.07077036490969407, + "grad_norm": 1.7880748889421596, + "learning_rate": 2.1228501228501227e-05, + "loss": 0.703, + "step": 576 + }, + { + "epoch": 0.07089323012655117, + "grad_norm": 1.9125399549392166, + "learning_rate": 2.1265356265356267e-05, + "loss": 0.6774, + "step": 577 + }, + { + "epoch": 0.07101609534340828, + "grad_norm": 2.3411935773084944, + "learning_rate": 2.1302211302211304e-05, + "loss": 0.7504, + "step": 578 + }, + { + "epoch": 0.07113896056026539, + "grad_norm": 1.8469575071452744, + "learning_rate": 2.1339066339066337e-05, + "loss": 0.5581, + "step": 579 + }, + { + "epoch": 0.07126182577712249, + "grad_norm": 1.980992909925409, + "learning_rate": 2.1375921375921378e-05, + "loss": 0.7456, + "step": 580 + }, + { + "epoch": 0.07138469099397961, + "grad_norm": 1.656736701128086, + "learning_rate": 2.1412776412776415e-05, + "loss": 0.6771, + "step": 581 + }, + { + "epoch": 0.07150755621083671, + "grad_norm": 1.8793876805389826, + "learning_rate": 2.1449631449631448e-05, + "loss": 0.6047, + "step": 582 + }, + { + "epoch": 0.07163042142769382, + "grad_norm": 1.8252532617280828, + "learning_rate": 2.148648648648649e-05, + "loss": 0.6572, + "step": 583 + }, + { + "epoch": 0.07175328664455093, + "grad_norm": 2.7235407375076077, + "learning_rate": 2.1523341523341526e-05, + "loss": 0.7116, + "step": 584 + }, + { + "epoch": 0.07187615186140804, + "grad_norm": 2.0549695427326244, + "learning_rate": 2.156019656019656e-05, + "loss": 0.8645, + "step": 585 + }, + { + "epoch": 0.07199901707826514, + "grad_norm": 1.9662709890153582, + "learning_rate": 2.15970515970516e-05, + "loss": 0.6612, + "step": 586 + }, + { + "epoch": 0.07212188229512224, + "grad_norm": 2.072374302791573, + "learning_rate": 2.1633906633906636e-05, + "loss": 0.82, + "step": 587 + }, + { + "epoch": 0.07224474751197936, + "grad_norm": 1.9232320305518285, + "learning_rate": 2.167076167076167e-05, + "loss": 0.7297, + "step": 588 + }, + { + "epoch": 0.07236761272883646, + 
"grad_norm": 1.8937276450105898, + "learning_rate": 2.170761670761671e-05, + "loss": 0.6319, + "step": 589 + }, + { + "epoch": 0.07249047794569358, + "grad_norm": 2.5708160186337676, + "learning_rate": 2.1744471744471747e-05, + "loss": 0.7812, + "step": 590 + }, + { + "epoch": 0.07261334316255068, + "grad_norm": 2.252257641288314, + "learning_rate": 2.178132678132678e-05, + "loss": 0.6408, + "step": 591 + }, + { + "epoch": 0.0727362083794078, + "grad_norm": 1.9186663630717087, + "learning_rate": 2.1818181818181818e-05, + "loss": 0.6203, + "step": 592 + }, + { + "epoch": 0.0728590735962649, + "grad_norm": 2.070433886871464, + "learning_rate": 2.1855036855036855e-05, + "loss": 0.6898, + "step": 593 + }, + { + "epoch": 0.07298193881312201, + "grad_norm": 1.9948235298812942, + "learning_rate": 2.1891891891891892e-05, + "loss": 0.7608, + "step": 594 + }, + { + "epoch": 0.07310480402997911, + "grad_norm": 1.6439195401470175, + "learning_rate": 2.192874692874693e-05, + "loss": 0.7041, + "step": 595 + }, + { + "epoch": 0.07322766924683623, + "grad_norm": 2.059761775744195, + "learning_rate": 2.1965601965601966e-05, + "loss": 0.7671, + "step": 596 + }, + { + "epoch": 0.07335053446369333, + "grad_norm": 1.9714717947903029, + "learning_rate": 2.2002457002457003e-05, + "loss": 0.704, + "step": 597 + }, + { + "epoch": 0.07347339968055043, + "grad_norm": 2.0066872105060676, + "learning_rate": 2.203931203931204e-05, + "loss": 0.6947, + "step": 598 + }, + { + "epoch": 0.07359626489740755, + "grad_norm": 1.990341662718597, + "learning_rate": 2.2076167076167076e-05, + "loss": 0.736, + "step": 599 + }, + { + "epoch": 0.07371913011426465, + "grad_norm": 2.188100481231796, + "learning_rate": 2.2113022113022113e-05, + "loss": 0.6427, + "step": 600 + }, + { + "epoch": 0.07384199533112176, + "grad_norm": 2.342597054727318, + "learning_rate": 2.214987714987715e-05, + "loss": 0.7819, + "step": 601 + }, + { + "epoch": 0.07396486054797886, + "grad_norm": 2.549170714779742, + "learning_rate": 2.2186732186732187e-05, + "loss": 0.8905, + "step": 602 + }, + { + "epoch": 0.07408772576483598, + "grad_norm": 1.825207964177122, + "learning_rate": 2.2223587223587224e-05, + "loss": 0.7032, + "step": 603 + }, + { + "epoch": 0.07421059098169308, + "grad_norm": 2.0318987454219934, + "learning_rate": 2.226044226044226e-05, + "loss": 0.7042, + "step": 604 + }, + { + "epoch": 0.0743334561985502, + "grad_norm": 1.833453897454606, + "learning_rate": 2.2297297297297298e-05, + "loss": 0.7917, + "step": 605 + }, + { + "epoch": 0.0744563214154073, + "grad_norm": 2.0796188323911218, + "learning_rate": 2.2334152334152335e-05, + "loss": 0.7021, + "step": 606 + }, + { + "epoch": 0.0745791866322644, + "grad_norm": 2.065534963203004, + "learning_rate": 2.2371007371007372e-05, + "loss": 0.7327, + "step": 607 + }, + { + "epoch": 0.07470205184912151, + "grad_norm": 2.1378459006523642, + "learning_rate": 2.2407862407862406e-05, + "loss": 0.6251, + "step": 608 + }, + { + "epoch": 0.07482491706597862, + "grad_norm": 2.0892609805261193, + "learning_rate": 2.2444717444717446e-05, + "loss": 0.7242, + "step": 609 + }, + { + "epoch": 0.07494778228283573, + "grad_norm": 2.1045805079981244, + "learning_rate": 2.2481572481572483e-05, + "loss": 0.7377, + "step": 610 + }, + { + "epoch": 0.07507064749969283, + "grad_norm": 1.8423494101097295, + "learning_rate": 2.2518427518427517e-05, + "loss": 0.7431, + "step": 611 + }, + { + "epoch": 0.07519351271654995, + "grad_norm": 1.816169539963417, + "learning_rate": 2.2555282555282557e-05, + "loss": 0.6797, + "step": 
612 + }, + { + "epoch": 0.07531637793340705, + "grad_norm": 1.786998407621002, + "learning_rate": 2.2592137592137594e-05, + "loss": 0.6533, + "step": 613 + }, + { + "epoch": 0.07543924315026417, + "grad_norm": 1.801343383153808, + "learning_rate": 2.2628992628992627e-05, + "loss": 0.6911, + "step": 614 + }, + { + "epoch": 0.07556210836712127, + "grad_norm": 1.8839246656393371, + "learning_rate": 2.2665847665847668e-05, + "loss": 0.5506, + "step": 615 + }, + { + "epoch": 0.07568497358397838, + "grad_norm": 1.7016074446134384, + "learning_rate": 2.2702702702702705e-05, + "loss": 0.6732, + "step": 616 + }, + { + "epoch": 0.07580783880083548, + "grad_norm": 1.8609990934574652, + "learning_rate": 2.2739557739557738e-05, + "loss": 0.7229, + "step": 617 + }, + { + "epoch": 0.07593070401769259, + "grad_norm": 1.8504435578850045, + "learning_rate": 2.277641277641278e-05, + "loss": 0.7272, + "step": 618 + }, + { + "epoch": 0.0760535692345497, + "grad_norm": 1.9052358697588765, + "learning_rate": 2.2813267813267816e-05, + "loss": 0.6563, + "step": 619 + }, + { + "epoch": 0.0761764344514068, + "grad_norm": 1.7824462202745566, + "learning_rate": 2.285012285012285e-05, + "loss": 0.6875, + "step": 620 + }, + { + "epoch": 0.07629929966826392, + "grad_norm": 1.8378148405768098, + "learning_rate": 2.288697788697789e-05, + "loss": 0.7189, + "step": 621 + }, + { + "epoch": 0.07642216488512102, + "grad_norm": 1.9607908846539674, + "learning_rate": 2.2923832923832926e-05, + "loss": 0.6669, + "step": 622 + }, + { + "epoch": 0.07654503010197813, + "grad_norm": 2.1216298379633556, + "learning_rate": 2.296068796068796e-05, + "loss": 0.7276, + "step": 623 + }, + { + "epoch": 0.07666789531883524, + "grad_norm": 1.5484644418224525, + "learning_rate": 2.2997542997542997e-05, + "loss": 0.7356, + "step": 624 + }, + { + "epoch": 0.07679076053569235, + "grad_norm": 1.888460188688903, + "learning_rate": 2.3034398034398037e-05, + "loss": 0.7807, + "step": 625 + }, + { + "epoch": 0.07691362575254945, + "grad_norm": 2.0666182163393154, + "learning_rate": 2.307125307125307e-05, + "loss": 0.8242, + "step": 626 + }, + { + "epoch": 0.07703649096940655, + "grad_norm": 2.0564807526433806, + "learning_rate": 2.3108108108108108e-05, + "loss": 0.7612, + "step": 627 + }, + { + "epoch": 0.07715935618626367, + "grad_norm": 1.8509144985170571, + "learning_rate": 2.3144963144963148e-05, + "loss": 0.6884, + "step": 628 + }, + { + "epoch": 0.07728222140312077, + "grad_norm": 2.0066058367990136, + "learning_rate": 2.318181818181818e-05, + "loss": 0.6909, + "step": 629 + }, + { + "epoch": 0.07740508661997789, + "grad_norm": 2.286763811516986, + "learning_rate": 2.321867321867322e-05, + "loss": 0.637, + "step": 630 + }, + { + "epoch": 0.07752795183683499, + "grad_norm": 2.0288506621844644, + "learning_rate": 2.3255528255528256e-05, + "loss": 0.8078, + "step": 631 + }, + { + "epoch": 0.0776508170536921, + "grad_norm": 1.8383604308531454, + "learning_rate": 2.3292383292383292e-05, + "loss": 0.7482, + "step": 632 + }, + { + "epoch": 0.0777736822705492, + "grad_norm": 1.7085955089707194, + "learning_rate": 2.332923832923833e-05, + "loss": 0.6243, + "step": 633 + }, + { + "epoch": 0.07789654748740632, + "grad_norm": 1.884760375078464, + "learning_rate": 2.3366093366093366e-05, + "loss": 0.6904, + "step": 634 + }, + { + "epoch": 0.07801941270426342, + "grad_norm": 1.925715746962403, + "learning_rate": 2.3402948402948403e-05, + "loss": 0.6704, + "step": 635 + }, + { + "epoch": 0.07814227792112054, + "grad_norm": 1.9922559251170244, + "learning_rate": 
2.343980343980344e-05, + "loss": 0.8221, + "step": 636 + }, + { + "epoch": 0.07826514313797764, + "grad_norm": 1.766099361963721, + "learning_rate": 2.3476658476658477e-05, + "loss": 0.7942, + "step": 637 + }, + { + "epoch": 0.07838800835483474, + "grad_norm": 3.0823455442009005, + "learning_rate": 2.3513513513513514e-05, + "loss": 0.8466, + "step": 638 + }, + { + "epoch": 0.07851087357169186, + "grad_norm": 1.6894818105936766, + "learning_rate": 2.355036855036855e-05, + "loss": 0.8231, + "step": 639 + }, + { + "epoch": 0.07863373878854896, + "grad_norm": 2.068864290870519, + "learning_rate": 2.3587223587223585e-05, + "loss": 0.6031, + "step": 640 + }, + { + "epoch": 0.07875660400540607, + "grad_norm": 2.2146348425574134, + "learning_rate": 2.3624078624078625e-05, + "loss": 0.741, + "step": 641 + }, + { + "epoch": 0.07887946922226317, + "grad_norm": 1.8544829266401204, + "learning_rate": 2.3660933660933662e-05, + "loss": 0.8268, + "step": 642 + }, + { + "epoch": 0.07900233443912029, + "grad_norm": 2.002310166888022, + "learning_rate": 2.3697788697788696e-05, + "loss": 0.7727, + "step": 643 + }, + { + "epoch": 0.07912519965597739, + "grad_norm": 1.727559049895937, + "learning_rate": 2.3734643734643736e-05, + "loss": 0.7225, + "step": 644 + }, + { + "epoch": 0.0792480648728345, + "grad_norm": 1.7987185503825505, + "learning_rate": 2.3771498771498773e-05, + "loss": 0.7349, + "step": 645 + }, + { + "epoch": 0.07937093008969161, + "grad_norm": 1.6203868054475823, + "learning_rate": 2.3808353808353806e-05, + "loss": 0.7179, + "step": 646 + }, + { + "epoch": 0.07949379530654872, + "grad_norm": 1.7771052096605318, + "learning_rate": 2.3845208845208847e-05, + "loss": 0.7225, + "step": 647 + }, + { + "epoch": 0.07961666052340582, + "grad_norm": 1.8249108319701817, + "learning_rate": 2.3882063882063884e-05, + "loss": 0.7042, + "step": 648 + }, + { + "epoch": 0.07973952574026293, + "grad_norm": 2.3213947728402857, + "learning_rate": 2.3918918918918917e-05, + "loss": 0.6781, + "step": 649 + }, + { + "epoch": 0.07986239095712004, + "grad_norm": 1.661493825137898, + "learning_rate": 2.3955773955773958e-05, + "loss": 0.6693, + "step": 650 + }, + { + "epoch": 0.07998525617397714, + "grad_norm": 1.877934173817582, + "learning_rate": 2.3992628992628995e-05, + "loss": 0.8023, + "step": 651 + }, + { + "epoch": 0.08010812139083426, + "grad_norm": 2.2717249502090158, + "learning_rate": 2.4029484029484028e-05, + "loss": 0.7029, + "step": 652 + }, + { + "epoch": 0.08023098660769136, + "grad_norm": 1.9749875260656726, + "learning_rate": 2.406633906633907e-05, + "loss": 0.7231, + "step": 653 + }, + { + "epoch": 0.08035385182454848, + "grad_norm": 2.0235208059091145, + "learning_rate": 2.4103194103194105e-05, + "loss": 0.6019, + "step": 654 + }, + { + "epoch": 0.08047671704140558, + "grad_norm": 1.9488758119253824, + "learning_rate": 2.414004914004914e-05, + "loss": 0.7002, + "step": 655 + }, + { + "epoch": 0.08059958225826269, + "grad_norm": 2.08201930886419, + "learning_rate": 2.417690417690418e-05, + "loss": 0.7334, + "step": 656 + }, + { + "epoch": 0.0807224474751198, + "grad_norm": 2.3386777455430448, + "learning_rate": 2.4213759213759216e-05, + "loss": 0.7283, + "step": 657 + }, + { + "epoch": 0.0808453126919769, + "grad_norm": 1.9614452408707026, + "learning_rate": 2.425061425061425e-05, + "loss": 0.738, + "step": 658 + }, + { + "epoch": 0.08096817790883401, + "grad_norm": 1.979862302207417, + "learning_rate": 2.4287469287469287e-05, + "loss": 0.5718, + "step": 659 + }, + { + "epoch": 0.08109104312569111, + 
"grad_norm": 2.368414824043649, + "learning_rate": 2.4324324324324327e-05, + "loss": 0.7385, + "step": 660 + }, + { + "epoch": 0.08121390834254823, + "grad_norm": 2.0636374038579532, + "learning_rate": 2.436117936117936e-05, + "loss": 0.6717, + "step": 661 + }, + { + "epoch": 0.08133677355940533, + "grad_norm": 1.952024538904434, + "learning_rate": 2.4398034398034398e-05, + "loss": 0.6759, + "step": 662 + }, + { + "epoch": 0.08145963877626244, + "grad_norm": 2.100027352994839, + "learning_rate": 2.4434889434889438e-05, + "loss": 0.6236, + "step": 663 + }, + { + "epoch": 0.08158250399311955, + "grad_norm": 1.7057289166671867, + "learning_rate": 2.447174447174447e-05, + "loss": 0.8087, + "step": 664 + }, + { + "epoch": 0.08170536920997666, + "grad_norm": 1.6877249122975946, + "learning_rate": 2.450859950859951e-05, + "loss": 0.7088, + "step": 665 + }, + { + "epoch": 0.08182823442683376, + "grad_norm": 1.7851300649246822, + "learning_rate": 2.454545454545455e-05, + "loss": 0.6561, + "step": 666 + }, + { + "epoch": 0.08195109964369088, + "grad_norm": 1.9128002833242168, + "learning_rate": 2.4582309582309582e-05, + "loss": 0.7204, + "step": 667 + }, + { + "epoch": 0.08207396486054798, + "grad_norm": 2.0009437988322913, + "learning_rate": 2.461916461916462e-05, + "loss": 0.7596, + "step": 668 + }, + { + "epoch": 0.08219683007740508, + "grad_norm": 1.761803061763879, + "learning_rate": 2.465601965601966e-05, + "loss": 0.7069, + "step": 669 + }, + { + "epoch": 0.0823196952942622, + "grad_norm": 1.7775113283965929, + "learning_rate": 2.4692874692874693e-05, + "loss": 0.8083, + "step": 670 + }, + { + "epoch": 0.0824425605111193, + "grad_norm": 1.9434199576965636, + "learning_rate": 2.472972972972973e-05, + "loss": 0.7583, + "step": 671 + }, + { + "epoch": 0.08256542572797641, + "grad_norm": 1.931002366681942, + "learning_rate": 2.4766584766584767e-05, + "loss": 0.6696, + "step": 672 + }, + { + "epoch": 0.08268829094483351, + "grad_norm": 1.801609016878661, + "learning_rate": 2.4803439803439804e-05, + "loss": 0.8318, + "step": 673 + }, + { + "epoch": 0.08281115616169063, + "grad_norm": 2.37412428611428, + "learning_rate": 2.484029484029484e-05, + "loss": 0.6725, + "step": 674 + }, + { + "epoch": 0.08293402137854773, + "grad_norm": 2.191037019069383, + "learning_rate": 2.4877149877149875e-05, + "loss": 0.8105, + "step": 675 + }, + { + "epoch": 0.08305688659540485, + "grad_norm": 1.9750275927718735, + "learning_rate": 2.4914004914004915e-05, + "loss": 0.7909, + "step": 676 + }, + { + "epoch": 0.08317975181226195, + "grad_norm": 1.7551641645196254, + "learning_rate": 2.4950859950859952e-05, + "loss": 0.7162, + "step": 677 + }, + { + "epoch": 0.08330261702911905, + "grad_norm": 2.257952607382702, + "learning_rate": 2.4987714987714985e-05, + "loss": 0.8187, + "step": 678 + }, + { + "epoch": 0.08342548224597617, + "grad_norm": 1.720034328567471, + "learning_rate": 2.5024570024570026e-05, + "loss": 0.7066, + "step": 679 + }, + { + "epoch": 0.08354834746283327, + "grad_norm": 1.8773052678784248, + "learning_rate": 2.5061425061425063e-05, + "loss": 0.7023, + "step": 680 + }, + { + "epoch": 0.08367121267969038, + "grad_norm": 1.6375229187289637, + "learning_rate": 2.5098280098280096e-05, + "loss": 0.7372, + "step": 681 + }, + { + "epoch": 0.08379407789654748, + "grad_norm": 2.1653577302506095, + "learning_rate": 2.5135135135135137e-05, + "loss": 0.8151, + "step": 682 + }, + { + "epoch": 0.0839169431134046, + "grad_norm": 2.0128394665439453, + "learning_rate": 2.5171990171990174e-05, + "loss": 0.8008, + "step": 
683 + }, + { + "epoch": 0.0840398083302617, + "grad_norm": 2.199399863364831, + "learning_rate": 2.5208845208845207e-05, + "loss": 0.6506, + "step": 684 + }, + { + "epoch": 0.08416267354711882, + "grad_norm": 1.9290279569809399, + "learning_rate": 2.5245700245700248e-05, + "loss": 0.7927, + "step": 685 + }, + { + "epoch": 0.08428553876397592, + "grad_norm": 2.0440955486621015, + "learning_rate": 2.5282555282555284e-05, + "loss": 0.7259, + "step": 686 + }, + { + "epoch": 0.08440840398083303, + "grad_norm": 1.5633787830392318, + "learning_rate": 2.5319410319410318e-05, + "loss": 0.7643, + "step": 687 + }, + { + "epoch": 0.08453126919769013, + "grad_norm": 1.9141925847656822, + "learning_rate": 2.535626535626536e-05, + "loss": 0.7375, + "step": 688 + }, + { + "epoch": 0.08465413441454724, + "grad_norm": 1.9354248054621102, + "learning_rate": 2.5393120393120395e-05, + "loss": 0.6888, + "step": 689 + }, + { + "epoch": 0.08477699963140435, + "grad_norm": 1.9509715097857399, + "learning_rate": 2.542997542997543e-05, + "loss": 0.6953, + "step": 690 + }, + { + "epoch": 0.08489986484826145, + "grad_norm": 1.7122612796926369, + "learning_rate": 2.5466830466830466e-05, + "loss": 0.7513, + "step": 691 + }, + { + "epoch": 0.08502273006511857, + "grad_norm": 1.878370508749116, + "learning_rate": 2.5503685503685506e-05, + "loss": 0.7067, + "step": 692 + }, + { + "epoch": 0.08514559528197567, + "grad_norm": 2.133843265423579, + "learning_rate": 2.554054054054054e-05, + "loss": 0.7008, + "step": 693 + }, + { + "epoch": 0.08526846049883278, + "grad_norm": 2.039041027140724, + "learning_rate": 2.5577395577395577e-05, + "loss": 0.7189, + "step": 694 + }, + { + "epoch": 0.08539132571568989, + "grad_norm": 1.6886245257068158, + "learning_rate": 2.5614250614250617e-05, + "loss": 0.7496, + "step": 695 + }, + { + "epoch": 0.085514190932547, + "grad_norm": 1.9994119629392992, + "learning_rate": 2.565110565110565e-05, + "loss": 0.6969, + "step": 696 + }, + { + "epoch": 0.0856370561494041, + "grad_norm": 2.1815283157048175, + "learning_rate": 2.5687960687960688e-05, + "loss": 0.733, + "step": 697 + }, + { + "epoch": 0.0857599213662612, + "grad_norm": 1.790726850633851, + "learning_rate": 2.5724815724815728e-05, + "loss": 0.6345, + "step": 698 + }, + { + "epoch": 0.08588278658311832, + "grad_norm": 1.8136074284309402, + "learning_rate": 2.576167076167076e-05, + "loss": 0.8141, + "step": 699 + }, + { + "epoch": 0.08600565179997542, + "grad_norm": 1.8552977846941467, + "learning_rate": 2.57985257985258e-05, + "loss": 0.7642, + "step": 700 + }, + { + "epoch": 0.08612851701683254, + "grad_norm": 2.1836400493024777, + "learning_rate": 2.583538083538084e-05, + "loss": 0.7579, + "step": 701 + }, + { + "epoch": 0.08625138223368964, + "grad_norm": 1.6784475462964883, + "learning_rate": 2.5872235872235872e-05, + "loss": 0.643, + "step": 702 + }, + { + "epoch": 0.08637424745054675, + "grad_norm": 1.9106768723067558, + "learning_rate": 2.590909090909091e-05, + "loss": 0.7653, + "step": 703 + }, + { + "epoch": 0.08649711266740386, + "grad_norm": 1.8164383360368452, + "learning_rate": 2.594594594594595e-05, + "loss": 0.6836, + "step": 704 + }, + { + "epoch": 0.08661997788426097, + "grad_norm": 2.1218487241891184, + "learning_rate": 2.5982800982800983e-05, + "loss": 0.8274, + "step": 705 + }, + { + "epoch": 0.08674284310111807, + "grad_norm": 1.9557454965061531, + "learning_rate": 2.601965601965602e-05, + "loss": 0.7209, + "step": 706 + }, + { + "epoch": 0.08686570831797519, + "grad_norm": 1.830276620205532, + "learning_rate": 
2.6056511056511057e-05, + "loss": 0.7363, + "step": 707 + }, + { + "epoch": 0.08698857353483229, + "grad_norm": 1.76407245777333, + "learning_rate": 2.6093366093366094e-05, + "loss": 0.6053, + "step": 708 + }, + { + "epoch": 0.08711143875168939, + "grad_norm": 1.897899819347947, + "learning_rate": 2.613022113022113e-05, + "loss": 0.7802, + "step": 709 + }, + { + "epoch": 0.0872343039685465, + "grad_norm": 1.919438263381118, + "learning_rate": 2.6167076167076168e-05, + "loss": 0.7213, + "step": 710 + }, + { + "epoch": 0.08735716918540361, + "grad_norm": 1.7251714178748108, + "learning_rate": 2.6203931203931205e-05, + "loss": 0.7453, + "step": 711 + }, + { + "epoch": 0.08748003440226072, + "grad_norm": 1.898155873271899, + "learning_rate": 2.6240786240786242e-05, + "loss": 0.7317, + "step": 712 + }, + { + "epoch": 0.08760289961911782, + "grad_norm": 1.793396232243243, + "learning_rate": 2.6277641277641275e-05, + "loss": 0.7684, + "step": 713 + }, + { + "epoch": 0.08772576483597494, + "grad_norm": 1.8199359460694664, + "learning_rate": 2.6314496314496316e-05, + "loss": 0.7441, + "step": 714 + }, + { + "epoch": 0.08784863005283204, + "grad_norm": 1.7570280510112468, + "learning_rate": 2.6351351351351353e-05, + "loss": 0.6544, + "step": 715 + }, + { + "epoch": 0.08797149526968916, + "grad_norm": 2.0289327165183506, + "learning_rate": 2.6388206388206386e-05, + "loss": 0.7242, + "step": 716 + }, + { + "epoch": 0.08809436048654626, + "grad_norm": 2.39705719088832, + "learning_rate": 2.6425061425061427e-05, + "loss": 0.7362, + "step": 717 + }, + { + "epoch": 0.08821722570340336, + "grad_norm": 2.048719459912594, + "learning_rate": 2.6461916461916464e-05, + "loss": 0.661, + "step": 718 + }, + { + "epoch": 0.08834009092026048, + "grad_norm": 1.9934833821034068, + "learning_rate": 2.6498771498771497e-05, + "loss": 0.6636, + "step": 719 + }, + { + "epoch": 0.08846295613711758, + "grad_norm": 1.6935359207095917, + "learning_rate": 2.6535626535626537e-05, + "loss": 0.6872, + "step": 720 + }, + { + "epoch": 0.08858582135397469, + "grad_norm": 1.6920369655935517, + "learning_rate": 2.6572481572481574e-05, + "loss": 0.6154, + "step": 721 + }, + { + "epoch": 0.0887086865708318, + "grad_norm": 2.0760174108611733, + "learning_rate": 2.6609336609336608e-05, + "loss": 0.7297, + "step": 722 + }, + { + "epoch": 0.08883155178768891, + "grad_norm": 2.077259433286552, + "learning_rate": 2.6646191646191645e-05, + "loss": 0.6294, + "step": 723 + }, + { + "epoch": 0.08895441700454601, + "grad_norm": 2.410904790531113, + "learning_rate": 2.6683046683046685e-05, + "loss": 0.7154, + "step": 724 + }, + { + "epoch": 0.08907728222140313, + "grad_norm": 1.9130010748639648, + "learning_rate": 2.671990171990172e-05, + "loss": 0.8182, + "step": 725 + }, + { + "epoch": 0.08920014743826023, + "grad_norm": 1.7996594405288788, + "learning_rate": 2.6756756756756756e-05, + "loss": 0.6928, + "step": 726 + }, + { + "epoch": 0.08932301265511734, + "grad_norm": 2.4717604219950795, + "learning_rate": 2.6793611793611796e-05, + "loss": 0.7318, + "step": 727 + }, + { + "epoch": 0.08944587787197444, + "grad_norm": 1.7841440740342938, + "learning_rate": 2.683046683046683e-05, + "loss": 0.6427, + "step": 728 + }, + { + "epoch": 0.08956874308883155, + "grad_norm": 2.122793042974321, + "learning_rate": 2.6867321867321867e-05, + "loss": 0.8501, + "step": 729 + }, + { + "epoch": 0.08969160830568866, + "grad_norm": 1.8641386492434562, + "learning_rate": 2.6904176904176907e-05, + "loss": 0.6758, + "step": 730 + }, + { + "epoch": 0.08981447352254576, + 
"grad_norm": 1.8529403818197823, + "learning_rate": 2.694103194103194e-05, + "loss": 0.6809, + "step": 731 + }, + { + "epoch": 0.08993733873940288, + "grad_norm": 1.8844533664095007, + "learning_rate": 2.6977886977886977e-05, + "loss": 0.8055, + "step": 732 + }, + { + "epoch": 0.09006020395625998, + "grad_norm": 2.205152780988247, + "learning_rate": 2.7014742014742018e-05, + "loss": 0.6133, + "step": 733 + }, + { + "epoch": 0.0901830691731171, + "grad_norm": 1.8738690379163438, + "learning_rate": 2.705159705159705e-05, + "loss": 0.721, + "step": 734 + }, + { + "epoch": 0.0903059343899742, + "grad_norm": 1.9276824342763887, + "learning_rate": 2.708845208845209e-05, + "loss": 0.7057, + "step": 735 + }, + { + "epoch": 0.09042879960683131, + "grad_norm": 1.8151668937020267, + "learning_rate": 2.712530712530713e-05, + "loss": 0.6927, + "step": 736 + }, + { + "epoch": 0.09055166482368841, + "grad_norm": 2.0410520180397618, + "learning_rate": 2.7162162162162162e-05, + "loss": 0.7571, + "step": 737 + }, + { + "epoch": 0.09067453004054553, + "grad_norm": 1.6490851547190442, + "learning_rate": 2.71990171990172e-05, + "loss": 0.6468, + "step": 738 + }, + { + "epoch": 0.09079739525740263, + "grad_norm": 1.7687358648071572, + "learning_rate": 2.7235872235872236e-05, + "loss": 0.6225, + "step": 739 + }, + { + "epoch": 0.09092026047425973, + "grad_norm": 2.021049230200209, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.7445, + "step": 740 + }, + { + "epoch": 0.09104312569111685, + "grad_norm": 2.01425907255038, + "learning_rate": 2.730958230958231e-05, + "loss": 0.7138, + "step": 741 + }, + { + "epoch": 0.09116599090797395, + "grad_norm": 2.199930315619395, + "learning_rate": 2.7346437346437347e-05, + "loss": 0.7787, + "step": 742 + }, + { + "epoch": 0.09128885612483106, + "grad_norm": 1.744512084045899, + "learning_rate": 2.7383292383292384e-05, + "loss": 0.6105, + "step": 743 + }, + { + "epoch": 0.09141172134168817, + "grad_norm": 1.7937909360538287, + "learning_rate": 2.742014742014742e-05, + "loss": 0.7318, + "step": 744 + }, + { + "epoch": 0.09153458655854528, + "grad_norm": 1.742012851978804, + "learning_rate": 2.7457002457002458e-05, + "loss": 0.6268, + "step": 745 + }, + { + "epoch": 0.09165745177540238, + "grad_norm": 1.7848459168400745, + "learning_rate": 2.7493857493857495e-05, + "loss": 0.5972, + "step": 746 + }, + { + "epoch": 0.0917803169922595, + "grad_norm": 1.7476609982271176, + "learning_rate": 2.7530712530712532e-05, + "loss": 0.6487, + "step": 747 + }, + { + "epoch": 0.0919031822091166, + "grad_norm": 1.814878806880183, + "learning_rate": 2.756756756756757e-05, + "loss": 0.7957, + "step": 748 + }, + { + "epoch": 0.0920260474259737, + "grad_norm": 1.9976664762703271, + "learning_rate": 2.7604422604422606e-05, + "loss": 0.801, + "step": 749 + }, + { + "epoch": 0.09214891264283082, + "grad_norm": 1.876846877310892, + "learning_rate": 2.7641277641277643e-05, + "loss": 0.8528, + "step": 750 + }, + { + "epoch": 0.09227177785968792, + "grad_norm": 1.7545647846128416, + "learning_rate": 2.767813267813268e-05, + "loss": 0.847, + "step": 751 + }, + { + "epoch": 0.09239464307654503, + "grad_norm": 1.8031723387185685, + "learning_rate": 2.7714987714987717e-05, + "loss": 0.754, + "step": 752 + }, + { + "epoch": 0.09251750829340213, + "grad_norm": 1.797273438904846, + "learning_rate": 2.7751842751842753e-05, + "loss": 0.5849, + "step": 753 + }, + { + "epoch": 0.09264037351025925, + "grad_norm": 1.750646620722912, + "learning_rate": 2.7788697788697787e-05, + "loss": 0.7259, + "step": 754 + 
}, + { + "epoch": 0.09276323872711635, + "grad_norm": 1.8294502122465757, + "learning_rate": 2.7825552825552827e-05, + "loss": 0.714, + "step": 755 + }, + { + "epoch": 0.09288610394397347, + "grad_norm": 2.3523571492091633, + "learning_rate": 2.7862407862407864e-05, + "loss": 0.7442, + "step": 756 + }, + { + "epoch": 0.09300896916083057, + "grad_norm": 1.9997820418717616, + "learning_rate": 2.7899262899262898e-05, + "loss": 0.684, + "step": 757 + }, + { + "epoch": 0.09313183437768768, + "grad_norm": 1.9949969304756525, + "learning_rate": 2.7936117936117935e-05, + "loss": 0.6808, + "step": 758 + }, + { + "epoch": 0.09325469959454478, + "grad_norm": 1.7925039543647958, + "learning_rate": 2.7972972972972975e-05, + "loss": 0.8131, + "step": 759 + }, + { + "epoch": 0.09337756481140189, + "grad_norm": 1.8136271698608502, + "learning_rate": 2.800982800982801e-05, + "loss": 0.7519, + "step": 760 + }, + { + "epoch": 0.093500430028259, + "grad_norm": 2.152903615096727, + "learning_rate": 2.8046683046683046e-05, + "loss": 0.7839, + "step": 761 + }, + { + "epoch": 0.0936232952451161, + "grad_norm": 2.019726412734734, + "learning_rate": 2.8083538083538086e-05, + "loss": 0.6159, + "step": 762 + }, + { + "epoch": 0.09374616046197322, + "grad_norm": 1.7123898376173292, + "learning_rate": 2.812039312039312e-05, + "loss": 0.6471, + "step": 763 + }, + { + "epoch": 0.09386902567883032, + "grad_norm": 1.7456603706753775, + "learning_rate": 2.8157248157248157e-05, + "loss": 0.7607, + "step": 764 + }, + { + "epoch": 0.09399189089568744, + "grad_norm": 2.0557589121048774, + "learning_rate": 2.8194103194103197e-05, + "loss": 0.6497, + "step": 765 + }, + { + "epoch": 0.09411475611254454, + "grad_norm": 1.8273794204935183, + "learning_rate": 2.823095823095823e-05, + "loss": 0.7724, + "step": 766 + }, + { + "epoch": 0.09423762132940165, + "grad_norm": 1.7920851693299429, + "learning_rate": 2.8267813267813267e-05, + "loss": 0.7578, + "step": 767 + }, + { + "epoch": 0.09436048654625875, + "grad_norm": 1.7897970751219825, + "learning_rate": 2.8304668304668308e-05, + "loss": 0.929, + "step": 768 + }, + { + "epoch": 0.09448335176311586, + "grad_norm": 2.2052655684371123, + "learning_rate": 2.834152334152334e-05, + "loss": 0.7738, + "step": 769 + }, + { + "epoch": 0.09460621697997297, + "grad_norm": 1.6941956657393724, + "learning_rate": 2.8378378378378378e-05, + "loss": 0.7234, + "step": 770 + }, + { + "epoch": 0.09472908219683007, + "grad_norm": 1.6576057753979319, + "learning_rate": 2.841523341523342e-05, + "loss": 0.7708, + "step": 771 + }, + { + "epoch": 0.09485194741368719, + "grad_norm": 1.6485550791361792, + "learning_rate": 2.8452088452088452e-05, + "loss": 0.6292, + "step": 772 + }, + { + "epoch": 0.09497481263054429, + "grad_norm": 1.7515084019243152, + "learning_rate": 2.848894348894349e-05, + "loss": 0.6632, + "step": 773 + }, + { + "epoch": 0.0950976778474014, + "grad_norm": 2.015025921159922, + "learning_rate": 2.8525798525798526e-05, + "loss": 0.6529, + "step": 774 + }, + { + "epoch": 0.0952205430642585, + "grad_norm": 1.6490860331979154, + "learning_rate": 2.8562653562653563e-05, + "loss": 0.6888, + "step": 775 + }, + { + "epoch": 0.09534340828111562, + "grad_norm": 1.723817595711864, + "learning_rate": 2.85995085995086e-05, + "loss": 0.7561, + "step": 776 + }, + { + "epoch": 0.09546627349797272, + "grad_norm": 1.7727870879813197, + "learning_rate": 2.8636363636363637e-05, + "loss": 0.7156, + "step": 777 + }, + { + "epoch": 0.09558913871482984, + "grad_norm": 1.767167770446803, + "learning_rate": 
2.8673218673218674e-05, + "loss": 0.6408, + "step": 778 + }, + { + "epoch": 0.09571200393168694, + "grad_norm": 1.7485732361363484, + "learning_rate": 2.871007371007371e-05, + "loss": 0.67, + "step": 779 + }, + { + "epoch": 0.09583486914854404, + "grad_norm": 1.8606602335017313, + "learning_rate": 2.8746928746928748e-05, + "loss": 0.6906, + "step": 780 + }, + { + "epoch": 0.09595773436540116, + "grad_norm": 1.534330702737851, + "learning_rate": 2.8783783783783785e-05, + "loss": 0.6914, + "step": 781 + }, + { + "epoch": 0.09608059958225826, + "grad_norm": 1.602667890550802, + "learning_rate": 2.882063882063882e-05, + "loss": 0.7187, + "step": 782 + }, + { + "epoch": 0.09620346479911537, + "grad_norm": 1.839623507005776, + "learning_rate": 2.885749385749386e-05, + "loss": 0.6185, + "step": 783 + }, + { + "epoch": 0.09632633001597248, + "grad_norm": 1.9440183623512557, + "learning_rate": 2.8894348894348896e-05, + "loss": 0.6713, + "step": 784 + }, + { + "epoch": 0.09644919523282959, + "grad_norm": 1.8177161917277385, + "learning_rate": 2.8931203931203933e-05, + "loss": 0.6296, + "step": 785 + }, + { + "epoch": 0.09657206044968669, + "grad_norm": 1.7068180447676822, + "learning_rate": 2.896805896805897e-05, + "loss": 0.6355, + "step": 786 + }, + { + "epoch": 0.09669492566654381, + "grad_norm": 2.1379981086390845, + "learning_rate": 2.9004914004914006e-05, + "loss": 0.7311, + "step": 787 + }, + { + "epoch": 0.09681779088340091, + "grad_norm": 2.0726508968495483, + "learning_rate": 2.9041769041769043e-05, + "loss": 0.6938, + "step": 788 + }, + { + "epoch": 0.09694065610025801, + "grad_norm": 1.8011049088612894, + "learning_rate": 2.907862407862408e-05, + "loss": 0.7179, + "step": 789 + }, + { + "epoch": 0.09706352131711513, + "grad_norm": 1.8610689684907293, + "learning_rate": 2.9115479115479114e-05, + "loss": 0.683, + "step": 790 + }, + { + "epoch": 0.09718638653397223, + "grad_norm": 1.936700246687565, + "learning_rate": 2.9152334152334154e-05, + "loss": 0.7167, + "step": 791 + }, + { + "epoch": 0.09730925175082934, + "grad_norm": 1.979351549352123, + "learning_rate": 2.918918918918919e-05, + "loss": 0.6801, + "step": 792 + }, + { + "epoch": 0.09743211696768644, + "grad_norm": 1.8232765587262365, + "learning_rate": 2.9226044226044225e-05, + "loss": 0.753, + "step": 793 + }, + { + "epoch": 0.09755498218454356, + "grad_norm": 1.7183954512651585, + "learning_rate": 2.9262899262899265e-05, + "loss": 0.6029, + "step": 794 + }, + { + "epoch": 0.09767784740140066, + "grad_norm": 1.5055830780147417, + "learning_rate": 2.92997542997543e-05, + "loss": 0.7741, + "step": 795 + }, + { + "epoch": 0.09780071261825778, + "grad_norm": 1.7353704153614988, + "learning_rate": 2.9336609336609336e-05, + "loss": 0.6684, + "step": 796 + }, + { + "epoch": 0.09792357783511488, + "grad_norm": 1.9806485034265893, + "learning_rate": 2.9373464373464376e-05, + "loss": 0.8293, + "step": 797 + }, + { + "epoch": 0.098046443051972, + "grad_norm": 1.6642103299188327, + "learning_rate": 2.941031941031941e-05, + "loss": 0.6977, + "step": 798 + }, + { + "epoch": 0.0981693082688291, + "grad_norm": 1.7666901647672828, + "learning_rate": 2.9447174447174446e-05, + "loss": 0.7435, + "step": 799 + }, + { + "epoch": 0.0982921734856862, + "grad_norm": 1.9059673713649457, + "learning_rate": 2.9484029484029487e-05, + "loss": 0.6279, + "step": 800 + }, + { + "epoch": 0.09841503870254331, + "grad_norm": 1.6331573677058462, + "learning_rate": 2.952088452088452e-05, + "loss": 0.7214, + "step": 801 + }, + { + "epoch": 0.09853790391940041, + 
"grad_norm": 2.2368254305739637, + "learning_rate": 2.9557739557739557e-05, + "loss": 0.6907, + "step": 802 + }, + { + "epoch": 0.09866076913625753, + "grad_norm": 1.8620822521949996, + "learning_rate": 2.9594594594594598e-05, + "loss": 0.5667, + "step": 803 + }, + { + "epoch": 0.09878363435311463, + "grad_norm": 1.8405431999482347, + "learning_rate": 2.963144963144963e-05, + "loss": 0.6636, + "step": 804 + }, + { + "epoch": 0.09890649956997175, + "grad_norm": 2.1524082198062513, + "learning_rate": 2.9668304668304668e-05, + "loss": 0.6005, + "step": 805 + }, + { + "epoch": 0.09902936478682885, + "grad_norm": 1.961454296800343, + "learning_rate": 2.9705159705159705e-05, + "loss": 0.7756, + "step": 806 + }, + { + "epoch": 0.09915223000368596, + "grad_norm": 2.296145582468531, + "learning_rate": 2.9742014742014742e-05, + "loss": 0.7254, + "step": 807 + }, + { + "epoch": 0.09927509522054306, + "grad_norm": 1.636090386743313, + "learning_rate": 2.977886977886978e-05, + "loss": 0.646, + "step": 808 + }, + { + "epoch": 0.09939796043740017, + "grad_norm": 1.7278840839888492, + "learning_rate": 2.9815724815724816e-05, + "loss": 0.7, + "step": 809 + }, + { + "epoch": 0.09952082565425728, + "grad_norm": 1.7718539064940368, + "learning_rate": 2.9852579852579853e-05, + "loss": 0.5874, + "step": 810 + }, + { + "epoch": 0.09964369087111438, + "grad_norm": 1.7569748035378938, + "learning_rate": 2.988943488943489e-05, + "loss": 0.6211, + "step": 811 + }, + { + "epoch": 0.0997665560879715, + "grad_norm": 1.783174233042772, + "learning_rate": 2.9926289926289927e-05, + "loss": 0.6519, + "step": 812 + }, + { + "epoch": 0.0998894213048286, + "grad_norm": 1.8715097862635965, + "learning_rate": 2.9963144963144964e-05, + "loss": 0.8062, + "step": 813 + }, + { + "epoch": 0.10001228652168571, + "grad_norm": 1.8322324042239617, + "learning_rate": 3e-05, + "loss": 0.6869, + "step": 814 + }, + { + "epoch": 0.10013515173854282, + "grad_norm": 1.9110877211728614, + "learning_rate": 2.999999862042364e-05, + "loss": 0.6044, + "step": 815 + }, + { + "epoch": 0.10025801695539993, + "grad_norm": 1.920788411540279, + "learning_rate": 2.999999448169481e-05, + "loss": 0.736, + "step": 816 + }, + { + "epoch": 0.10038088217225703, + "grad_norm": 2.0113341101147726, + "learning_rate": 2.9999987583814276e-05, + "loss": 0.6973, + "step": 817 + }, + { + "epoch": 0.10050374738911415, + "grad_norm": 2.198214750382668, + "learning_rate": 2.9999977926783303e-05, + "loss": 0.7296, + "step": 818 + }, + { + "epoch": 0.10062661260597125, + "grad_norm": 1.9945483589847353, + "learning_rate": 2.999996551060367e-05, + "loss": 0.6734, + "step": 819 + }, + { + "epoch": 0.10074947782282835, + "grad_norm": 1.8418330843375452, + "learning_rate": 2.999995033527766e-05, + "loss": 0.6548, + "step": 820 + }, + { + "epoch": 0.10087234303968547, + "grad_norm": 1.7601094012802614, + "learning_rate": 2.999993240080806e-05, + "loss": 0.7416, + "step": 821 + }, + { + "epoch": 0.10099520825654257, + "grad_norm": 1.726841900632902, + "learning_rate": 2.9999911707198176e-05, + "loss": 0.7876, + "step": 822 + }, + { + "epoch": 0.10111807347339968, + "grad_norm": 1.7548151254345847, + "learning_rate": 2.999988825445181e-05, + "loss": 0.7137, + "step": 823 + }, + { + "epoch": 0.10124093869025678, + "grad_norm": 1.8549727989339913, + "learning_rate": 2.999986204257328e-05, + "loss": 0.849, + "step": 824 + }, + { + "epoch": 0.1013638039071139, + "grad_norm": 1.793482004941933, + "learning_rate": 2.9999833071567397e-05, + "loss": 0.7766, + "step": 825 + }, + { + 
"epoch": 0.101486669123971, + "grad_norm": 2.0761075922881984, + "learning_rate": 2.9999801341439506e-05, + "loss": 0.7363, + "step": 826 + }, + { + "epoch": 0.10160953434082812, + "grad_norm": 1.7202202444656944, + "learning_rate": 2.999976685219543e-05, + "loss": 0.714, + "step": 827 + }, + { + "epoch": 0.10173239955768522, + "grad_norm": 2.059747992382795, + "learning_rate": 2.9999729603841524e-05, + "loss": 0.8224, + "step": 828 + }, + { + "epoch": 0.10185526477454233, + "grad_norm": 1.6925348112811556, + "learning_rate": 2.999968959638463e-05, + "loss": 0.6544, + "step": 829 + }, + { + "epoch": 0.10197812999139944, + "grad_norm": 2.0761179667686593, + "learning_rate": 2.999964682983211e-05, + "loss": 0.6533, + "step": 830 + }, + { + "epoch": 0.10210099520825654, + "grad_norm": 1.7975177001510814, + "learning_rate": 2.9999601304191835e-05, + "loss": 0.7377, + "step": 831 + }, + { + "epoch": 0.10222386042511365, + "grad_norm": 2.2815351114743216, + "learning_rate": 2.9999553019472177e-05, + "loss": 0.7976, + "step": 832 + }, + { + "epoch": 0.10234672564197075, + "grad_norm": 2.160673375146874, + "learning_rate": 2.9999501975682015e-05, + "loss": 0.6293, + "step": 833 + }, + { + "epoch": 0.10246959085882787, + "grad_norm": 1.9060779401388181, + "learning_rate": 2.9999448172830738e-05, + "loss": 0.6915, + "step": 834 + }, + { + "epoch": 0.10259245607568497, + "grad_norm": 1.7016832348007465, + "learning_rate": 2.9999391610928247e-05, + "loss": 0.7251, + "step": 835 + }, + { + "epoch": 0.10271532129254209, + "grad_norm": 1.7298757262256934, + "learning_rate": 2.999933228998494e-05, + "loss": 0.6793, + "step": 836 + }, + { + "epoch": 0.10283818650939919, + "grad_norm": 1.6895229958826614, + "learning_rate": 2.9999270210011737e-05, + "loss": 0.7324, + "step": 837 + }, + { + "epoch": 0.1029610517262563, + "grad_norm": 1.7987936410199106, + "learning_rate": 2.999920537102005e-05, + "loss": 0.6969, + "step": 838 + }, + { + "epoch": 0.1030839169431134, + "grad_norm": 1.8137962721660554, + "learning_rate": 2.9999137773021807e-05, + "loss": 0.6411, + "step": 839 + }, + { + "epoch": 0.1032067821599705, + "grad_norm": 1.8779149770775954, + "learning_rate": 2.9999067416029446e-05, + "loss": 0.7585, + "step": 840 + }, + { + "epoch": 0.10332964737682762, + "grad_norm": 1.7359599431781119, + "learning_rate": 2.9998994300055905e-05, + "loss": 0.7239, + "step": 841 + }, + { + "epoch": 0.10345251259368472, + "grad_norm": 1.6836336979403232, + "learning_rate": 2.9998918425114633e-05, + "loss": 0.6164, + "step": 842 + }, + { + "epoch": 0.10357537781054184, + "grad_norm": 1.8202252092638256, + "learning_rate": 2.9998839791219593e-05, + "loss": 0.7304, + "step": 843 + }, + { + "epoch": 0.10369824302739894, + "grad_norm": 1.7102109385460997, + "learning_rate": 2.999875839838524e-05, + "loss": 0.6972, + "step": 844 + }, + { + "epoch": 0.10382110824425606, + "grad_norm": 1.681122626293071, + "learning_rate": 2.999867424662655e-05, + "loss": 0.7113, + "step": 845 + }, + { + "epoch": 0.10394397346111316, + "grad_norm": 1.7979892047752892, + "learning_rate": 2.9998587335959002e-05, + "loss": 0.7173, + "step": 846 + }, + { + "epoch": 0.10406683867797027, + "grad_norm": 1.7933612570861321, + "learning_rate": 2.9998497666398586e-05, + "loss": 0.678, + "step": 847 + }, + { + "epoch": 0.10418970389482737, + "grad_norm": 1.6856806772157973, + "learning_rate": 2.999840523796179e-05, + "loss": 0.6144, + "step": 848 + }, + { + "epoch": 0.10431256911168449, + "grad_norm": 1.5769254666206316, + "learning_rate": 
2.9998310050665622e-05, + "loss": 0.7458, + "step": 849 + }, + { + "epoch": 0.10443543432854159, + "grad_norm": 1.582085693581304, + "learning_rate": 2.9998212104527582e-05, + "loss": 0.6019, + "step": 850 + }, + { + "epoch": 0.10455829954539869, + "grad_norm": 1.993783243125877, + "learning_rate": 2.9998111399565696e-05, + "loss": 0.7048, + "step": 851 + }, + { + "epoch": 0.10468116476225581, + "grad_norm": 1.6749058902865683, + "learning_rate": 2.9998007935798486e-05, + "loss": 0.5928, + "step": 852 + }, + { + "epoch": 0.10480402997911291, + "grad_norm": 1.922134106252314, + "learning_rate": 2.999790171324498e-05, + "loss": 0.6484, + "step": 853 + }, + { + "epoch": 0.10492689519597002, + "grad_norm": 1.647214266136117, + "learning_rate": 2.9997792731924718e-05, + "loss": 0.7465, + "step": 854 + }, + { + "epoch": 0.10504976041282713, + "grad_norm": 1.746357877381484, + "learning_rate": 2.9997680991857744e-05, + "loss": 0.5806, + "step": 855 + }, + { + "epoch": 0.10517262562968424, + "grad_norm": 1.7681333929045144, + "learning_rate": 2.999756649306462e-05, + "loss": 0.6909, + "step": 856 + }, + { + "epoch": 0.10529549084654134, + "grad_norm": 1.8109413092672964, + "learning_rate": 2.99974492355664e-05, + "loss": 0.7324, + "step": 857 + }, + { + "epoch": 0.10541835606339846, + "grad_norm": 1.9006284353907568, + "learning_rate": 2.9997329219384655e-05, + "loss": 0.6463, + "step": 858 + }, + { + "epoch": 0.10554122128025556, + "grad_norm": 1.6675558117732143, + "learning_rate": 2.999720644454146e-05, + "loss": 0.6952, + "step": 859 + }, + { + "epoch": 0.10566408649711266, + "grad_norm": 1.6467466447444385, + "learning_rate": 2.9997080911059402e-05, + "loss": 0.6968, + "step": 860 + }, + { + "epoch": 0.10578695171396978, + "grad_norm": 1.5481236354096752, + "learning_rate": 2.9996952618961567e-05, + "loss": 0.6396, + "step": 861 + }, + { + "epoch": 0.10590981693082688, + "grad_norm": 1.6424131996879718, + "learning_rate": 2.9996821568271563e-05, + "loss": 0.7588, + "step": 862 + }, + { + "epoch": 0.106032682147684, + "grad_norm": 1.710068046896545, + "learning_rate": 2.9996687759013483e-05, + "loss": 0.7198, + "step": 863 + }, + { + "epoch": 0.1061555473645411, + "grad_norm": 1.9602026551495622, + "learning_rate": 2.9996551191211948e-05, + "loss": 0.8309, + "step": 864 + }, + { + "epoch": 0.10627841258139821, + "grad_norm": 1.655791176740972, + "learning_rate": 2.9996411864892078e-05, + "loss": 0.8019, + "step": 865 + }, + { + "epoch": 0.10640127779825531, + "grad_norm": 1.7152286993695074, + "learning_rate": 2.9996269780079497e-05, + "loss": 0.7309, + "step": 866 + }, + { + "epoch": 0.10652414301511243, + "grad_norm": 2.647214705126715, + "learning_rate": 2.999612493680035e-05, + "loss": 0.7219, + "step": 867 + }, + { + "epoch": 0.10664700823196953, + "grad_norm": 1.7121488236087108, + "learning_rate": 2.9995977335081273e-05, + "loss": 0.7539, + "step": 868 + }, + { + "epoch": 0.10676987344882664, + "grad_norm": 1.73865527002815, + "learning_rate": 2.9995826974949413e-05, + "loss": 0.6458, + "step": 869 + }, + { + "epoch": 0.10689273866568375, + "grad_norm": 1.9440280497344198, + "learning_rate": 2.9995673856432436e-05, + "loss": 0.8574, + "step": 870 + }, + { + "epoch": 0.10701560388254085, + "grad_norm": 2.0024570955194614, + "learning_rate": 2.9995517979558503e-05, + "loss": 0.6999, + "step": 871 + }, + { + "epoch": 0.10713846909939796, + "grad_norm": 1.8420198719280292, + "learning_rate": 2.9995359344356287e-05, + "loss": 0.6556, + "step": 872 + }, + { + "epoch": 0.10726133431625506, + 
"grad_norm": 1.8729907704120885, + "learning_rate": 2.999519795085497e-05, + "loss": 0.6831, + "step": 873 + }, + { + "epoch": 0.10738419953311218, + "grad_norm": 1.8281572248235232, + "learning_rate": 2.9995033799084232e-05, + "loss": 0.7561, + "step": 874 + }, + { + "epoch": 0.10750706474996928, + "grad_norm": 1.5748095458832954, + "learning_rate": 2.999486688907428e-05, + "loss": 0.7263, + "step": 875 + }, + { + "epoch": 0.1076299299668264, + "grad_norm": 1.6073602615538956, + "learning_rate": 2.9994697220855805e-05, + "loss": 0.6249, + "step": 876 + }, + { + "epoch": 0.1077527951836835, + "grad_norm": 1.6200979185301414, + "learning_rate": 2.9994524794460016e-05, + "loss": 0.7218, + "step": 877 + }, + { + "epoch": 0.10787566040054061, + "grad_norm": 1.9287448787643673, + "learning_rate": 2.9994349609918643e-05, + "loss": 0.6491, + "step": 878 + }, + { + "epoch": 0.10799852561739771, + "grad_norm": 1.881379236330819, + "learning_rate": 2.999417166726389e-05, + "loss": 0.7482, + "step": 879 + }, + { + "epoch": 0.10812139083425482, + "grad_norm": 1.739984279366319, + "learning_rate": 2.999399096652851e-05, + "loss": 0.6835, + "step": 880 + }, + { + "epoch": 0.10824425605111193, + "grad_norm": 1.9472959743074811, + "learning_rate": 2.9993807507745725e-05, + "loss": 0.6825, + "step": 881 + }, + { + "epoch": 0.10836712126796903, + "grad_norm": 1.7678619999001417, + "learning_rate": 2.999362129094929e-05, + "loss": 0.7111, + "step": 882 + }, + { + "epoch": 0.10848998648482615, + "grad_norm": 2.0151904741787026, + "learning_rate": 2.9993432316173456e-05, + "loss": 0.8369, + "step": 883 + }, + { + "epoch": 0.10861285170168325, + "grad_norm": 1.7492221387370035, + "learning_rate": 2.999324058345298e-05, + "loss": 0.725, + "step": 884 + }, + { + "epoch": 0.10873571691854036, + "grad_norm": 2.083925099957001, + "learning_rate": 2.9993046092823137e-05, + "loss": 0.8388, + "step": 885 + }, + { + "epoch": 0.10885858213539747, + "grad_norm": 1.9666780029736954, + "learning_rate": 2.9992848844319697e-05, + "loss": 0.6891, + "step": 886 + }, + { + "epoch": 0.10898144735225458, + "grad_norm": 1.922153256619731, + "learning_rate": 2.9992648837978944e-05, + "loss": 0.7401, + "step": 887 + }, + { + "epoch": 0.10910431256911168, + "grad_norm": 2.0004469454196374, + "learning_rate": 2.9992446073837665e-05, + "loss": 0.6496, + "step": 888 + }, + { + "epoch": 0.1092271777859688, + "grad_norm": 1.821791088923992, + "learning_rate": 2.9992240551933163e-05, + "loss": 0.65, + "step": 889 + }, + { + "epoch": 0.1093500430028259, + "grad_norm": 1.8714780627487186, + "learning_rate": 2.9992032272303238e-05, + "loss": 0.7548, + "step": 890 + }, + { + "epoch": 0.109472908219683, + "grad_norm": 1.9173877653152953, + "learning_rate": 2.9991821234986205e-05, + "loss": 0.7176, + "step": 891 + }, + { + "epoch": 0.10959577343654012, + "grad_norm": 1.8324634784926979, + "learning_rate": 2.9991607440020885e-05, + "loss": 0.7643, + "step": 892 + }, + { + "epoch": 0.10971863865339722, + "grad_norm": 1.7626335958280535, + "learning_rate": 2.999139088744659e-05, + "loss": 0.7257, + "step": 893 + }, + { + "epoch": 0.10984150387025433, + "grad_norm": 2.180450643285706, + "learning_rate": 2.9991171577303175e-05, + "loss": 0.741, + "step": 894 + }, + { + "epoch": 0.10996436908711144, + "grad_norm": 2.241402523102751, + "learning_rate": 2.9990949509630964e-05, + "loss": 0.6719, + "step": 895 + }, + { + "epoch": 0.11008723430396855, + "grad_norm": 1.9645271423030781, + "learning_rate": 2.9990724684470814e-05, + "loss": 0.7951, + "step": 
896 + }, + { + "epoch": 0.11021009952082565, + "grad_norm": 2.040076681445501, + "learning_rate": 2.999049710186407e-05, + "loss": 0.7832, + "step": 897 + }, + { + "epoch": 0.11033296473768277, + "grad_norm": 1.8356450729469074, + "learning_rate": 2.9990266761852607e-05, + "loss": 0.6955, + "step": 898 + }, + { + "epoch": 0.11045582995453987, + "grad_norm": 1.8876600941006187, + "learning_rate": 2.9990033664478786e-05, + "loss": 0.5818, + "step": 899 + }, + { + "epoch": 0.11057869517139697, + "grad_norm": 1.6512466669666943, + "learning_rate": 2.9989797809785484e-05, + "loss": 0.7221, + "step": 900 + }, + { + "epoch": 0.11070156038825409, + "grad_norm": 1.9776948762561855, + "learning_rate": 2.998955919781609e-05, + "loss": 0.7435, + "step": 901 + }, + { + "epoch": 0.11082442560511119, + "grad_norm": 1.8931780238287874, + "learning_rate": 2.998931782861449e-05, + "loss": 0.7505, + "step": 902 + }, + { + "epoch": 0.1109472908219683, + "grad_norm": 1.9746881903566404, + "learning_rate": 2.998907370222509e-05, + "loss": 0.7514, + "step": 903 + }, + { + "epoch": 0.1110701560388254, + "grad_norm": 2.2017779563417035, + "learning_rate": 2.9988826818692784e-05, + "loss": 0.8303, + "step": 904 + }, + { + "epoch": 0.11119302125568252, + "grad_norm": 1.640662795406327, + "learning_rate": 2.998857717806299e-05, + "loss": 0.6454, + "step": 905 + }, + { + "epoch": 0.11131588647253962, + "grad_norm": 1.9777465233969052, + "learning_rate": 2.9988324780381633e-05, + "loss": 0.6456, + "step": 906 + }, + { + "epoch": 0.11143875168939674, + "grad_norm": 1.6911123225790752, + "learning_rate": 2.9988069625695134e-05, + "loss": 0.7055, + "step": 907 + }, + { + "epoch": 0.11156161690625384, + "grad_norm": 1.7602750841449315, + "learning_rate": 2.998781171405043e-05, + "loss": 0.6935, + "step": 908 + }, + { + "epoch": 0.11168448212311095, + "grad_norm": 1.6464657669920808, + "learning_rate": 2.9987551045494956e-05, + "loss": 0.6387, + "step": 909 + }, + { + "epoch": 0.11180734733996806, + "grad_norm": 1.7492388462265454, + "learning_rate": 2.998728762007667e-05, + "loss": 0.6234, + "step": 910 + }, + { + "epoch": 0.11193021255682516, + "grad_norm": 1.984834571097876, + "learning_rate": 2.998702143784402e-05, + "loss": 0.7274, + "step": 911 + }, + { + "epoch": 0.11205307777368227, + "grad_norm": 1.786139055153458, + "learning_rate": 2.998675249884597e-05, + "loss": 0.7535, + "step": 912 + }, + { + "epoch": 0.11217594299053937, + "grad_norm": 1.6828445805325252, + "learning_rate": 2.998648080313199e-05, + "loss": 0.6985, + "step": 913 + }, + { + "epoch": 0.11229880820739649, + "grad_norm": 1.924082825152558, + "learning_rate": 2.9986206350752058e-05, + "loss": 0.7258, + "step": 914 + }, + { + "epoch": 0.11242167342425359, + "grad_norm": 1.640439725352474, + "learning_rate": 2.9985929141756655e-05, + "loss": 0.6569, + "step": 915 + }, + { + "epoch": 0.1125445386411107, + "grad_norm": 1.7739376402655125, + "learning_rate": 2.998564917619678e-05, + "loss": 0.6889, + "step": 916 + }, + { + "epoch": 0.11266740385796781, + "grad_norm": 1.9022090529373108, + "learning_rate": 2.9985366454123914e-05, + "loss": 0.7098, + "step": 917 + }, + { + "epoch": 0.11279026907482492, + "grad_norm": 1.7708336598267937, + "learning_rate": 2.9985080975590083e-05, + "loss": 0.6411, + "step": 918 + }, + { + "epoch": 0.11291313429168202, + "grad_norm": 1.7044777167977707, + "learning_rate": 2.998479274064778e-05, + "loss": 0.6384, + "step": 919 + }, + { + "epoch": 0.11303599950853914, + "grad_norm": 1.6527131598758114, + "learning_rate": 
2.9984501749350038e-05, + "loss": 0.6573, + "step": 920 + }, + { + "epoch": 0.11315886472539624, + "grad_norm": 1.6143084602550728, + "learning_rate": 2.9984208001750372e-05, + "loss": 0.7337, + "step": 921 + }, + { + "epoch": 0.11328172994225334, + "grad_norm": 1.945995805855703, + "learning_rate": 2.9983911497902822e-05, + "loss": 0.721, + "step": 922 + }, + { + "epoch": 0.11340459515911046, + "grad_norm": 1.8120202690090428, + "learning_rate": 2.9983612237861927e-05, + "loss": 0.7042, + "step": 923 + }, + { + "epoch": 0.11352746037596756, + "grad_norm": 1.697666466704746, + "learning_rate": 2.998331022168273e-05, + "loss": 0.722, + "step": 924 + }, + { + "epoch": 0.11365032559282467, + "grad_norm": 1.7204181890016, + "learning_rate": 2.9983005449420792e-05, + "loss": 0.7334, + "step": 925 + }, + { + "epoch": 0.11377319080968178, + "grad_norm": 2.034375749946619, + "learning_rate": 2.998269792113217e-05, + "loss": 0.7825, + "step": 926 + }, + { + "epoch": 0.11389605602653889, + "grad_norm": 2.0174048596851946, + "learning_rate": 2.9982387636873428e-05, + "loss": 0.6811, + "step": 927 + }, + { + "epoch": 0.114018921243396, + "grad_norm": 1.831695854404119, + "learning_rate": 2.9982074596701644e-05, + "loss": 0.711, + "step": 928 + }, + { + "epoch": 0.11414178646025311, + "grad_norm": 1.4595843529376773, + "learning_rate": 2.9981758800674404e-05, + "loss": 0.6742, + "step": 929 + }, + { + "epoch": 0.11426465167711021, + "grad_norm": 1.990850231365332, + "learning_rate": 2.9981440248849793e-05, + "loss": 0.7988, + "step": 930 + }, + { + "epoch": 0.11438751689396731, + "grad_norm": 1.65411997569785, + "learning_rate": 2.9981118941286402e-05, + "loss": 0.6542, + "step": 931 + }, + { + "epoch": 0.11451038211082443, + "grad_norm": 2.0241758170345987, + "learning_rate": 2.9980794878043338e-05, + "loss": 0.7369, + "step": 932 + }, + { + "epoch": 0.11463324732768153, + "grad_norm": 2.033522992646661, + "learning_rate": 2.9980468059180215e-05, + "loss": 0.7993, + "step": 933 + }, + { + "epoch": 0.11475611254453864, + "grad_norm": 1.6687471032166814, + "learning_rate": 2.9980138484757137e-05, + "loss": 0.6607, + "step": 934 + }, + { + "epoch": 0.11487897776139575, + "grad_norm": 1.800387535371971, + "learning_rate": 2.9979806154834743e-05, + "loss": 0.688, + "step": 935 + }, + { + "epoch": 0.11500184297825286, + "grad_norm": 1.5239242450621517, + "learning_rate": 2.9979471069474148e-05, + "loss": 0.5969, + "step": 936 + }, + { + "epoch": 0.11512470819510996, + "grad_norm": 2.0392172473699626, + "learning_rate": 2.9979133228736998e-05, + "loss": 0.7098, + "step": 937 + }, + { + "epoch": 0.11524757341196708, + "grad_norm": 1.6093785620592955, + "learning_rate": 2.997879263268543e-05, + "loss": 0.6024, + "step": 938 + }, + { + "epoch": 0.11537043862882418, + "grad_norm": 1.623260960923209, + "learning_rate": 2.99784492813821e-05, + "loss": 0.6335, + "step": 939 + }, + { + "epoch": 0.1154933038456813, + "grad_norm": 1.9313604260050343, + "learning_rate": 2.9978103174890167e-05, + "loss": 0.7909, + "step": 940 + }, + { + "epoch": 0.1156161690625384, + "grad_norm": 1.9559459867999844, + "learning_rate": 2.9977754313273286e-05, + "loss": 0.6451, + "step": 941 + }, + { + "epoch": 0.1157390342793955, + "grad_norm": 1.8356358611081436, + "learning_rate": 2.9977402696595638e-05, + "loss": 0.6963, + "step": 942 + }, + { + "epoch": 0.11586189949625261, + "grad_norm": 1.7880031501323828, + "learning_rate": 2.9977048324921895e-05, + "loss": 0.7626, + "step": 943 + }, + { + "epoch": 0.11598476471310971, + 
"grad_norm": 2.2681437508666114, + "learning_rate": 2.997669119831724e-05, + "loss": 0.7904, + "step": 944 + }, + { + "epoch": 0.11610762992996683, + "grad_norm": 1.715961806485397, + "learning_rate": 2.9976331316847366e-05, + "loss": 0.6347, + "step": 945 + }, + { + "epoch": 0.11623049514682393, + "grad_norm": 1.730114264100273, + "learning_rate": 2.9975968680578472e-05, + "loss": 0.6577, + "step": 946 + }, + { + "epoch": 0.11635336036368105, + "grad_norm": 1.6734098550637142, + "learning_rate": 2.997560328957726e-05, + "loss": 0.9078, + "step": 947 + }, + { + "epoch": 0.11647622558053815, + "grad_norm": 1.6747659062551452, + "learning_rate": 2.9975235143910945e-05, + "loss": 0.6541, + "step": 948 + }, + { + "epoch": 0.11659909079739526, + "grad_norm": 1.6665144517418158, + "learning_rate": 2.9974864243647243e-05, + "loss": 0.666, + "step": 949 + }, + { + "epoch": 0.11672195601425236, + "grad_norm": 2.2224752994394987, + "learning_rate": 2.997449058885438e-05, + "loss": 0.7344, + "step": 950 + }, + { + "epoch": 0.11684482123110947, + "grad_norm": 1.7848051308519954, + "learning_rate": 2.9974114179601085e-05, + "loss": 0.792, + "step": 951 + }, + { + "epoch": 0.11696768644796658, + "grad_norm": 1.7045958162640003, + "learning_rate": 2.9973735015956596e-05, + "loss": 0.628, + "step": 952 + }, + { + "epoch": 0.11709055166482368, + "grad_norm": 1.4962538422161107, + "learning_rate": 2.9973353097990665e-05, + "loss": 0.7212, + "step": 953 + }, + { + "epoch": 0.1172134168816808, + "grad_norm": 1.7421943900398063, + "learning_rate": 2.997296842577353e-05, + "loss": 0.6899, + "step": 954 + }, + { + "epoch": 0.1173362820985379, + "grad_norm": 1.666832408102903, + "learning_rate": 2.9972580999375957e-05, + "loss": 0.8394, + "step": 955 + }, + { + "epoch": 0.11745914731539502, + "grad_norm": 1.660293458017513, + "learning_rate": 2.997219081886921e-05, + "loss": 0.704, + "step": 956 + }, + { + "epoch": 0.11758201253225212, + "grad_norm": 1.500870963294999, + "learning_rate": 2.9971797884325062e-05, + "loss": 0.6691, + "step": 957 + }, + { + "epoch": 0.11770487774910923, + "grad_norm": 1.6090890054970115, + "learning_rate": 2.997140219581579e-05, + "loss": 0.7163, + "step": 958 + }, + { + "epoch": 0.11782774296596633, + "grad_norm": 1.9504343035537988, + "learning_rate": 2.9971003753414173e-05, + "loss": 0.7751, + "step": 959 + }, + { + "epoch": 0.11795060818282345, + "grad_norm": 1.7718814833191252, + "learning_rate": 2.997060255719351e-05, + "loss": 0.727, + "step": 960 + }, + { + "epoch": 0.11807347339968055, + "grad_norm": 1.7819396884821375, + "learning_rate": 2.997019860722759e-05, + "loss": 0.8704, + "step": 961 + }, + { + "epoch": 0.11819633861653765, + "grad_norm": 1.7552512381599663, + "learning_rate": 2.9969791903590727e-05, + "loss": 0.8173, + "step": 962 + }, + { + "epoch": 0.11831920383339477, + "grad_norm": 1.6195587659864026, + "learning_rate": 2.996938244635772e-05, + "loss": 0.6907, + "step": 963 + }, + { + "epoch": 0.11844206905025187, + "grad_norm": 1.6399874256573652, + "learning_rate": 2.9968970235603897e-05, + "loss": 0.674, + "step": 964 + }, + { + "epoch": 0.11856493426710898, + "grad_norm": 1.5807111758291847, + "learning_rate": 2.996855527140507e-05, + "loss": 0.7625, + "step": 965 + }, + { + "epoch": 0.11868779948396609, + "grad_norm": 1.6518682404897127, + "learning_rate": 2.9968137553837578e-05, + "loss": 0.6313, + "step": 966 + }, + { + "epoch": 0.1188106647008232, + "grad_norm": 1.5393445640587344, + "learning_rate": 2.996771708297826e-05, + "loss": 0.6983, + "step": 
967 + }, + { + "epoch": 0.1189335299176803, + "grad_norm": 1.641498477331313, + "learning_rate": 2.9967293858904447e-05, + "loss": 0.7421, + "step": 968 + }, + { + "epoch": 0.11905639513453742, + "grad_norm": 1.5795100070765273, + "learning_rate": 2.9966867881693995e-05, + "loss": 0.7488, + "step": 969 + }, + { + "epoch": 0.11917926035139452, + "grad_norm": 1.535289105314938, + "learning_rate": 2.996643915142526e-05, + "loss": 0.6455, + "step": 970 + }, + { + "epoch": 0.11930212556825162, + "grad_norm": 1.5398875949981876, + "learning_rate": 2.9966007668177112e-05, + "loss": 0.7271, + "step": 971 + }, + { + "epoch": 0.11942499078510874, + "grad_norm": 1.597661943186351, + "learning_rate": 2.9965573432028907e-05, + "loss": 0.8128, + "step": 972 + }, + { + "epoch": 0.11954785600196584, + "grad_norm": 1.874274188184879, + "learning_rate": 2.9965136443060523e-05, + "loss": 0.7607, + "step": 973 + }, + { + "epoch": 0.11967072121882295, + "grad_norm": 2.2633525574357822, + "learning_rate": 2.9964696701352337e-05, + "loss": 0.7755, + "step": 974 + }, + { + "epoch": 0.11979358643568006, + "grad_norm": 1.9163759652229604, + "learning_rate": 2.9964254206985248e-05, + "loss": 0.6826, + "step": 975 + }, + { + "epoch": 0.11991645165253717, + "grad_norm": 1.5511171572089466, + "learning_rate": 2.9963808960040645e-05, + "loss": 0.6637, + "step": 976 + }, + { + "epoch": 0.12003931686939427, + "grad_norm": 2.034460886112864, + "learning_rate": 2.9963360960600427e-05, + "loss": 0.662, + "step": 977 + }, + { + "epoch": 0.12016218208625139, + "grad_norm": 1.4904389641958855, + "learning_rate": 2.9962910208747e-05, + "loss": 0.6058, + "step": 978 + }, + { + "epoch": 0.12028504730310849, + "grad_norm": 2.086325559344193, + "learning_rate": 2.996245670456328e-05, + "loss": 0.7161, + "step": 979 + }, + { + "epoch": 0.1204079125199656, + "grad_norm": 1.8940147833476066, + "learning_rate": 2.996200044813268e-05, + "loss": 0.6574, + "step": 980 + }, + { + "epoch": 0.1205307777368227, + "grad_norm": 1.710189084168831, + "learning_rate": 2.996154143953913e-05, + "loss": 0.5656, + "step": 981 + }, + { + "epoch": 0.12065364295367981, + "grad_norm": 1.5260725063145748, + "learning_rate": 2.9961079678867064e-05, + "loss": 0.7211, + "step": 982 + }, + { + "epoch": 0.12077650817053692, + "grad_norm": 1.5371099229531677, + "learning_rate": 2.9960615166201417e-05, + "loss": 0.6258, + "step": 983 + }, + { + "epoch": 0.12089937338739402, + "grad_norm": 1.6815215739851217, + "learning_rate": 2.996014790162763e-05, + "loss": 0.5639, + "step": 984 + }, + { + "epoch": 0.12102223860425114, + "grad_norm": 1.6069953840523878, + "learning_rate": 2.995967788523166e-05, + "loss": 0.7785, + "step": 985 + }, + { + "epoch": 0.12114510382110824, + "grad_norm": 1.729038782250289, + "learning_rate": 2.995920511709996e-05, + "loss": 0.6434, + "step": 986 + }, + { + "epoch": 0.12126796903796536, + "grad_norm": 1.9795436135216768, + "learning_rate": 2.995872959731949e-05, + "loss": 0.7228, + "step": 987 + }, + { + "epoch": 0.12139083425482246, + "grad_norm": 1.7353020741796659, + "learning_rate": 2.9958251325977726e-05, + "loss": 0.646, + "step": 988 + }, + { + "epoch": 0.12151369947167957, + "grad_norm": 1.69620004842621, + "learning_rate": 2.9957770303162634e-05, + "loss": 0.6902, + "step": 989 + }, + { + "epoch": 0.12163656468853667, + "grad_norm": 1.471739502127223, + "learning_rate": 2.99572865289627e-05, + "loss": 0.6312, + "step": 990 + }, + { + "epoch": 0.12175942990539378, + "grad_norm": 1.937543293883123, + "learning_rate": 
2.995680000346691e-05, + "loss": 0.6524, + "step": 991 + }, + { + "epoch": 0.12188229512225089, + "grad_norm": 1.7123279740455752, + "learning_rate": 2.995631072676476e-05, + "loss": 0.7177, + "step": 992 + }, + { + "epoch": 0.122005160339108, + "grad_norm": 1.5703044853222123, + "learning_rate": 2.9955818698946243e-05, + "loss": 0.7582, + "step": 993 + }, + { + "epoch": 0.12212802555596511, + "grad_norm": 1.6270738289526687, + "learning_rate": 2.9955323920101872e-05, + "loss": 0.6279, + "step": 994 + }, + { + "epoch": 0.12225089077282221, + "grad_norm": 1.7028831799280446, + "learning_rate": 2.9954826390322653e-05, + "loss": 0.6748, + "step": 995 + }, + { + "epoch": 0.12237375598967933, + "grad_norm": 1.5939771887037102, + "learning_rate": 2.9954326109700104e-05, + "loss": 0.632, + "step": 996 + }, + { + "epoch": 0.12249662120653643, + "grad_norm": 1.6619543156824201, + "learning_rate": 2.9953823078326252e-05, + "loss": 0.5534, + "step": 997 + }, + { + "epoch": 0.12261948642339354, + "grad_norm": 1.6257406992276335, + "learning_rate": 2.995331729629362e-05, + "loss": 0.7374, + "step": 998 + }, + { + "epoch": 0.12274235164025064, + "grad_norm": 1.9853691962915836, + "learning_rate": 2.9952808763695247e-05, + "loss": 0.7009, + "step": 999 + }, + { + "epoch": 0.12286521685710776, + "grad_norm": 1.6249542899389606, + "learning_rate": 2.9952297480624678e-05, + "loss": 0.7002, + "step": 1000 + }, + { + "epoch": 0.12298808207396486, + "grad_norm": 1.5778414431899186, + "learning_rate": 2.9951783447175954e-05, + "loss": 0.6402, + "step": 1001 + }, + { + "epoch": 0.12311094729082196, + "grad_norm": 2.0874593486187107, + "learning_rate": 2.9951266663443634e-05, + "loss": 0.7527, + "step": 1002 + }, + { + "epoch": 0.12323381250767908, + "grad_norm": 1.9236355685092414, + "learning_rate": 2.995074712952277e-05, + "loss": 0.6869, + "step": 1003 + }, + { + "epoch": 0.12335667772453618, + "grad_norm": 2.1163180732001603, + "learning_rate": 2.995022484550893e-05, + "loss": 0.7183, + "step": 1004 + }, + { + "epoch": 0.1234795429413933, + "grad_norm": 1.5879735405788065, + "learning_rate": 2.994969981149819e-05, + "loss": 0.7409, + "step": 1005 + }, + { + "epoch": 0.1236024081582504, + "grad_norm": 1.6235846789859358, + "learning_rate": 2.9949172027587116e-05, + "loss": 0.6656, + "step": 1006 + }, + { + "epoch": 0.12372527337510751, + "grad_norm": 1.6187951425034859, + "learning_rate": 2.99486414938728e-05, + "loss": 0.8049, + "step": 1007 + }, + { + "epoch": 0.12384813859196461, + "grad_norm": 2.090461465668652, + "learning_rate": 2.9948108210452824e-05, + "loss": 0.656, + "step": 1008 + }, + { + "epoch": 0.12397100380882173, + "grad_norm": 1.8696158686109474, + "learning_rate": 2.9947572177425285e-05, + "loss": 0.6037, + "step": 1009 + }, + { + "epoch": 0.12409386902567883, + "grad_norm": 1.6716225834339644, + "learning_rate": 2.9947033394888786e-05, + "loss": 0.7694, + "step": 1010 + }, + { + "epoch": 0.12421673424253594, + "grad_norm": 1.868196850506318, + "learning_rate": 2.9946491862942426e-05, + "loss": 0.677, + "step": 1011 + }, + { + "epoch": 0.12433959945939305, + "grad_norm": 1.6943817245924655, + "learning_rate": 2.994594758168582e-05, + "loss": 0.6645, + "step": 1012 + }, + { + "epoch": 0.12446246467625015, + "grad_norm": 2.3596853645259968, + "learning_rate": 2.9945400551219084e-05, + "loss": 0.769, + "step": 1013 + }, + { + "epoch": 0.12458532989310726, + "grad_norm": 1.7647279657083175, + "learning_rate": 2.9944850771642843e-05, + "loss": 0.6843, + "step": 1014 + }, + { + "epoch": 
0.12470819510996436, + "grad_norm": 2.001597647305168, + "learning_rate": 2.9944298243058217e-05, + "loss": 0.8216, + "step": 1015 + }, + { + "epoch": 0.12483106032682148, + "grad_norm": 2.4293765027399092, + "learning_rate": 2.9943742965566854e-05, + "loss": 0.7272, + "step": 1016 + }, + { + "epoch": 0.12495392554367858, + "grad_norm": 2.0057682712893836, + "learning_rate": 2.9943184939270882e-05, + "loss": 0.7424, + "step": 1017 + }, + { + "epoch": 0.12507679076053568, + "grad_norm": 1.8114123374825029, + "learning_rate": 2.994262416427295e-05, + "loss": 0.7571, + "step": 1018 + }, + { + "epoch": 0.1251996559773928, + "grad_norm": 1.7857596790370338, + "learning_rate": 2.994206064067621e-05, + "loss": 0.6938, + "step": 1019 + }, + { + "epoch": 0.12532252119424991, + "grad_norm": 1.8504140600909118, + "learning_rate": 2.994149436858432e-05, + "loss": 0.7713, + "step": 1020 + }, + { + "epoch": 0.12544538641110703, + "grad_norm": 1.7834403693247516, + "learning_rate": 2.994092534810144e-05, + "loss": 0.65, + "step": 1021 + }, + { + "epoch": 0.12556825162796412, + "grad_norm": 1.7229019313668377, + "learning_rate": 2.9940353579332233e-05, + "loss": 0.6635, + "step": 1022 + }, + { + "epoch": 0.12569111684482123, + "grad_norm": 1.8409393907020477, + "learning_rate": 2.9939779062381876e-05, + "loss": 0.7243, + "step": 1023 + }, + { + "epoch": 0.12581398206167835, + "grad_norm": 1.5963612283430768, + "learning_rate": 2.9939201797356053e-05, + "loss": 0.6832, + "step": 1024 + }, + { + "epoch": 0.12593684727853544, + "grad_norm": 1.6393220706638125, + "learning_rate": 2.993862178436094e-05, + "loss": 0.6668, + "step": 1025 + }, + { + "epoch": 0.12605971249539255, + "grad_norm": 1.6078984246283192, + "learning_rate": 2.9938039023503233e-05, + "loss": 0.7442, + "step": 1026 + }, + { + "epoch": 0.12618257771224967, + "grad_norm": 1.7665842136358443, + "learning_rate": 2.9937453514890123e-05, + "loss": 0.7841, + "step": 1027 + }, + { + "epoch": 0.12630544292910678, + "grad_norm": 1.4864408591818952, + "learning_rate": 2.9936865258629312e-05, + "loss": 0.6271, + "step": 1028 + }, + { + "epoch": 0.12642830814596387, + "grad_norm": 1.5184504569936368, + "learning_rate": 2.9936274254829007e-05, + "loss": 0.6183, + "step": 1029 + }, + { + "epoch": 0.12655117336282098, + "grad_norm": 1.7832266201503009, + "learning_rate": 2.9935680503597917e-05, + "loss": 0.651, + "step": 1030 + }, + { + "epoch": 0.1266740385796781, + "grad_norm": 1.9398792647983572, + "learning_rate": 2.993508400504526e-05, + "loss": 0.7072, + "step": 1031 + }, + { + "epoch": 0.1267969037965352, + "grad_norm": 1.5983712281493048, + "learning_rate": 2.9934484759280756e-05, + "loss": 0.7195, + "step": 1032 + }, + { + "epoch": 0.1269197690133923, + "grad_norm": 1.591340307152886, + "learning_rate": 2.9933882766414634e-05, + "loss": 0.6022, + "step": 1033 + }, + { + "epoch": 0.12704263423024942, + "grad_norm": 1.8092691577786109, + "learning_rate": 2.9933278026557627e-05, + "loss": 0.8369, + "step": 1034 + }, + { + "epoch": 0.12716549944710653, + "grad_norm": 1.639598330700463, + "learning_rate": 2.9932670539820975e-05, + "loss": 0.6061, + "step": 1035 + }, + { + "epoch": 0.12728836466396362, + "grad_norm": 1.736962784004616, + "learning_rate": 2.9932060306316416e-05, + "loss": 0.7568, + "step": 1036 + }, + { + "epoch": 0.12741122988082074, + "grad_norm": 1.6565201366898092, + "learning_rate": 2.9931447326156204e-05, + "loss": 0.594, + "step": 1037 + }, + { + "epoch": 0.12753409509767785, + "grad_norm": 1.7033835861610587, + "learning_rate": 
2.9930831599453087e-05, + "loss": 0.7553, + "step": 1038 + }, + { + "epoch": 0.12765696031453497, + "grad_norm": 1.79384836041842, + "learning_rate": 2.9930213126320333e-05, + "loss": 0.659, + "step": 1039 + }, + { + "epoch": 0.12777982553139206, + "grad_norm": 1.7769733738955777, + "learning_rate": 2.9929591906871696e-05, + "loss": 0.7507, + "step": 1040 + }, + { + "epoch": 0.12790269074824917, + "grad_norm": 1.995675685753367, + "learning_rate": 2.992896794122145e-05, + "loss": 0.75, + "step": 1041 + }, + { + "epoch": 0.12802555596510629, + "grad_norm": 1.6647032382231328, + "learning_rate": 2.992834122948437e-05, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.12814842118196337, + "grad_norm": 1.785718702120485, + "learning_rate": 2.9927711771775735e-05, + "loss": 0.7239, + "step": 1043 + }, + { + "epoch": 0.1282712863988205, + "grad_norm": 1.655665154994619, + "learning_rate": 2.9927079568211327e-05, + "loss": 0.708, + "step": 1044 + }, + { + "epoch": 0.1283941516156776, + "grad_norm": 1.5490913307762182, + "learning_rate": 2.9926444618907438e-05, + "loss": 0.6317, + "step": 1045 + }, + { + "epoch": 0.12851701683253472, + "grad_norm": 1.900604893061953, + "learning_rate": 2.9925806923980863e-05, + "loss": 0.8069, + "step": 1046 + }, + { + "epoch": 0.1286398820493918, + "grad_norm": 1.8049158843087787, + "learning_rate": 2.9925166483548903e-05, + "loss": 0.7032, + "step": 1047 + }, + { + "epoch": 0.12876274726624892, + "grad_norm": 1.3282479359829484, + "learning_rate": 2.9924523297729358e-05, + "loss": 0.6359, + "step": 1048 + }, + { + "epoch": 0.12888561248310604, + "grad_norm": 1.7681151167012679, + "learning_rate": 2.9923877366640543e-05, + "loss": 0.6992, + "step": 1049 + }, + { + "epoch": 0.12900847769996315, + "grad_norm": 2.2170000762704545, + "learning_rate": 2.9923228690401273e-05, + "loss": 0.7693, + "step": 1050 + }, + { + "epoch": 0.12913134291682024, + "grad_norm": 1.8092401809576655, + "learning_rate": 2.9922577269130866e-05, + "loss": 0.7706, + "step": 1051 + }, + { + "epoch": 0.12925420813367736, + "grad_norm": 1.7624374533037472, + "learning_rate": 2.9921923102949142e-05, + "loss": 0.6787, + "step": 1052 + }, + { + "epoch": 0.12937707335053447, + "grad_norm": 1.8843453181944527, + "learning_rate": 2.992126619197644e-05, + "loss": 0.7445, + "step": 1053 + }, + { + "epoch": 0.12949993856739156, + "grad_norm": 1.6128247249951075, + "learning_rate": 2.9920606536333587e-05, + "loss": 0.6856, + "step": 1054 + }, + { + "epoch": 0.12962280378424867, + "grad_norm": 1.5574081506399267, + "learning_rate": 2.991994413614193e-05, + "loss": 0.7239, + "step": 1055 + }, + { + "epoch": 0.1297456690011058, + "grad_norm": 1.5450313028733766, + "learning_rate": 2.9919278991523307e-05, + "loss": 0.7535, + "step": 1056 + }, + { + "epoch": 0.1298685342179629, + "grad_norm": 1.7127214724111612, + "learning_rate": 2.9918611102600066e-05, + "loss": 0.6009, + "step": 1057 + }, + { + "epoch": 0.12999139943482, + "grad_norm": 1.6257758853019078, + "learning_rate": 2.9917940469495065e-05, + "loss": 0.7182, + "step": 1058 + }, + { + "epoch": 0.1301142646516771, + "grad_norm": 1.583824872117485, + "learning_rate": 2.9917267092331655e-05, + "loss": 0.5854, + "step": 1059 + }, + { + "epoch": 0.13023712986853422, + "grad_norm": 1.469559497269179, + "learning_rate": 2.9916590971233714e-05, + "loss": 0.6542, + "step": 1060 + }, + { + "epoch": 0.13035999508539134, + "grad_norm": 1.5862365184085885, + "learning_rate": 2.99159121063256e-05, + "loss": 0.729, + "step": 1061 + }, + { + "epoch": 
0.13048286030224843, + "grad_norm": 1.5545005513403674, + "learning_rate": 2.991523049773218e-05, + "loss": 0.6382, + "step": 1062 + }, + { + "epoch": 0.13060572551910554, + "grad_norm": 1.503068933204754, + "learning_rate": 2.9914546145578843e-05, + "loss": 0.7352, + "step": 1063 + }, + { + "epoch": 0.13072859073596266, + "grad_norm": 1.9106083474133921, + "learning_rate": 2.9913859049991464e-05, + "loss": 0.7455, + "step": 1064 + }, + { + "epoch": 0.13085145595281975, + "grad_norm": 1.8363870822732362, + "learning_rate": 2.991316921109644e-05, + "loss": 0.7338, + "step": 1065 + }, + { + "epoch": 0.13097432116967686, + "grad_norm": 1.5685336737116835, + "learning_rate": 2.991247662902065e-05, + "loss": 0.6975, + "step": 1066 + }, + { + "epoch": 0.13109718638653398, + "grad_norm": 1.6261019595969362, + "learning_rate": 2.9911781303891493e-05, + "loss": 0.6532, + "step": 1067 + }, + { + "epoch": 0.1312200516033911, + "grad_norm": 1.6649994992870834, + "learning_rate": 2.9911083235836872e-05, + "loss": 0.7355, + "step": 1068 + }, + { + "epoch": 0.13134291682024818, + "grad_norm": 1.7134890650926717, + "learning_rate": 2.9910382424985196e-05, + "loss": 0.758, + "step": 1069 + }, + { + "epoch": 0.1314657820371053, + "grad_norm": 1.5529677249782339, + "learning_rate": 2.9909678871465368e-05, + "loss": 0.7866, + "step": 1070 + }, + { + "epoch": 0.1315886472539624, + "grad_norm": 2.1087056801190935, + "learning_rate": 2.9908972575406802e-05, + "loss": 0.6364, + "step": 1071 + }, + { + "epoch": 0.1317115124708195, + "grad_norm": 1.6490759630849179, + "learning_rate": 2.990826353693942e-05, + "loss": 0.7244, + "step": 1072 + }, + { + "epoch": 0.1318343776876766, + "grad_norm": 1.5253849343391495, + "learning_rate": 2.9907551756193645e-05, + "loss": 0.662, + "step": 1073 + }, + { + "epoch": 0.13195724290453373, + "grad_norm": 1.5997971327964589, + "learning_rate": 2.9906837233300403e-05, + "loss": 0.7599, + "step": 1074 + }, + { + "epoch": 0.13208010812139084, + "grad_norm": 1.685469445928351, + "learning_rate": 2.9906119968391125e-05, + "loss": 0.7296, + "step": 1075 + }, + { + "epoch": 0.13220297333824793, + "grad_norm": 1.4826514386477891, + "learning_rate": 2.990539996159775e-05, + "loss": 0.6762, + "step": 1076 + }, + { + "epoch": 0.13232583855510505, + "grad_norm": 1.6491923179873533, + "learning_rate": 2.9904677213052712e-05, + "loss": 0.6171, + "step": 1077 + }, + { + "epoch": 0.13244870377196216, + "grad_norm": 1.7221441488332943, + "learning_rate": 2.990395172288897e-05, + "loss": 0.7139, + "step": 1078 + }, + { + "epoch": 0.13257156898881928, + "grad_norm": 1.75224878860636, + "learning_rate": 2.9903223491239958e-05, + "loss": 0.734, + "step": 1079 + }, + { + "epoch": 0.13269443420567636, + "grad_norm": 1.5175572099141303, + "learning_rate": 2.9902492518239638e-05, + "loss": 0.6105, + "step": 1080 + }, + { + "epoch": 0.13281729942253348, + "grad_norm": 1.7041409133501713, + "learning_rate": 2.990175880402246e-05, + "loss": 0.7258, + "step": 1081 + }, + { + "epoch": 0.1329401646393906, + "grad_norm": 1.5253051442756447, + "learning_rate": 2.9901022348723397e-05, + "loss": 0.718, + "step": 1082 + }, + { + "epoch": 0.13306302985624768, + "grad_norm": 1.520831403060298, + "learning_rate": 2.990028315247791e-05, + "loss": 0.6726, + "step": 1083 + }, + { + "epoch": 0.1331858950731048, + "grad_norm": 1.7374415573273694, + "learning_rate": 2.9899541215421965e-05, + "loss": 0.6715, + "step": 1084 + }, + { + "epoch": 0.13330876028996191, + "grad_norm": 1.774969031330596, + "learning_rate": 
2.9898796537692044e-05, + "loss": 0.7461, + "step": 1085 + }, + { + "epoch": 0.13343162550681903, + "grad_norm": 1.6893695017599883, + "learning_rate": 2.9898049119425118e-05, + "loss": 0.7804, + "step": 1086 + }, + { + "epoch": 0.13355449072367612, + "grad_norm": 1.8280899379724442, + "learning_rate": 2.9897298960758674e-05, + "loss": 0.7282, + "step": 1087 + }, + { + "epoch": 0.13367735594053323, + "grad_norm": 1.5469141291530257, + "learning_rate": 2.98965460618307e-05, + "loss": 0.6232, + "step": 1088 + }, + { + "epoch": 0.13380022115739035, + "grad_norm": 1.6239630377852645, + "learning_rate": 2.989579042277969e-05, + "loss": 0.6922, + "step": 1089 + }, + { + "epoch": 0.13392308637424746, + "grad_norm": 1.5582302830631414, + "learning_rate": 2.9895032043744625e-05, + "loss": 0.7025, + "step": 1090 + }, + { + "epoch": 0.13404595159110455, + "grad_norm": 1.7151813415907224, + "learning_rate": 2.989427092486502e-05, + "loss": 0.8021, + "step": 1091 + }, + { + "epoch": 0.13416881680796167, + "grad_norm": 1.6671315355968632, + "learning_rate": 2.989350706628087e-05, + "loss": 0.619, + "step": 1092 + }, + { + "epoch": 0.13429168202481878, + "grad_norm": 1.5897395470727977, + "learning_rate": 2.9892740468132683e-05, + "loss": 0.7023, + "step": 1093 + }, + { + "epoch": 0.13441454724167587, + "grad_norm": 1.5005597620455335, + "learning_rate": 2.9891971130561467e-05, + "loss": 0.6672, + "step": 1094 + }, + { + "epoch": 0.13453741245853298, + "grad_norm": 1.5826884769046605, + "learning_rate": 2.9891199053708743e-05, + "loss": 0.7526, + "step": 1095 + }, + { + "epoch": 0.1346602776753901, + "grad_norm": 1.6741467416650784, + "learning_rate": 2.9890424237716524e-05, + "loss": 0.652, + "step": 1096 + }, + { + "epoch": 0.13478314289224722, + "grad_norm": 1.614720866942373, + "learning_rate": 2.9889646682727334e-05, + "loss": 0.7233, + "step": 1097 + }, + { + "epoch": 0.1349060081091043, + "grad_norm": 1.852966897322659, + "learning_rate": 2.98888663888842e-05, + "loss": 0.6917, + "step": 1098 + }, + { + "epoch": 0.13502887332596142, + "grad_norm": 1.395848435555885, + "learning_rate": 2.988808335633065e-05, + "loss": 0.6428, + "step": 1099 + }, + { + "epoch": 0.13515173854281853, + "grad_norm": 1.727699178066572, + "learning_rate": 2.988729758521072e-05, + "loss": 0.5772, + "step": 1100 + }, + { + "epoch": 0.13527460375967565, + "grad_norm": 2.360574451364533, + "learning_rate": 2.9886509075668947e-05, + "loss": 0.8123, + "step": 1101 + }, + { + "epoch": 0.13539746897653274, + "grad_norm": 1.7034318099687635, + "learning_rate": 2.9885717827850372e-05, + "loss": 0.8034, + "step": 1102 + }, + { + "epoch": 0.13552033419338985, + "grad_norm": 1.6964938823875, + "learning_rate": 2.988492384190054e-05, + "loss": 0.648, + "step": 1103 + }, + { + "epoch": 0.13564319941024697, + "grad_norm": 1.357080599585336, + "learning_rate": 2.98841271179655e-05, + "loss": 0.7694, + "step": 1104 + }, + { + "epoch": 0.13576606462710405, + "grad_norm": 1.6051874443433634, + "learning_rate": 2.9883327656191806e-05, + "loss": 0.5676, + "step": 1105 + }, + { + "epoch": 0.13588892984396117, + "grad_norm": 1.5996527238503389, + "learning_rate": 2.9882525456726507e-05, + "loss": 0.6148, + "step": 1106 + }, + { + "epoch": 0.13601179506081829, + "grad_norm": 1.7527813375420025, + "learning_rate": 2.988172051971717e-05, + "loss": 0.6786, + "step": 1107 + }, + { + "epoch": 0.1361346602776754, + "grad_norm": 2.1797407390282446, + "learning_rate": 2.988091284531185e-05, + "loss": 0.7661, + "step": 1108 + }, + { + "epoch": 
0.1362575254945325, + "grad_norm": 1.6281417280686319, + "learning_rate": 2.988010243365912e-05, + "loss": 0.6377, + "step": 1109 + }, + { + "epoch": 0.1363803907113896, + "grad_norm": 1.628826675952316, + "learning_rate": 2.9879289284908053e-05, + "loss": 0.7115, + "step": 1110 + }, + { + "epoch": 0.13650325592824672, + "grad_norm": 1.575995587052852, + "learning_rate": 2.9878473399208215e-05, + "loss": 0.761, + "step": 1111 + }, + { + "epoch": 0.13662612114510383, + "grad_norm": 1.5557246073582405, + "learning_rate": 2.9877654776709685e-05, + "loss": 0.7143, + "step": 1112 + }, + { + "epoch": 0.13674898636196092, + "grad_norm": 1.68306350535184, + "learning_rate": 2.9876833417563044e-05, + "loss": 0.7616, + "step": 1113 + }, + { + "epoch": 0.13687185157881804, + "grad_norm": 1.5120456943854803, + "learning_rate": 2.9876009321919372e-05, + "loss": 0.7334, + "step": 1114 + }, + { + "epoch": 0.13699471679567515, + "grad_norm": 2.0993213715121914, + "learning_rate": 2.9875182489930263e-05, + "loss": 0.6747, + "step": 1115 + }, + { + "epoch": 0.13711758201253224, + "grad_norm": 1.5984791951395687, + "learning_rate": 2.98743529217478e-05, + "loss": 0.6757, + "step": 1116 + }, + { + "epoch": 0.13724044722938936, + "grad_norm": 1.6232016639873426, + "learning_rate": 2.9873520617524585e-05, + "loss": 0.7204, + "step": 1117 + }, + { + "epoch": 0.13736331244624647, + "grad_norm": 1.4248520190304972, + "learning_rate": 2.9872685577413712e-05, + "loss": 0.6354, + "step": 1118 + }, + { + "epoch": 0.1374861776631036, + "grad_norm": 1.6957929170662709, + "learning_rate": 2.987184780156878e-05, + "loss": 0.7979, + "step": 1119 + }, + { + "epoch": 0.13760904287996067, + "grad_norm": 1.4160038707855611, + "learning_rate": 2.9871007290143884e-05, + "loss": 0.7727, + "step": 1120 + }, + { + "epoch": 0.1377319080968178, + "grad_norm": 1.714684305341049, + "learning_rate": 2.9870164043293645e-05, + "loss": 0.7741, + "step": 1121 + }, + { + "epoch": 0.1378547733136749, + "grad_norm": 1.7006929161823827, + "learning_rate": 2.9869318061173168e-05, + "loss": 0.6791, + "step": 1122 + }, + { + "epoch": 0.137977638530532, + "grad_norm": 1.607830749630221, + "learning_rate": 2.9868469343938063e-05, + "loss": 0.7489, + "step": 1123 + }, + { + "epoch": 0.1381005037473891, + "grad_norm": 1.6238723271731592, + "learning_rate": 2.9867617891744447e-05, + "loss": 0.5973, + "step": 1124 + }, + { + "epoch": 0.13822336896424622, + "grad_norm": 1.6223529364307367, + "learning_rate": 2.9866763704748938e-05, + "loss": 0.6676, + "step": 1125 + }, + { + "epoch": 0.13834623418110334, + "grad_norm": 1.4303665214802679, + "learning_rate": 2.9865906783108663e-05, + "loss": 0.6366, + "step": 1126 + }, + { + "epoch": 0.13846909939796043, + "grad_norm": 1.5903653314170936, + "learning_rate": 2.9865047126981238e-05, + "loss": 0.6407, + "step": 1127 + }, + { + "epoch": 0.13859196461481754, + "grad_norm": 1.6142191370840326, + "learning_rate": 2.9864184736524808e-05, + "loss": 0.6473, + "step": 1128 + }, + { + "epoch": 0.13871482983167466, + "grad_norm": 1.3842105458389793, + "learning_rate": 2.9863319611897985e-05, + "loss": 0.7453, + "step": 1129 + }, + { + "epoch": 0.13883769504853177, + "grad_norm": 1.5572047301595482, + "learning_rate": 2.9862451753259912e-05, + "loss": 0.5556, + "step": 1130 + }, + { + "epoch": 0.13896056026538886, + "grad_norm": 1.5155458938049342, + "learning_rate": 2.986158116077023e-05, + "loss": 0.6837, + "step": 1131 + }, + { + "epoch": 0.13908342548224598, + "grad_norm": 1.5818650898706743, + "learning_rate": 
2.986070783458907e-05, + "loss": 0.5668, + "step": 1132 + }, + { + "epoch": 0.1392062906991031, + "grad_norm": 1.7291002308753405, + "learning_rate": 2.9859831774877077e-05, + "loss": 0.7332, + "step": 1133 + }, + { + "epoch": 0.13932915591596018, + "grad_norm": 1.3800473069173909, + "learning_rate": 2.9858952981795407e-05, + "loss": 0.7414, + "step": 1134 + }, + { + "epoch": 0.1394520211328173, + "grad_norm": 1.5772211023209295, + "learning_rate": 2.985807145550569e-05, + "loss": 0.6733, + "step": 1135 + }, + { + "epoch": 0.1395748863496744, + "grad_norm": 1.7771990478139945, + "learning_rate": 2.9857187196170093e-05, + "loss": 0.7348, + "step": 1136 + }, + { + "epoch": 0.13969775156653153, + "grad_norm": 1.531975844586977, + "learning_rate": 2.985630020395126e-05, + "loss": 0.6783, + "step": 1137 + }, + { + "epoch": 0.1398206167833886, + "grad_norm": 2.113447093229907, + "learning_rate": 2.9855410479012354e-05, + "loss": 0.7505, + "step": 1138 + }, + { + "epoch": 0.13994348200024573, + "grad_norm": 1.5142104459046328, + "learning_rate": 2.985451802151703e-05, + "loss": 0.6454, + "step": 1139 + }, + { + "epoch": 0.14006634721710284, + "grad_norm": 1.5959559270845487, + "learning_rate": 2.9853622831629448e-05, + "loss": 0.6463, + "step": 1140 + }, + { + "epoch": 0.14018921243395996, + "grad_norm": 1.4290711790573352, + "learning_rate": 2.985272490951428e-05, + "loss": 0.6776, + "step": 1141 + }, + { + "epoch": 0.14031207765081705, + "grad_norm": 1.5117342303363002, + "learning_rate": 2.9851824255336686e-05, + "loss": 0.6445, + "step": 1142 + }, + { + "epoch": 0.14043494286767416, + "grad_norm": 1.741812081110336, + "learning_rate": 2.9850920869262338e-05, + "loss": 0.7343, + "step": 1143 + }, + { + "epoch": 0.14055780808453128, + "grad_norm": 1.7573076206717237, + "learning_rate": 2.9850014751457407e-05, + "loss": 0.7436, + "step": 1144 + }, + { + "epoch": 0.14068067330138836, + "grad_norm": 1.6571594013571993, + "learning_rate": 2.984910590208857e-05, + "loss": 0.7109, + "step": 1145 + }, + { + "epoch": 0.14080353851824548, + "grad_norm": 1.5100474084765798, + "learning_rate": 2.9848194321322996e-05, + "loss": 0.7391, + "step": 1146 + }, + { + "epoch": 0.1409264037351026, + "grad_norm": 1.5865471745601147, + "learning_rate": 2.9847280009328377e-05, + "loss": 0.6929, + "step": 1147 + }, + { + "epoch": 0.1410492689519597, + "grad_norm": 1.7695306423384076, + "learning_rate": 2.9846362966272888e-05, + "loss": 0.7631, + "step": 1148 + }, + { + "epoch": 0.1411721341688168, + "grad_norm": 1.5094838137769047, + "learning_rate": 2.984544319232521e-05, + "loss": 0.7425, + "step": 1149 + }, + { + "epoch": 0.14129499938567391, + "grad_norm": 1.405896089121248, + "learning_rate": 2.9844520687654537e-05, + "loss": 0.6925, + "step": 1150 + }, + { + "epoch": 0.14141786460253103, + "grad_norm": 1.8775933743033086, + "learning_rate": 2.984359545243055e-05, + "loss": 0.8296, + "step": 1151 + }, + { + "epoch": 0.14154072981938814, + "grad_norm": 1.5284700945190077, + "learning_rate": 2.9842667486823446e-05, + "loss": 0.7085, + "step": 1152 + }, + { + "epoch": 0.14166359503624523, + "grad_norm": 1.6149581262704114, + "learning_rate": 2.9841736791003914e-05, + "loss": 0.6629, + "step": 1153 + }, + { + "epoch": 0.14178646025310235, + "grad_norm": 1.6705926070544859, + "learning_rate": 2.9840803365143153e-05, + "loss": 0.6295, + "step": 1154 + }, + { + "epoch": 0.14190932546995946, + "grad_norm": 1.6265851552636075, + "learning_rate": 2.983986720941286e-05, + "loss": 0.5882, + "step": 1155 + }, + { + "epoch": 
0.14203219068681655, + "grad_norm": 1.8213003661624614, + "learning_rate": 2.983892832398523e-05, + "loss": 0.7028, + "step": 1156 + }, + { + "epoch": 0.14215505590367367, + "grad_norm": 1.8238674991249615, + "learning_rate": 2.983798670903297e-05, + "loss": 0.8217, + "step": 1157 + }, + { + "epoch": 0.14227792112053078, + "grad_norm": 1.4786832757343902, + "learning_rate": 2.9837042364729284e-05, + "loss": 0.6697, + "step": 1158 + }, + { + "epoch": 0.1424007863373879, + "grad_norm": 1.5583943344798705, + "learning_rate": 2.9836095291247875e-05, + "loss": 0.8287, + "step": 1159 + }, + { + "epoch": 0.14252365155424498, + "grad_norm": 1.700219130998528, + "learning_rate": 2.9835145488762952e-05, + "loss": 0.6588, + "step": 1160 + }, + { + "epoch": 0.1426465167711021, + "grad_norm": 1.6867726267692873, + "learning_rate": 2.983419295744923e-05, + "loss": 0.8276, + "step": 1161 + }, + { + "epoch": 0.14276938198795922, + "grad_norm": 1.5464045937771351, + "learning_rate": 2.983323769748191e-05, + "loss": 0.7312, + "step": 1162 + }, + { + "epoch": 0.1428922472048163, + "grad_norm": 1.4725598950961054, + "learning_rate": 2.983227970903672e-05, + "loss": 0.7272, + "step": 1163 + }, + { + "epoch": 0.14301511242167342, + "grad_norm": 1.5764167115941043, + "learning_rate": 2.983131899228986e-05, + "loss": 0.7458, + "step": 1164 + }, + { + "epoch": 0.14313797763853053, + "grad_norm": 1.7073788727927754, + "learning_rate": 2.983035554741806e-05, + "loss": 0.7216, + "step": 1165 + }, + { + "epoch": 0.14326084285538765, + "grad_norm": 1.369893090073144, + "learning_rate": 2.9829389374598538e-05, + "loss": 0.5365, + "step": 1166 + }, + { + "epoch": 0.14338370807224474, + "grad_norm": 1.753218224005704, + "learning_rate": 2.982842047400901e-05, + "loss": 0.6553, + "step": 1167 + }, + { + "epoch": 0.14350657328910185, + "grad_norm": 1.6290826751220913, + "learning_rate": 2.9827448845827697e-05, + "loss": 0.7021, + "step": 1168 + }, + { + "epoch": 0.14362943850595897, + "grad_norm": 1.8613743971801529, + "learning_rate": 2.9826474490233337e-05, + "loss": 0.7541, + "step": 1169 + }, + { + "epoch": 0.14375230372281608, + "grad_norm": 1.5388950279651519, + "learning_rate": 2.9825497407405144e-05, + "loss": 0.6148, + "step": 1170 + }, + { + "epoch": 0.14387516893967317, + "grad_norm": 1.8247844825561343, + "learning_rate": 2.982451759752285e-05, + "loss": 0.6611, + "step": 1171 + }, + { + "epoch": 0.14399803415653029, + "grad_norm": 1.8459463799301494, + "learning_rate": 2.982353506076668e-05, + "loss": 0.7517, + "step": 1172 + }, + { + "epoch": 0.1441208993733874, + "grad_norm": 1.7305820071085602, + "learning_rate": 2.9822549797317374e-05, + "loss": 0.6252, + "step": 1173 + }, + { + "epoch": 0.1442437645902445, + "grad_norm": 1.6306274681626463, + "learning_rate": 2.9821561807356158e-05, + "loss": 0.6901, + "step": 1174 + }, + { + "epoch": 0.1443666298071016, + "grad_norm": 1.6475188417039162, + "learning_rate": 2.9820571091064767e-05, + "loss": 0.7643, + "step": 1175 + }, + { + "epoch": 0.14448949502395872, + "grad_norm": 2.157485210174474, + "learning_rate": 2.9819577648625442e-05, + "loss": 0.6939, + "step": 1176 + }, + { + "epoch": 0.14461236024081583, + "grad_norm": 1.7119502689604285, + "learning_rate": 2.981858148022092e-05, + "loss": 0.6844, + "step": 1177 + }, + { + "epoch": 0.14473522545767292, + "grad_norm": 1.8389559260002328, + "learning_rate": 2.9817582586034433e-05, + "loss": 0.5993, + "step": 1178 + }, + { + "epoch": 0.14485809067453004, + "grad_norm": 1.7853063116985455, + "learning_rate": 
2.981658096624972e-05, + "loss": 0.6719, + "step": 1179 + }, + { + "epoch": 0.14498095589138715, + "grad_norm": 1.5264089205767941, + "learning_rate": 2.9815576621051036e-05, + "loss": 0.7524, + "step": 1180 + }, + { + "epoch": 0.14510382110824427, + "grad_norm": 1.6589414063727597, + "learning_rate": 2.9814569550623108e-05, + "loss": 0.6786, + "step": 1181 + }, + { + "epoch": 0.14522668632510136, + "grad_norm": 1.583312604020123, + "learning_rate": 2.981355975515119e-05, + "loss": 0.6543, + "step": 1182 + }, + { + "epoch": 0.14534955154195847, + "grad_norm": 1.753303253691593, + "learning_rate": 2.9812547234821024e-05, + "loss": 0.7135, + "step": 1183 + }, + { + "epoch": 0.1454724167588156, + "grad_norm": 1.4360977477057137, + "learning_rate": 2.981153198981886e-05, + "loss": 0.604, + "step": 1184 + }, + { + "epoch": 0.14559528197567267, + "grad_norm": 1.6191223919459055, + "learning_rate": 2.9810514020331437e-05, + "loss": 0.7221, + "step": 1185 + }, + { + "epoch": 0.1457181471925298, + "grad_norm": 1.9682130471230237, + "learning_rate": 2.980949332654601e-05, + "loss": 0.7947, + "step": 1186 + }, + { + "epoch": 0.1458410124093869, + "grad_norm": 1.872401214267614, + "learning_rate": 2.9808469908650335e-05, + "loss": 0.6328, + "step": 1187 + }, + { + "epoch": 0.14596387762624402, + "grad_norm": 1.5490625447687678, + "learning_rate": 2.980744376683265e-05, + "loss": 0.7051, + "step": 1188 + }, + { + "epoch": 0.1460867428431011, + "grad_norm": 1.7426227546212294, + "learning_rate": 2.9806414901281716e-05, + "loss": 0.6305, + "step": 1189 + }, + { + "epoch": 0.14620960805995822, + "grad_norm": 1.602279188139357, + "learning_rate": 2.9805383312186784e-05, + "loss": 0.6566, + "step": 1190 + }, + { + "epoch": 0.14633247327681534, + "grad_norm": 1.6625325543909981, + "learning_rate": 2.980434899973761e-05, + "loss": 0.7038, + "step": 1191 + }, + { + "epoch": 0.14645533849367245, + "grad_norm": 1.422983208073982, + "learning_rate": 2.9803311964124444e-05, + "loss": 0.606, + "step": 1192 + }, + { + "epoch": 0.14657820371052954, + "grad_norm": 1.2448112809501521, + "learning_rate": 2.9802272205538045e-05, + "loss": 0.6758, + "step": 1193 + }, + { + "epoch": 0.14670106892738666, + "grad_norm": 1.633985917320782, + "learning_rate": 2.980122972416967e-05, + "loss": 0.7712, + "step": 1194 + }, + { + "epoch": 0.14682393414424377, + "grad_norm": 1.9262031182157078, + "learning_rate": 2.980018452021108e-05, + "loss": 0.8191, + "step": 1195 + }, + { + "epoch": 0.14694679936110086, + "grad_norm": 1.8125483833149785, + "learning_rate": 2.9799136593854524e-05, + "loss": 0.5848, + "step": 1196 + }, + { + "epoch": 0.14706966457795798, + "grad_norm": 1.955113153097074, + "learning_rate": 2.979808594529277e-05, + "loss": 0.7724, + "step": 1197 + }, + { + "epoch": 0.1471925297948151, + "grad_norm": 1.5764574389032477, + "learning_rate": 2.979703257471908e-05, + "loss": 0.653, + "step": 1198 + }, + { + "epoch": 0.1473153950116722, + "grad_norm": 1.4581251083745166, + "learning_rate": 2.9795976482327206e-05, + "loss": 0.6443, + "step": 1199 + }, + { + "epoch": 0.1474382602285293, + "grad_norm": 1.5907438929782913, + "learning_rate": 2.979491766831141e-05, + "loss": 0.5955, + "step": 1200 + }, + { + "epoch": 0.1475611254453864, + "grad_norm": 1.8345042417039938, + "learning_rate": 2.9793856132866465e-05, + "loss": 0.7026, + "step": 1201 + }, + { + "epoch": 0.14768399066224353, + "grad_norm": 1.8327276685472635, + "learning_rate": 2.979279187618762e-05, + "loss": 0.7877, + "step": 1202 + }, + { + "epoch": 
0.14780685587910064, + "grad_norm": 1.655776278183064, + "learning_rate": 2.9791724898470646e-05, + "loss": 0.7082, + "step": 1203 + }, + { + "epoch": 0.14792972109595773, + "grad_norm": 1.8710857964958132, + "learning_rate": 2.9790655199911803e-05, + "loss": 0.6639, + "step": 1204 + }, + { + "epoch": 0.14805258631281484, + "grad_norm": 1.5729160823184503, + "learning_rate": 2.978958278070786e-05, + "loss": 0.7187, + "step": 1205 + }, + { + "epoch": 0.14817545152967196, + "grad_norm": 1.7692842183867792, + "learning_rate": 2.9788507641056077e-05, + "loss": 0.6397, + "step": 1206 + }, + { + "epoch": 0.14829831674652905, + "grad_norm": 1.5225726857010475, + "learning_rate": 2.9787429781154216e-05, + "loss": 0.6324, + "step": 1207 + }, + { + "epoch": 0.14842118196338616, + "grad_norm": 1.574101701200964, + "learning_rate": 2.9786349201200554e-05, + "loss": 0.6848, + "step": 1208 + }, + { + "epoch": 0.14854404718024328, + "grad_norm": 1.7350174605080917, + "learning_rate": 2.9785265901393843e-05, + "loss": 0.623, + "step": 1209 + }, + { + "epoch": 0.1486669123971004, + "grad_norm": 1.519916208235255, + "learning_rate": 2.978417988193336e-05, + "loss": 0.6394, + "step": 1210 + }, + { + "epoch": 0.14878977761395748, + "grad_norm": 1.4064763057858765, + "learning_rate": 2.9783091143018862e-05, + "loss": 0.5975, + "step": 1211 + }, + { + "epoch": 0.1489126428308146, + "grad_norm": 1.8710996494560364, + "learning_rate": 2.9781999684850625e-05, + "loss": 0.6911, + "step": 1212 + }, + { + "epoch": 0.1490355080476717, + "grad_norm": 1.5262959561241536, + "learning_rate": 2.9780905507629405e-05, + "loss": 0.6146, + "step": 1213 + }, + { + "epoch": 0.1491583732645288, + "grad_norm": 1.8150959374948743, + "learning_rate": 2.9779808611556478e-05, + "loss": 0.6508, + "step": 1214 + }, + { + "epoch": 0.14928123848138591, + "grad_norm": 1.6406309209669037, + "learning_rate": 2.977870899683361e-05, + "loss": 0.6877, + "step": 1215 + }, + { + "epoch": 0.14940410369824303, + "grad_norm": 1.5686184682656354, + "learning_rate": 2.9777606663663058e-05, + "loss": 0.5543, + "step": 1216 + }, + { + "epoch": 0.14952696891510014, + "grad_norm": 1.440127246322775, + "learning_rate": 2.9776501612247603e-05, + "loss": 0.7569, + "step": 1217 + }, + { + "epoch": 0.14964983413195723, + "grad_norm": 2.027850025331084, + "learning_rate": 2.97753938427905e-05, + "loss": 0.7018, + "step": 1218 + }, + { + "epoch": 0.14977269934881435, + "grad_norm": 1.8733522399494582, + "learning_rate": 2.9774283355495527e-05, + "loss": 0.6699, + "step": 1219 + }, + { + "epoch": 0.14989556456567146, + "grad_norm": 1.6544268863803946, + "learning_rate": 2.9773170150566943e-05, + "loss": 0.7353, + "step": 1220 + }, + { + "epoch": 0.15001842978252858, + "grad_norm": 1.5992461680689807, + "learning_rate": 2.9772054228209514e-05, + "loss": 0.7086, + "step": 1221 + }, + { + "epoch": 0.15014129499938567, + "grad_norm": 1.7465601249987257, + "learning_rate": 2.9770935588628513e-05, + "loss": 0.7106, + "step": 1222 + }, + { + "epoch": 0.15026416021624278, + "grad_norm": 1.4179863590448907, + "learning_rate": 2.9769814232029703e-05, + "loss": 0.739, + "step": 1223 + }, + { + "epoch": 0.1503870254330999, + "grad_norm": 1.5766732914177883, + "learning_rate": 2.976869015861935e-05, + "loss": 0.7044, + "step": 1224 + }, + { + "epoch": 0.15050989064995698, + "grad_norm": 1.5193364312620863, + "learning_rate": 2.976756336860422e-05, + "loss": 0.6617, + "step": 1225 + }, + { + "epoch": 0.1506327558668141, + "grad_norm": 1.8576393134305684, + "learning_rate": 
2.976643386219158e-05, + "loss": 0.7441, + "step": 1226 + }, + { + "epoch": 0.15075562108367122, + "grad_norm": 1.6802947420540455, + "learning_rate": 2.97653016395892e-05, + "loss": 0.5402, + "step": 1227 + }, + { + "epoch": 0.15087848630052833, + "grad_norm": 1.465944010393479, + "learning_rate": 2.9764166701005334e-05, + "loss": 0.6746, + "step": 1228 + }, + { + "epoch": 0.15100135151738542, + "grad_norm": 1.5726405831910284, + "learning_rate": 2.9763029046648753e-05, + "loss": 0.5916, + "step": 1229 + }, + { + "epoch": 0.15112421673424253, + "grad_norm": 1.5494649880831801, + "learning_rate": 2.976188867672872e-05, + "loss": 0.5809, + "step": 1230 + }, + { + "epoch": 0.15124708195109965, + "grad_norm": 1.812806697714595, + "learning_rate": 2.9760745591455e-05, + "loss": 0.7011, + "step": 1231 + }, + { + "epoch": 0.15136994716795676, + "grad_norm": 1.6085733910007172, + "learning_rate": 2.9759599791037855e-05, + "loss": 0.6369, + "step": 1232 + }, + { + "epoch": 0.15149281238481385, + "grad_norm": 1.6843380470488998, + "learning_rate": 2.9758451275688044e-05, + "loss": 0.6993, + "step": 1233 + }, + { + "epoch": 0.15161567760167097, + "grad_norm": 1.685937370061377, + "learning_rate": 2.975730004561684e-05, + "loss": 0.7194, + "step": 1234 + }, + { + "epoch": 0.15173854281852808, + "grad_norm": 1.8206571296468042, + "learning_rate": 2.9756146101035995e-05, + "loss": 0.7139, + "step": 1235 + }, + { + "epoch": 0.15186140803538517, + "grad_norm": 1.8174694587224753, + "learning_rate": 2.9754989442157767e-05, + "loss": 0.7365, + "step": 1236 + }, + { + "epoch": 0.15198427325224229, + "grad_norm": 2.0524678961573075, + "learning_rate": 2.9753830069194926e-05, + "loss": 0.7248, + "step": 1237 + }, + { + "epoch": 0.1521071384690994, + "grad_norm": 1.6973357879228583, + "learning_rate": 2.9752667982360725e-05, + "loss": 0.6457, + "step": 1238 + }, + { + "epoch": 0.15223000368595652, + "grad_norm": 1.8660744581845183, + "learning_rate": 2.975150318186892e-05, + "loss": 0.6815, + "step": 1239 + }, + { + "epoch": 0.1523528689028136, + "grad_norm": 1.8230493721011525, + "learning_rate": 2.9750335667933775e-05, + "loss": 0.7618, + "step": 1240 + }, + { + "epoch": 0.15247573411967072, + "grad_norm": 1.7556107481799574, + "learning_rate": 2.9749165440770037e-05, + "loss": 0.7147, + "step": 1241 + }, + { + "epoch": 0.15259859933652783, + "grad_norm": 1.6465963824374081, + "learning_rate": 2.9747992500592977e-05, + "loss": 0.6343, + "step": 1242 + }, + { + "epoch": 0.15272146455338495, + "grad_norm": 1.5166542104554002, + "learning_rate": 2.9746816847618333e-05, + "loss": 0.6261, + "step": 1243 + }, + { + "epoch": 0.15284432977024204, + "grad_norm": 1.5597973401527465, + "learning_rate": 2.974563848206237e-05, + "loss": 0.6809, + "step": 1244 + }, + { + "epoch": 0.15296719498709915, + "grad_norm": 1.7195669806365428, + "learning_rate": 2.9744457404141837e-05, + "loss": 0.6773, + "step": 1245 + }, + { + "epoch": 0.15309006020395627, + "grad_norm": 1.6324555019739144, + "learning_rate": 2.9743273614073987e-05, + "loss": 0.6696, + "step": 1246 + }, + { + "epoch": 0.15321292542081336, + "grad_norm": 1.6042525455540455, + "learning_rate": 2.974208711207657e-05, + "loss": 0.6072, + "step": 1247 + }, + { + "epoch": 0.15333579063767047, + "grad_norm": 1.3472970887532731, + "learning_rate": 2.9740897898367827e-05, + "loss": 0.6372, + "step": 1248 + }, + { + "epoch": 0.1534586558545276, + "grad_norm": 1.6303594470143339, + "learning_rate": 2.973970597316652e-05, + "loss": 0.6902, + "step": 1249 + }, + { + "epoch": 
0.1535815210713847, + "grad_norm": 1.8559133738857785, + "learning_rate": 2.9738511336691887e-05, + "loss": 0.7459, + "step": 1250 + }, + { + "epoch": 0.1537043862882418, + "grad_norm": 1.5022320996995606, + "learning_rate": 2.973731398916368e-05, + "loss": 0.6402, + "step": 1251 + }, + { + "epoch": 0.1538272515050989, + "grad_norm": 1.5781780439044837, + "learning_rate": 2.9736113930802134e-05, + "loss": 0.7471, + "step": 1252 + }, + { + "epoch": 0.15395011672195602, + "grad_norm": 1.919848691796749, + "learning_rate": 2.9734911161828e-05, + "loss": 0.7187, + "step": 1253 + }, + { + "epoch": 0.1540729819388131, + "grad_norm": 1.4624158100168778, + "learning_rate": 2.973370568246252e-05, + "loss": 0.7944, + "step": 1254 + }, + { + "epoch": 0.15419584715567022, + "grad_norm": 1.586576339664053, + "learning_rate": 2.9732497492927424e-05, + "loss": 0.6188, + "step": 1255 + }, + { + "epoch": 0.15431871237252734, + "grad_norm": 1.599093378098706, + "learning_rate": 2.9731286593444967e-05, + "loss": 0.6992, + "step": 1256 + }, + { + "epoch": 0.15444157758938445, + "grad_norm": 1.6118384022378216, + "learning_rate": 2.973007298423787e-05, + "loss": 0.7319, + "step": 1257 + }, + { + "epoch": 0.15456444280624154, + "grad_norm": 1.5112759376211, + "learning_rate": 2.9728856665529378e-05, + "loss": 0.6735, + "step": 1258 + }, + { + "epoch": 0.15468730802309866, + "grad_norm": 1.6017720437177714, + "learning_rate": 2.9727637637543225e-05, + "loss": 0.7303, + "step": 1259 + }, + { + "epoch": 0.15481017323995577, + "grad_norm": 1.4653649692502642, + "learning_rate": 2.9726415900503635e-05, + "loss": 0.6752, + "step": 1260 + }, + { + "epoch": 0.1549330384568129, + "grad_norm": 1.6244094135079614, + "learning_rate": 2.9725191454635346e-05, + "loss": 0.5937, + "step": 1261 + }, + { + "epoch": 0.15505590367366998, + "grad_norm": 1.7797051934797476, + "learning_rate": 2.9723964300163584e-05, + "loss": 0.7147, + "step": 1262 + }, + { + "epoch": 0.1551787688905271, + "grad_norm": 1.4272109958895642, + "learning_rate": 2.9722734437314084e-05, + "loss": 0.734, + "step": 1263 + }, + { + "epoch": 0.1553016341073842, + "grad_norm": 1.5027857866874148, + "learning_rate": 2.972150186631306e-05, + "loss": 0.741, + "step": 1264 + }, + { + "epoch": 0.1554244993242413, + "grad_norm": 1.5861847813482135, + "learning_rate": 2.9720266587387236e-05, + "loss": 0.6582, + "step": 1265 + }, + { + "epoch": 0.1555473645410984, + "grad_norm": 1.7374219966008766, + "learning_rate": 2.971902860076384e-05, + "loss": 0.6234, + "step": 1266 + }, + { + "epoch": 0.15567022975795552, + "grad_norm": 1.4613946582290924, + "learning_rate": 2.9717787906670592e-05, + "loss": 0.6831, + "step": 1267 + }, + { + "epoch": 0.15579309497481264, + "grad_norm": 1.6033703076904535, + "learning_rate": 2.9716544505335705e-05, + "loss": 0.6142, + "step": 1268 + }, + { + "epoch": 0.15591596019166973, + "grad_norm": 1.5013388546449573, + "learning_rate": 2.9715298396987898e-05, + "loss": 0.7599, + "step": 1269 + }, + { + "epoch": 0.15603882540852684, + "grad_norm": 1.7325398793553692, + "learning_rate": 2.971404958185638e-05, + "loss": 0.67, + "step": 1270 + }, + { + "epoch": 0.15616169062538396, + "grad_norm": 1.6531963699380625, + "learning_rate": 2.9712798060170868e-05, + "loss": 0.628, + "step": 1271 + }, + { + "epoch": 0.15628455584224107, + "grad_norm": 1.8100984749784255, + "learning_rate": 2.9711543832161565e-05, + "loss": 0.6465, + "step": 1272 + }, + { + "epoch": 0.15640742105909816, + "grad_norm": 1.7743883132036649, + "learning_rate": 
2.9710286898059185e-05, + "loss": 0.7905, + "step": 1273 + }, + { + "epoch": 0.15653028627595528, + "grad_norm": 1.758571612609507, + "learning_rate": 2.970902725809493e-05, + "loss": 0.8187, + "step": 1274 + }, + { + "epoch": 0.1566531514928124, + "grad_norm": 1.8136552327253288, + "learning_rate": 2.97077649125005e-05, + "loss": 0.6677, + "step": 1275 + }, + { + "epoch": 0.15677601670966948, + "grad_norm": 1.6582225453232893, + "learning_rate": 2.9706499861508098e-05, + "loss": 0.586, + "step": 1276 + }, + { + "epoch": 0.1568988819265266, + "grad_norm": 1.5538353615202256, + "learning_rate": 2.9705232105350427e-05, + "loss": 0.6086, + "step": 1277 + }, + { + "epoch": 0.1570217471433837, + "grad_norm": 1.6881453621732438, + "learning_rate": 2.970396164426067e-05, + "loss": 0.6459, + "step": 1278 + }, + { + "epoch": 0.15714461236024083, + "grad_norm": 1.7824457932803317, + "learning_rate": 2.970268847847253e-05, + "loss": 0.758, + "step": 1279 + }, + { + "epoch": 0.1572674775770979, + "grad_norm": 1.6059541957193617, + "learning_rate": 2.9701412608220193e-05, + "loss": 0.5885, + "step": 1280 + }, + { + "epoch": 0.15739034279395503, + "grad_norm": 1.6319281442140379, + "learning_rate": 2.970013403373835e-05, + "loss": 0.6287, + "step": 1281 + }, + { + "epoch": 0.15751320801081214, + "grad_norm": 1.4178645942500518, + "learning_rate": 2.9698852755262186e-05, + "loss": 0.5757, + "step": 1282 + }, + { + "epoch": 0.15763607322766926, + "grad_norm": 1.6505530635654917, + "learning_rate": 2.9697568773027385e-05, + "loss": 0.7155, + "step": 1283 + }, + { + "epoch": 0.15775893844452635, + "grad_norm": 1.5030189715633817, + "learning_rate": 2.9696282087270116e-05, + "loss": 0.8029, + "step": 1284 + }, + { + "epoch": 0.15788180366138346, + "grad_norm": 1.7418034144745902, + "learning_rate": 2.9694992698227074e-05, + "loss": 0.6499, + "step": 1285 + }, + { + "epoch": 0.15800466887824058, + "grad_norm": 1.346544920923115, + "learning_rate": 2.9693700606135425e-05, + "loss": 0.6607, + "step": 1286 + }, + { + "epoch": 0.15812753409509767, + "grad_norm": 1.621410126077198, + "learning_rate": 2.969240581123284e-05, + "loss": 0.8047, + "step": 1287 + }, + { + "epoch": 0.15825039931195478, + "grad_norm": 1.6409232256203494, + "learning_rate": 2.969110831375749e-05, + "loss": 0.6086, + "step": 1288 + }, + { + "epoch": 0.1583732645288119, + "grad_norm": 1.8995747747087122, + "learning_rate": 2.968980811394804e-05, + "loss": 0.6762, + "step": 1289 + }, + { + "epoch": 0.158496129745669, + "grad_norm": 1.7754159948899968, + "learning_rate": 2.9688505212043656e-05, + "loss": 0.7541, + "step": 1290 + }, + { + "epoch": 0.1586189949625261, + "grad_norm": 1.570811799089022, + "learning_rate": 2.9687199608283992e-05, + "loss": 0.6511, + "step": 1291 + }, + { + "epoch": 0.15874186017938322, + "grad_norm": 1.6469236055916405, + "learning_rate": 2.9685891302909213e-05, + "loss": 0.7096, + "step": 1292 + }, + { + "epoch": 0.15886472539624033, + "grad_norm": 1.5254124244048641, + "learning_rate": 2.9684580296159973e-05, + "loss": 0.6033, + "step": 1293 + }, + { + "epoch": 0.15898759061309745, + "grad_norm": 1.4985237821231112, + "learning_rate": 2.9683266588277417e-05, + "loss": 0.7394, + "step": 1294 + }, + { + "epoch": 0.15911045582995453, + "grad_norm": 1.4449490425959275, + "learning_rate": 2.9681950179503196e-05, + "loss": 0.7565, + "step": 1295 + }, + { + "epoch": 0.15923332104681165, + "grad_norm": 1.591382564047471, + "learning_rate": 2.968063107007946e-05, + "loss": 0.7137, + "step": 1296 + }, + { + "epoch": 
0.15935618626366876, + "grad_norm": 1.7880322015403431, + "learning_rate": 2.967930926024884e-05, + "loss": 0.6009, + "step": 1297 + }, + { + "epoch": 0.15947905148052585, + "grad_norm": 1.411917167878625, + "learning_rate": 2.9677984750254482e-05, + "loss": 0.6514, + "step": 1298 + }, + { + "epoch": 0.15960191669738297, + "grad_norm": 1.6356712102624247, + "learning_rate": 2.967665754034002e-05, + "loss": 0.6135, + "step": 1299 + }, + { + "epoch": 0.15972478191424008, + "grad_norm": 1.4553546760529097, + "learning_rate": 2.9675327630749587e-05, + "loss": 0.7229, + "step": 1300 + }, + { + "epoch": 0.1598476471310972, + "grad_norm": 1.3834559534024546, + "learning_rate": 2.967399502172781e-05, + "loss": 0.6244, + "step": 1301 + }, + { + "epoch": 0.15997051234795429, + "grad_norm": 1.680602345451252, + "learning_rate": 2.9672659713519805e-05, + "loss": 0.7073, + "step": 1302 + }, + { + "epoch": 0.1600933775648114, + "grad_norm": 1.3865309773500987, + "learning_rate": 2.9671321706371206e-05, + "loss": 0.7573, + "step": 1303 + }, + { + "epoch": 0.16021624278166852, + "grad_norm": 1.3975713757750239, + "learning_rate": 2.966998100052813e-05, + "loss": 0.6678, + "step": 1304 + }, + { + "epoch": 0.1603391079985256, + "grad_norm": 1.6903514129699546, + "learning_rate": 2.966863759623718e-05, + "loss": 0.673, + "step": 1305 + }, + { + "epoch": 0.16046197321538272, + "grad_norm": 1.6165966700953591, + "learning_rate": 2.9667291493745478e-05, + "loss": 0.7919, + "step": 1306 + }, + { + "epoch": 0.16058483843223983, + "grad_norm": 1.499330263769428, + "learning_rate": 2.9665942693300626e-05, + "loss": 0.5896, + "step": 1307 + }, + { + "epoch": 0.16070770364909695, + "grad_norm": 1.490506323217619, + "learning_rate": 2.9664591195150725e-05, + "loss": 0.6456, + "step": 1308 + }, + { + "epoch": 0.16083056886595404, + "grad_norm": 1.532839452705662, + "learning_rate": 2.966323699954438e-05, + "loss": 0.6691, + "step": 1309 + }, + { + "epoch": 0.16095343408281115, + "grad_norm": 1.7803524181408446, + "learning_rate": 2.966188010673068e-05, + "loss": 0.7608, + "step": 1310 + }, + { + "epoch": 0.16107629929966827, + "grad_norm": 1.873214918687934, + "learning_rate": 2.9660520516959227e-05, + "loss": 0.6906, + "step": 1311 + }, + { + "epoch": 0.16119916451652538, + "grad_norm": 1.6498083580532625, + "learning_rate": 2.9659158230480098e-05, + "loss": 0.686, + "step": 1312 + }, + { + "epoch": 0.16132202973338247, + "grad_norm": 1.4210685906984497, + "learning_rate": 2.9657793247543875e-05, + "loss": 0.5709, + "step": 1313 + }, + { + "epoch": 0.1614448949502396, + "grad_norm": 1.255774445567483, + "learning_rate": 2.965642556840165e-05, + "loss": 0.7376, + "step": 1314 + }, + { + "epoch": 0.1615677601670967, + "grad_norm": 1.7211587252446188, + "learning_rate": 2.9655055193304987e-05, + "loss": 0.6596, + "step": 1315 + }, + { + "epoch": 0.1616906253839538, + "grad_norm": 1.4468243973662143, + "learning_rate": 2.9653682122505966e-05, + "loss": 0.7649, + "step": 1316 + }, + { + "epoch": 0.1618134906008109, + "grad_norm": 1.5855813203330258, + "learning_rate": 2.965230635625715e-05, + "loss": 0.7753, + "step": 1317 + }, + { + "epoch": 0.16193635581766802, + "grad_norm": 1.3841659030117763, + "learning_rate": 2.9650927894811607e-05, + "loss": 0.7618, + "step": 1318 + }, + { + "epoch": 0.16205922103452514, + "grad_norm": 1.3952000715802364, + "learning_rate": 2.9649546738422887e-05, + "loss": 0.6983, + "step": 1319 + }, + { + "epoch": 0.16218208625138222, + "grad_norm": 1.4968054536422764, + "learning_rate": 
2.9648162887345052e-05, + "loss": 0.7055, + "step": 1320 + }, + { + "epoch": 0.16230495146823934, + "grad_norm": 1.7636479235532885, + "learning_rate": 2.9646776341832648e-05, + "loss": 0.5898, + "step": 1321 + }, + { + "epoch": 0.16242781668509645, + "grad_norm": 1.4590600780346026, + "learning_rate": 2.964538710214073e-05, + "loss": 0.6796, + "step": 1322 + }, + { + "epoch": 0.16255068190195357, + "grad_norm": 1.5737879130112384, + "learning_rate": 2.9643995168524827e-05, + "loss": 0.6996, + "step": 1323 + }, + { + "epoch": 0.16267354711881066, + "grad_norm": 1.6146752771745299, + "learning_rate": 2.964260054124098e-05, + "loss": 0.714, + "step": 1324 + }, + { + "epoch": 0.16279641233566777, + "grad_norm": 1.349611935150245, + "learning_rate": 2.964120322054573e-05, + "loss": 0.6752, + "step": 1325 + }, + { + "epoch": 0.1629192775525249, + "grad_norm": 1.5678753556915583, + "learning_rate": 2.9639803206696102e-05, + "loss": 0.7888, + "step": 1326 + }, + { + "epoch": 0.16304214276938198, + "grad_norm": 1.5541977646372498, + "learning_rate": 2.963840049994961e-05, + "loss": 0.596, + "step": 1327 + }, + { + "epoch": 0.1631650079862391, + "grad_norm": 1.5600434465473811, + "learning_rate": 2.9636995100564282e-05, + "loss": 0.6501, + "step": 1328 + }, + { + "epoch": 0.1632878732030962, + "grad_norm": 1.6575221247506704, + "learning_rate": 2.9635587008798632e-05, + "loss": 0.7925, + "step": 1329 + }, + { + "epoch": 0.16341073841995332, + "grad_norm": 1.6385754037099134, + "learning_rate": 2.9634176224911665e-05, + "loss": 0.6807, + "step": 1330 + }, + { + "epoch": 0.1635336036368104, + "grad_norm": 1.4899317906270981, + "learning_rate": 2.9632762749162886e-05, + "loss": 0.6111, + "step": 1331 + }, + { + "epoch": 0.16365646885366752, + "grad_norm": 1.5247825681808087, + "learning_rate": 2.9631346581812293e-05, + "loss": 0.7122, + "step": 1332 + }, + { + "epoch": 0.16377933407052464, + "grad_norm": 1.7219166436024107, + "learning_rate": 2.962992772312039e-05, + "loss": 0.6558, + "step": 1333 + }, + { + "epoch": 0.16390219928738176, + "grad_norm": 1.7597285588803662, + "learning_rate": 2.9628506173348158e-05, + "loss": 0.6724, + "step": 1334 + }, + { + "epoch": 0.16402506450423884, + "grad_norm": 1.4997216536780074, + "learning_rate": 2.9627081932757084e-05, + "loss": 0.6164, + "step": 1335 + }, + { + "epoch": 0.16414792972109596, + "grad_norm": 1.805240604993113, + "learning_rate": 2.962565500160915e-05, + "loss": 0.6678, + "step": 1336 + }, + { + "epoch": 0.16427079493795307, + "grad_norm": 1.6460684168207265, + "learning_rate": 2.9624225380166827e-05, + "loss": 0.569, + "step": 1337 + }, + { + "epoch": 0.16439366015481016, + "grad_norm": 1.6348581199735779, + "learning_rate": 2.962279306869309e-05, + "loss": 0.6278, + "step": 1338 + }, + { + "epoch": 0.16451652537166728, + "grad_norm": 1.5927112744789222, + "learning_rate": 2.9621358067451398e-05, + "loss": 0.7032, + "step": 1339 + }, + { + "epoch": 0.1646393905885244, + "grad_norm": 1.4603178121679485, + "learning_rate": 2.961992037670571e-05, + "loss": 0.6792, + "step": 1340 + }, + { + "epoch": 0.1647622558053815, + "grad_norm": 1.5939427509466404, + "learning_rate": 2.9618479996720488e-05, + "loss": 0.7612, + "step": 1341 + }, + { + "epoch": 0.1648851210222386, + "grad_norm": 1.6256083327326802, + "learning_rate": 2.9617036927760672e-05, + "loss": 0.7003, + "step": 1342 + }, + { + "epoch": 0.1650079862390957, + "grad_norm": 1.5485164643724154, + "learning_rate": 2.9615591170091707e-05, + "loss": 0.6028, + "step": 1343 + }, + { + "epoch": 
0.16513085145595283, + "grad_norm": 1.5306703195503275, + "learning_rate": 2.961414272397953e-05, + "loss": 0.6234, + "step": 1344 + }, + { + "epoch": 0.1652537166728099, + "grad_norm": 1.696072386920944, + "learning_rate": 2.961269158969058e-05, + "loss": 0.6911, + "step": 1345 + }, + { + "epoch": 0.16537658188966703, + "grad_norm": 1.6372621029860812, + "learning_rate": 2.9611237767491776e-05, + "loss": 0.6443, + "step": 1346 + }, + { + "epoch": 0.16549944710652414, + "grad_norm": 1.6747976764920425, + "learning_rate": 2.9609781257650543e-05, + "loss": 0.666, + "step": 1347 + }, + { + "epoch": 0.16562231232338126, + "grad_norm": 1.6346543852671003, + "learning_rate": 2.960832206043479e-05, + "loss": 0.6588, + "step": 1348 + }, + { + "epoch": 0.16574517754023835, + "grad_norm": 1.5856390012333887, + "learning_rate": 2.960686017611294e-05, + "loss": 0.6731, + "step": 1349 + }, + { + "epoch": 0.16586804275709546, + "grad_norm": 1.6043550671761344, + "learning_rate": 2.9605395604953888e-05, + "loss": 0.6289, + "step": 1350 + }, + { + "epoch": 0.16599090797395258, + "grad_norm": 1.6062708046937797, + "learning_rate": 2.960392834722703e-05, + "loss": 0.7362, + "step": 1351 + }, + { + "epoch": 0.1661137731908097, + "grad_norm": 1.446375309058786, + "learning_rate": 2.960245840320226e-05, + "loss": 0.6967, + "step": 1352 + }, + { + "epoch": 0.16623663840766678, + "grad_norm": 1.332971322015202, + "learning_rate": 2.9600985773149972e-05, + "loss": 0.5825, + "step": 1353 + }, + { + "epoch": 0.1663595036245239, + "grad_norm": 1.6718494779313504, + "learning_rate": 2.959951045734104e-05, + "loss": 0.6575, + "step": 1354 + }, + { + "epoch": 0.166482368841381, + "grad_norm": 1.5499521091339896, + "learning_rate": 2.9598032456046846e-05, + "loss": 0.7077, + "step": 1355 + }, + { + "epoch": 0.1666052340582381, + "grad_norm": 1.5810052948723623, + "learning_rate": 2.9596551769539248e-05, + "loss": 0.6392, + "step": 1356 + }, + { + "epoch": 0.16672809927509522, + "grad_norm": 1.420343980049429, + "learning_rate": 2.9595068398090614e-05, + "loss": 0.6015, + "step": 1357 + }, + { + "epoch": 0.16685096449195233, + "grad_norm": 1.717622148076989, + "learning_rate": 2.9593582341973803e-05, + "loss": 0.7126, + "step": 1358 + }, + { + "epoch": 0.16697382970880945, + "grad_norm": 1.453619714683216, + "learning_rate": 2.959209360146216e-05, + "loss": 0.6235, + "step": 1359 + }, + { + "epoch": 0.16709669492566653, + "grad_norm": 1.5989363714193026, + "learning_rate": 2.9590602176829532e-05, + "loss": 0.7054, + "step": 1360 + }, + { + "epoch": 0.16721956014252365, + "grad_norm": 1.4836542734074705, + "learning_rate": 2.958910806835026e-05, + "loss": 0.6497, + "step": 1361 + }, + { + "epoch": 0.16734242535938076, + "grad_norm": 1.5246243668553445, + "learning_rate": 2.958761127629917e-05, + "loss": 0.8059, + "step": 1362 + }, + { + "epoch": 0.16746529057623788, + "grad_norm": 1.376133113221235, + "learning_rate": 2.9586111800951588e-05, + "loss": 0.6262, + "step": 1363 + }, + { + "epoch": 0.16758815579309497, + "grad_norm": 1.666019599416163, + "learning_rate": 2.9584609642583337e-05, + "loss": 0.7523, + "step": 1364 + }, + { + "epoch": 0.16771102100995208, + "grad_norm": 1.4734539052591753, + "learning_rate": 2.958310480147073e-05, + "loss": 0.6735, + "step": 1365 + }, + { + "epoch": 0.1678338862268092, + "grad_norm": 1.2513640610651222, + "learning_rate": 2.9581597277890565e-05, + "loss": 0.5961, + "step": 1366 + }, + { + "epoch": 0.16795675144366629, + "grad_norm": 1.6134625411775323, + "learning_rate": 
2.958008707212015e-05, + "loss": 0.7341, + "step": 1367 + }, + { + "epoch": 0.1680796166605234, + "grad_norm": 1.5643703066161072, + "learning_rate": 2.9578574184437264e-05, + "loss": 0.5905, + "step": 1368 + }, + { + "epoch": 0.16820248187738052, + "grad_norm": 1.4285380841745852, + "learning_rate": 2.9577058615120212e-05, + "loss": 0.628, + "step": 1369 + }, + { + "epoch": 0.16832534709423763, + "grad_norm": 1.6855752487774136, + "learning_rate": 2.9575540364447755e-05, + "loss": 0.6257, + "step": 1370 + }, + { + "epoch": 0.16844821231109472, + "grad_norm": 1.5499039797191638, + "learning_rate": 2.9574019432699182e-05, + "loss": 0.6711, + "step": 1371 + }, + { + "epoch": 0.16857107752795183, + "grad_norm": 1.644987364159324, + "learning_rate": 2.9572495820154245e-05, + "loss": 0.6704, + "step": 1372 + }, + { + "epoch": 0.16869394274480895, + "grad_norm": 1.3552428419139861, + "learning_rate": 2.957096952709321e-05, + "loss": 0.6244, + "step": 1373 + }, + { + "epoch": 0.16881680796166607, + "grad_norm": 1.440282391337794, + "learning_rate": 2.9569440553796824e-05, + "loss": 0.6276, + "step": 1374 + }, + { + "epoch": 0.16893967317852315, + "grad_norm": 1.6692397841242128, + "learning_rate": 2.9567908900546335e-05, + "loss": 0.6916, + "step": 1375 + }, + { + "epoch": 0.16906253839538027, + "grad_norm": 1.7035407511741656, + "learning_rate": 2.956637456762348e-05, + "loss": 0.6653, + "step": 1376 + }, + { + "epoch": 0.16918540361223738, + "grad_norm": 1.5031875118929463, + "learning_rate": 2.9564837555310494e-05, + "loss": 0.6172, + "step": 1377 + }, + { + "epoch": 0.16930826882909447, + "grad_norm": 1.5259437077998224, + "learning_rate": 2.9563297863890093e-05, + "loss": 0.6944, + "step": 1378 + }, + { + "epoch": 0.1694311340459516, + "grad_norm": 1.4713393645513817, + "learning_rate": 2.956175549364549e-05, + "loss": 0.6732, + "step": 1379 + }, + { + "epoch": 0.1695539992628087, + "grad_norm": 1.415962149721047, + "learning_rate": 2.956021044486041e-05, + "loss": 0.7165, + "step": 1380 + }, + { + "epoch": 0.16967686447966582, + "grad_norm": 1.6407949715626922, + "learning_rate": 2.9558662717819038e-05, + "loss": 0.7315, + "step": 1381 + }, + { + "epoch": 0.1697997296965229, + "grad_norm": 1.4748050161701602, + "learning_rate": 2.955711231280608e-05, + "loss": 0.701, + "step": 1382 + }, + { + "epoch": 0.16992259491338002, + "grad_norm": 1.859427635359362, + "learning_rate": 2.955555923010672e-05, + "loss": 0.7703, + "step": 1383 + }, + { + "epoch": 0.17004546013023714, + "grad_norm": 1.6152908335690546, + "learning_rate": 2.9554003470006633e-05, + "loss": 0.7203, + "step": 1384 + }, + { + "epoch": 0.17016832534709425, + "grad_norm": 1.614601380070933, + "learning_rate": 2.9552445032791988e-05, + "loss": 0.7042, + "step": 1385 + }, + { + "epoch": 0.17029119056395134, + "grad_norm": 1.6173387658269949, + "learning_rate": 2.955088391874946e-05, + "loss": 0.6326, + "step": 1386 + }, + { + "epoch": 0.17041405578080845, + "grad_norm": 1.660984461504832, + "learning_rate": 2.9549320128166202e-05, + "loss": 0.7155, + "step": 1387 + }, + { + "epoch": 0.17053692099766557, + "grad_norm": 1.7657798049176725, + "learning_rate": 2.954775366132986e-05, + "loss": 0.7123, + "step": 1388 + }, + { + "epoch": 0.17065978621452266, + "grad_norm": 1.7585763719882277, + "learning_rate": 2.954618451852858e-05, + "loss": 0.7122, + "step": 1389 + }, + { + "epoch": 0.17078265143137977, + "grad_norm": 1.4947594902784727, + "learning_rate": 2.9544612700050994e-05, + "loss": 0.577, + "step": 1390 + }, + { + "epoch": 
0.1709055166482369, + "grad_norm": 1.726186252410145, + "learning_rate": 2.9543038206186223e-05, + "loss": 0.6054, + "step": 1391 + }, + { + "epoch": 0.171028381865094, + "grad_norm": 1.3504585798542077, + "learning_rate": 2.9541461037223888e-05, + "loss": 0.6494, + "step": 1392 + }, + { + "epoch": 0.1711512470819511, + "grad_norm": 1.5204637765791489, + "learning_rate": 2.9539881193454105e-05, + "loss": 0.738, + "step": 1393 + }, + { + "epoch": 0.1712741122988082, + "grad_norm": 1.4712842003096895, + "learning_rate": 2.953829867516747e-05, + "loss": 0.6545, + "step": 1394 + }, + { + "epoch": 0.17139697751566532, + "grad_norm": 1.7624811357671435, + "learning_rate": 2.9536713482655074e-05, + "loss": 0.7616, + "step": 1395 + }, + { + "epoch": 0.1715198427325224, + "grad_norm": 1.5793939489056135, + "learning_rate": 2.9535125616208507e-05, + "loss": 0.5927, + "step": 1396 + }, + { + "epoch": 0.17164270794937952, + "grad_norm": 1.5413068428272383, + "learning_rate": 2.953353507611985e-05, + "loss": 0.7394, + "step": 1397 + }, + { + "epoch": 0.17176557316623664, + "grad_norm": 1.534149029927234, + "learning_rate": 2.9531941862681667e-05, + "loss": 0.5921, + "step": 1398 + }, + { + "epoch": 0.17188843838309376, + "grad_norm": 1.6345962934802634, + "learning_rate": 2.953034597618702e-05, + "loss": 0.735, + "step": 1399 + }, + { + "epoch": 0.17201130359995084, + "grad_norm": 1.646560670364865, + "learning_rate": 2.9528747416929467e-05, + "loss": 0.8204, + "step": 1400 + }, + { + "epoch": 0.17213416881680796, + "grad_norm": 1.716818319300227, + "learning_rate": 2.952714618520305e-05, + "loss": 0.6774, + "step": 1401 + }, + { + "epoch": 0.17225703403366507, + "grad_norm": 1.5693943569446105, + "learning_rate": 2.95255422813023e-05, + "loss": 0.6844, + "step": 1402 + }, + { + "epoch": 0.1723798992505222, + "grad_norm": 1.5280648593827988, + "learning_rate": 2.952393570552225e-05, + "loss": 0.5662, + "step": 1403 + }, + { + "epoch": 0.17250276446737928, + "grad_norm": 1.6995423785868131, + "learning_rate": 2.9522326458158415e-05, + "loss": 0.6673, + "step": 1404 + }, + { + "epoch": 0.1726256296842364, + "grad_norm": 1.3420049178959992, + "learning_rate": 2.9520714539506812e-05, + "loss": 0.7075, + "step": 1405 + }, + { + "epoch": 0.1727484949010935, + "grad_norm": 1.7184323189753032, + "learning_rate": 2.951909994986394e-05, + "loss": 0.7002, + "step": 1406 + }, + { + "epoch": 0.1728713601179506, + "grad_norm": 1.4573732306749252, + "learning_rate": 2.951748268952679e-05, + "loss": 0.6235, + "step": 1407 + }, + { + "epoch": 0.1729942253348077, + "grad_norm": 1.4106934683650214, + "learning_rate": 2.951586275879285e-05, + "loss": 0.8082, + "step": 1408 + }, + { + "epoch": 0.17311709055166483, + "grad_norm": 1.4971725182764106, + "learning_rate": 2.9514240157960093e-05, + "loss": 0.7283, + "step": 1409 + }, + { + "epoch": 0.17323995576852194, + "grad_norm": 1.3416199512123435, + "learning_rate": 2.951261488732699e-05, + "loss": 0.6836, + "step": 1410 + }, + { + "epoch": 0.17336282098537903, + "grad_norm": 1.532201756127604, + "learning_rate": 2.9510986947192494e-05, + "loss": 0.6073, + "step": 1411 + }, + { + "epoch": 0.17348568620223614, + "grad_norm": 1.6605921112596864, + "learning_rate": 2.9509356337856054e-05, + "loss": 0.7326, + "step": 1412 + }, + { + "epoch": 0.17360855141909326, + "grad_norm": 1.6224598165853281, + "learning_rate": 2.9507723059617616e-05, + "loss": 0.7144, + "step": 1413 + }, + { + "epoch": 0.17373141663595038, + "grad_norm": 2.043972299822935, + "learning_rate": 
2.9506087112777602e-05, + "loss": 0.7368, + "step": 1414 + }, + { + "epoch": 0.17385428185280746, + "grad_norm": 2.0465721933321923, + "learning_rate": 2.9504448497636945e-05, + "loss": 0.6609, + "step": 1415 + }, + { + "epoch": 0.17397714706966458, + "grad_norm": 1.5030607083624699, + "learning_rate": 2.9502807214497047e-05, + "loss": 0.6917, + "step": 1416 + }, + { + "epoch": 0.1741000122865217, + "grad_norm": 1.536416025960106, + "learning_rate": 2.9501163263659818e-05, + "loss": 0.6456, + "step": 1417 + }, + { + "epoch": 0.17422287750337878, + "grad_norm": 1.6335548826329411, + "learning_rate": 2.949951664542765e-05, + "loss": 0.6347, + "step": 1418 + }, + { + "epoch": 0.1743457427202359, + "grad_norm": 1.3684693715493643, + "learning_rate": 2.9497867360103427e-05, + "loss": 0.6867, + "step": 1419 + }, + { + "epoch": 0.174468607937093, + "grad_norm": 1.4759374721908372, + "learning_rate": 2.9496215407990524e-05, + "loss": 0.6684, + "step": 1420 + }, + { + "epoch": 0.17459147315395013, + "grad_norm": 1.5291765687504206, + "learning_rate": 2.949456078939281e-05, + "loss": 0.5793, + "step": 1421 + }, + { + "epoch": 0.17471433837080722, + "grad_norm": 1.4873522430689772, + "learning_rate": 2.949290350461464e-05, + "loss": 0.5832, + "step": 1422 + }, + { + "epoch": 0.17483720358766433, + "grad_norm": 2.096656429728606, + "learning_rate": 2.9491243553960856e-05, + "loss": 0.8136, + "step": 1423 + }, + { + "epoch": 0.17496006880452145, + "grad_norm": 1.9679070349029695, + "learning_rate": 2.9489580937736805e-05, + "loss": 0.7235, + "step": 1424 + }, + { + "epoch": 0.17508293402137856, + "grad_norm": 1.6044879880142335, + "learning_rate": 2.94879156562483e-05, + "loss": 0.6913, + "step": 1425 + }, + { + "epoch": 0.17520579923823565, + "grad_norm": 1.5466630601582532, + "learning_rate": 2.9486247709801674e-05, + "loss": 0.5825, + "step": 1426 + }, + { + "epoch": 0.17532866445509276, + "grad_norm": 2.0799826717479486, + "learning_rate": 2.948457709870373e-05, + "loss": 0.747, + "step": 1427 + }, + { + "epoch": 0.17545152967194988, + "grad_norm": 2.0136744683524577, + "learning_rate": 2.948290382326176e-05, + "loss": 0.7633, + "step": 1428 + }, + { + "epoch": 0.17557439488880697, + "grad_norm": 2.063264482844403, + "learning_rate": 2.948122788378356e-05, + "loss": 0.7632, + "step": 1429 + }, + { + "epoch": 0.17569726010566408, + "grad_norm": 1.6538766663957383, + "learning_rate": 2.9479549280577402e-05, + "loss": 0.6584, + "step": 1430 + }, + { + "epoch": 0.1758201253225212, + "grad_norm": 1.564633156023667, + "learning_rate": 2.947786801395206e-05, + "loss": 0.6585, + "step": 1431 + }, + { + "epoch": 0.1759429905393783, + "grad_norm": 1.9106723945180943, + "learning_rate": 2.947618408421679e-05, + "loss": 0.7811, + "step": 1432 + }, + { + "epoch": 0.1760658557562354, + "grad_norm": 1.4642936704120249, + "learning_rate": 2.9474497491681337e-05, + "loss": 0.5989, + "step": 1433 + }, + { + "epoch": 0.17618872097309252, + "grad_norm": 2.0912469796637296, + "learning_rate": 2.947280823665594e-05, + "loss": 0.7146, + "step": 1434 + }, + { + "epoch": 0.17631158618994963, + "grad_norm": 1.3978127297662788, + "learning_rate": 2.9471116319451324e-05, + "loss": 0.6926, + "step": 1435 + }, + { + "epoch": 0.17643445140680672, + "grad_norm": 1.4606903653395344, + "learning_rate": 2.9469421740378713e-05, + "loss": 0.6045, + "step": 1436 + }, + { + "epoch": 0.17655731662366383, + "grad_norm": 1.4623692323121391, + "learning_rate": 2.9467724499749813e-05, + "loss": 0.629, + "step": 1437 + }, + { + "epoch": 
0.17668018184052095, + "grad_norm": 1.4507574393995153, + "learning_rate": 2.9466024597876814e-05, + "loss": 0.6791, + "step": 1438 + }, + { + "epoch": 0.17680304705737807, + "grad_norm": 1.569372059007248, + "learning_rate": 2.9464322035072407e-05, + "loss": 0.6276, + "step": 1439 + }, + { + "epoch": 0.17692591227423515, + "grad_norm": 1.4384239425415848, + "learning_rate": 2.9462616811649767e-05, + "loss": 0.5997, + "step": 1440 + }, + { + "epoch": 0.17704877749109227, + "grad_norm": 1.6405405258624208, + "learning_rate": 2.9460908927922557e-05, + "loss": 0.7524, + "step": 1441 + }, + { + "epoch": 0.17717164270794938, + "grad_norm": 1.5348827652034749, + "learning_rate": 2.945919838420493e-05, + "loss": 0.5959, + "step": 1442 + }, + { + "epoch": 0.1772945079248065, + "grad_norm": 1.3066900207645296, + "learning_rate": 2.9457485180811535e-05, + "loss": 0.6312, + "step": 1443 + }, + { + "epoch": 0.1774173731416636, + "grad_norm": 1.5978762084970735, + "learning_rate": 2.94557693180575e-05, + "loss": 0.6885, + "step": 1444 + }, + { + "epoch": 0.1775402383585207, + "grad_norm": 1.709069614002222, + "learning_rate": 2.9454050796258448e-05, + "loss": 0.6845, + "step": 1445 + }, + { + "epoch": 0.17766310357537782, + "grad_norm": 1.545706411211335, + "learning_rate": 2.9452329615730488e-05, + "loss": 0.6297, + "step": 1446 + }, + { + "epoch": 0.1777859687922349, + "grad_norm": 1.5730727277601777, + "learning_rate": 2.9450605776790225e-05, + "loss": 0.7458, + "step": 1447 + }, + { + "epoch": 0.17790883400909202, + "grad_norm": 1.4097299448069807, + "learning_rate": 2.9448879279754743e-05, + "loss": 0.7984, + "step": 1448 + }, + { + "epoch": 0.17803169922594914, + "grad_norm": 1.567873443564787, + "learning_rate": 2.944715012494162e-05, + "loss": 0.6279, + "step": 1449 + }, + { + "epoch": 0.17815456444280625, + "grad_norm": 1.3985384103621674, + "learning_rate": 2.9445418312668924e-05, + "loss": 0.5952, + "step": 1450 + }, + { + "epoch": 0.17827742965966334, + "grad_norm": 1.493695719527005, + "learning_rate": 2.944368384325522e-05, + "loss": 0.6776, + "step": 1451 + }, + { + "epoch": 0.17840029487652045, + "grad_norm": 1.62654721001496, + "learning_rate": 2.9441946717019535e-05, + "loss": 0.5808, + "step": 1452 + }, + { + "epoch": 0.17852316009337757, + "grad_norm": 1.972363365551275, + "learning_rate": 2.9440206934281413e-05, + "loss": 0.7768, + "step": 1453 + }, + { + "epoch": 0.17864602531023469, + "grad_norm": 1.585423769802939, + "learning_rate": 2.943846449536087e-05, + "loss": 0.6339, + "step": 1454 + }, + { + "epoch": 0.17876889052709177, + "grad_norm": 1.4837724119117175, + "learning_rate": 2.9436719400578426e-05, + "loss": 0.599, + "step": 1455 + }, + { + "epoch": 0.1788917557439489, + "grad_norm": 1.5058697390196099, + "learning_rate": 2.9434971650255067e-05, + "loss": 0.637, + "step": 1456 + }, + { + "epoch": 0.179014620960806, + "grad_norm": 1.610007553136736, + "learning_rate": 2.9433221244712293e-05, + "loss": 0.6487, + "step": 1457 + }, + { + "epoch": 0.1791374861776631, + "grad_norm": 1.4929631736720275, + "learning_rate": 2.9431468184272072e-05, + "loss": 0.6298, + "step": 1458 + }, + { + "epoch": 0.1792603513945202, + "grad_norm": 1.5586690893882018, + "learning_rate": 2.942971246925687e-05, + "loss": 0.6707, + "step": 1459 + }, + { + "epoch": 0.17938321661137732, + "grad_norm": 1.513648086155852, + "learning_rate": 2.942795409998964e-05, + "loss": 0.5753, + "step": 1460 + }, + { + "epoch": 0.17950608182823444, + "grad_norm": 1.7002616785187883, + "learning_rate": 
2.9426193076793817e-05, + "loss": 0.6875, + "step": 1461 + }, + { + "epoch": 0.17962894704509152, + "grad_norm": 1.4660620005206715, + "learning_rate": 2.942442939999334e-05, + "loss": 0.5373, + "step": 1462 + }, + { + "epoch": 0.17975181226194864, + "grad_norm": 1.4872749341512093, + "learning_rate": 2.9422663069912616e-05, + "loss": 0.6818, + "step": 1463 + }, + { + "epoch": 0.17987467747880576, + "grad_norm": 1.6770990236996794, + "learning_rate": 2.942089408687656e-05, + "loss": 0.7107, + "step": 1464 + }, + { + "epoch": 0.17999754269566287, + "grad_norm": 1.4447846004930602, + "learning_rate": 2.9419122451210556e-05, + "loss": 0.8194, + "step": 1465 + }, + { + "epoch": 0.18012040791251996, + "grad_norm": 1.2955118740243192, + "learning_rate": 2.941734816324049e-05, + "loss": 0.7268, + "step": 1466 + }, + { + "epoch": 0.18024327312937707, + "grad_norm": 1.4048086415234264, + "learning_rate": 2.9415571223292726e-05, + "loss": 0.5934, + "step": 1467 + }, + { + "epoch": 0.1803661383462342, + "grad_norm": 2.1032243579165075, + "learning_rate": 2.9413791631694128e-05, + "loss": 0.7375, + "step": 1468 + }, + { + "epoch": 0.18048900356309128, + "grad_norm": 1.3718159405342165, + "learning_rate": 2.9412009388772033e-05, + "loss": 0.636, + "step": 1469 + }, + { + "epoch": 0.1806118687799484, + "grad_norm": 1.5341274586113056, + "learning_rate": 2.941022449485428e-05, + "loss": 0.7376, + "step": 1470 + }, + { + "epoch": 0.1807347339968055, + "grad_norm": 1.692185668962046, + "learning_rate": 2.940843695026918e-05, + "loss": 0.732, + "step": 1471 + }, + { + "epoch": 0.18085759921366262, + "grad_norm": 1.5734363321314713, + "learning_rate": 2.9406646755345544e-05, + "loss": 0.6489, + "step": 1472 + }, + { + "epoch": 0.1809804644305197, + "grad_norm": 1.2534477778005653, + "learning_rate": 2.9404853910412674e-05, + "loss": 0.6126, + "step": 1473 + }, + { + "epoch": 0.18110332964737683, + "grad_norm": 1.6140586630188694, + "learning_rate": 2.9403058415800344e-05, + "loss": 0.6208, + "step": 1474 + }, + { + "epoch": 0.18122619486423394, + "grad_norm": 1.6799085476886144, + "learning_rate": 2.9401260271838822e-05, + "loss": 0.6901, + "step": 1475 + }, + { + "epoch": 0.18134906008109106, + "grad_norm": 2.434275639498713, + "learning_rate": 2.9399459478858872e-05, + "loss": 0.7896, + "step": 1476 + }, + { + "epoch": 0.18147192529794814, + "grad_norm": 1.4462437780478459, + "learning_rate": 2.939765603719173e-05, + "loss": 0.6225, + "step": 1477 + }, + { + "epoch": 0.18159479051480526, + "grad_norm": 1.4544564306575352, + "learning_rate": 2.9395849947169136e-05, + "loss": 0.6707, + "step": 1478 + }, + { + "epoch": 0.18171765573166238, + "grad_norm": 1.4806034262109453, + "learning_rate": 2.939404120912331e-05, + "loss": 0.5727, + "step": 1479 + }, + { + "epoch": 0.18184052094851946, + "grad_norm": 1.7589791823827159, + "learning_rate": 2.9392229823386944e-05, + "loss": 0.6641, + "step": 1480 + }, + { + "epoch": 0.18196338616537658, + "grad_norm": 1.7478586267721083, + "learning_rate": 2.9390415790293236e-05, + "loss": 0.9111, + "step": 1481 + }, + { + "epoch": 0.1820862513822337, + "grad_norm": 1.5863274778027279, + "learning_rate": 2.938859911017588e-05, + "loss": 0.6995, + "step": 1482 + }, + { + "epoch": 0.1822091165990908, + "grad_norm": 1.5006370865097798, + "learning_rate": 2.938677978336902e-05, + "loss": 0.6935, + "step": 1483 + }, + { + "epoch": 0.1823319818159479, + "grad_norm": 1.7905091415311891, + "learning_rate": 2.9384957810207326e-05, + "loss": 0.6946, + "step": 1484 + }, + { + "epoch": 
0.182454847032805, + "grad_norm": 1.6044825728879877, + "learning_rate": 2.938313319102593e-05, + "loss": 0.728, + "step": 1485 + }, + { + "epoch": 0.18257771224966213, + "grad_norm": 1.4864421671004941, + "learning_rate": 2.9381305926160464e-05, + "loss": 0.6502, + "step": 1486 + }, + { + "epoch": 0.18270057746651922, + "grad_norm": 1.484714343444669, + "learning_rate": 2.9379476015947035e-05, + "loss": 0.5789, + "step": 1487 + }, + { + "epoch": 0.18282344268337633, + "grad_norm": 1.6513371986804872, + "learning_rate": 2.9377643460722256e-05, + "loss": 0.7086, + "step": 1488 + }, + { + "epoch": 0.18294630790023345, + "grad_norm": 1.4165022328757289, + "learning_rate": 2.9375808260823192e-05, + "loss": 0.619, + "step": 1489 + }, + { + "epoch": 0.18306917311709056, + "grad_norm": 1.6931364091071508, + "learning_rate": 2.9373970416587437e-05, + "loss": 0.7376, + "step": 1490 + }, + { + "epoch": 0.18319203833394765, + "grad_norm": 1.6716148326451667, + "learning_rate": 2.9372129928353042e-05, + "loss": 0.7297, + "step": 1491 + }, + { + "epoch": 0.18331490355080476, + "grad_norm": 1.667771272677289, + "learning_rate": 2.9370286796458552e-05, + "loss": 0.7033, + "step": 1492 + }, + { + "epoch": 0.18343776876766188, + "grad_norm": 1.7326509464381088, + "learning_rate": 2.9368441021243e-05, + "loss": 0.7811, + "step": 1493 + }, + { + "epoch": 0.183560633984519, + "grad_norm": 1.4292263527245446, + "learning_rate": 2.9366592603045906e-05, + "loss": 0.6343, + "step": 1494 + }, + { + "epoch": 0.18368349920137608, + "grad_norm": 1.663022711438851, + "learning_rate": 2.936474154220727e-05, + "loss": 0.7252, + "step": 1495 + }, + { + "epoch": 0.1838063644182332, + "grad_norm": 1.6162570598946928, + "learning_rate": 2.936288783906759e-05, + "loss": 0.7427, + "step": 1496 + }, + { + "epoch": 0.1839292296350903, + "grad_norm": 1.496257304840906, + "learning_rate": 2.936103149396784e-05, + "loss": 0.5657, + "step": 1497 + }, + { + "epoch": 0.1840520948519474, + "grad_norm": 1.6335991716099814, + "learning_rate": 2.9359172507249477e-05, + "loss": 0.6262, + "step": 1498 + }, + { + "epoch": 0.18417496006880452, + "grad_norm": 1.7260012366680195, + "learning_rate": 2.935731087925445e-05, + "loss": 0.8514, + "step": 1499 + }, + { + "epoch": 0.18429782528566163, + "grad_norm": 1.513128308073426, + "learning_rate": 2.935544661032521e-05, + "loss": 0.5813, + "step": 1500 + }, + { + "epoch": 0.18442069050251875, + "grad_norm": 1.4795557148068579, + "learning_rate": 2.935357970080465e-05, + "loss": 0.7416, + "step": 1501 + }, + { + "epoch": 0.18454355571937583, + "grad_norm": 1.6926262296767476, + "learning_rate": 2.93517101510362e-05, + "loss": 0.6584, + "step": 1502 + }, + { + "epoch": 0.18466642093623295, + "grad_norm": 1.5687106587158242, + "learning_rate": 2.9349837961363736e-05, + "loss": 0.7823, + "step": 1503 + }, + { + "epoch": 0.18478928615309007, + "grad_norm": 1.3364470843203262, + "learning_rate": 2.9347963132131644e-05, + "loss": 0.6914, + "step": 1504 + }, + { + "epoch": 0.18491215136994718, + "grad_norm": 1.7139684852781043, + "learning_rate": 2.9346085663684784e-05, + "loss": 0.7796, + "step": 1505 + }, + { + "epoch": 0.18503501658680427, + "grad_norm": 1.5063708524916688, + "learning_rate": 2.9344205556368502e-05, + "loss": 0.7022, + "step": 1506 + }, + { + "epoch": 0.18515788180366138, + "grad_norm": 1.5356429623950476, + "learning_rate": 2.9342322810528635e-05, + "loss": 0.7742, + "step": 1507 + }, + { + "epoch": 0.1852807470205185, + "grad_norm": 1.9736472888985805, + "learning_rate": 
2.93404374265115e-05, + "loss": 0.8254, + "step": 1508 + }, + { + "epoch": 0.1854036122373756, + "grad_norm": 1.869106464888736, + "learning_rate": 2.93385494046639e-05, + "loss": 0.605, + "step": 1509 + }, + { + "epoch": 0.1855264774542327, + "grad_norm": 1.6469977417990107, + "learning_rate": 2.933665874533313e-05, + "loss": 0.6525, + "step": 1510 + }, + { + "epoch": 0.18564934267108982, + "grad_norm": 1.6422489834952425, + "learning_rate": 2.9334765448866953e-05, + "loss": 0.6857, + "step": 1511 + }, + { + "epoch": 0.18577220788794693, + "grad_norm": 1.62996031733984, + "learning_rate": 2.933286951561364e-05, + "loss": 0.6388, + "step": 1512 + }, + { + "epoch": 0.18589507310480402, + "grad_norm": 1.4418348971842323, + "learning_rate": 2.9330970945921932e-05, + "loss": 0.6961, + "step": 1513 + }, + { + "epoch": 0.18601793832166114, + "grad_norm": 1.6620861807216256, + "learning_rate": 2.9329069740141057e-05, + "loss": 0.7901, + "step": 1514 + }, + { + "epoch": 0.18614080353851825, + "grad_norm": 1.4551387470193682, + "learning_rate": 2.9327165898620734e-05, + "loss": 0.7056, + "step": 1515 + }, + { + "epoch": 0.18626366875537537, + "grad_norm": 1.6602896132356881, + "learning_rate": 2.9325259421711155e-05, + "loss": 0.6555, + "step": 1516 + }, + { + "epoch": 0.18638653397223245, + "grad_norm": 1.3818113457285126, + "learning_rate": 2.9323350309763006e-05, + "loss": 0.6939, + "step": 1517 + }, + { + "epoch": 0.18650939918908957, + "grad_norm": 1.549184757941032, + "learning_rate": 2.9321438563127464e-05, + "loss": 0.7145, + "step": 1518 + }, + { + "epoch": 0.18663226440594669, + "grad_norm": 1.379294041488474, + "learning_rate": 2.931952418215617e-05, + "loss": 0.5793, + "step": 1519 + }, + { + "epoch": 0.18675512962280377, + "grad_norm": 1.9104707622454937, + "learning_rate": 2.9317607167201273e-05, + "loss": 0.7943, + "step": 1520 + }, + { + "epoch": 0.1868779948396609, + "grad_norm": 1.7545533044454762, + "learning_rate": 2.931568751861539e-05, + "loss": 0.7043, + "step": 1521 + }, + { + "epoch": 0.187000860056518, + "grad_norm": 1.5472200050991374, + "learning_rate": 2.9313765236751626e-05, + "loss": 0.6722, + "step": 1522 + }, + { + "epoch": 0.18712372527337512, + "grad_norm": 1.623862825971186, + "learning_rate": 2.9311840321963578e-05, + "loss": 0.7201, + "step": 1523 + }, + { + "epoch": 0.1872465904902322, + "grad_norm": 1.431466881205994, + "learning_rate": 2.9309912774605313e-05, + "loss": 0.7115, + "step": 1524 + }, + { + "epoch": 0.18736945570708932, + "grad_norm": 1.6504068544430703, + "learning_rate": 2.9307982595031398e-05, + "loss": 0.7444, + "step": 1525 + }, + { + "epoch": 0.18749232092394644, + "grad_norm": 1.6238139865404828, + "learning_rate": 2.9306049783596875e-05, + "loss": 0.7516, + "step": 1526 + }, + { + "epoch": 0.18761518614080352, + "grad_norm": 1.5858565823399937, + "learning_rate": 2.9304114340657272e-05, + "loss": 0.6484, + "step": 1527 + }, + { + "epoch": 0.18773805135766064, + "grad_norm": 1.5517009774000043, + "learning_rate": 2.9302176266568607e-05, + "loss": 0.6721, + "step": 1528 + }, + { + "epoch": 0.18786091657451776, + "grad_norm": 1.588523876048512, + "learning_rate": 2.9300235561687368e-05, + "loss": 0.7541, + "step": 1529 + }, + { + "epoch": 0.18798378179137487, + "grad_norm": 1.58050264717144, + "learning_rate": 2.9298292226370533e-05, + "loss": 0.5834, + "step": 1530 + }, + { + "epoch": 0.18810664700823196, + "grad_norm": 1.2484084357824006, + "learning_rate": 2.9296346260975576e-05, + "loss": 0.701, + "step": 1531 + }, + { + "epoch": 
0.18822951222508907, + "grad_norm": 1.3173928950782057, + "learning_rate": 2.9294397665860437e-05, + "loss": 0.6236, + "step": 1532 + }, + { + "epoch": 0.1883523774419462, + "grad_norm": 1.574290578535979, + "learning_rate": 2.929244644138355e-05, + "loss": 0.7638, + "step": 1533 + }, + { + "epoch": 0.1884752426588033, + "grad_norm": 1.659760237307229, + "learning_rate": 2.929049258790383e-05, + "loss": 0.5903, + "step": 1534 + }, + { + "epoch": 0.1885981078756604, + "grad_norm": 1.5940384340066525, + "learning_rate": 2.9288536105780674e-05, + "loss": 0.6189, + "step": 1535 + }, + { + "epoch": 0.1887209730925175, + "grad_norm": 1.7517293387038757, + "learning_rate": 2.9286576995373966e-05, + "loss": 0.795, + "step": 1536 + }, + { + "epoch": 0.18884383830937462, + "grad_norm": 1.3511009081519965, + "learning_rate": 2.9284615257044076e-05, + "loss": 0.6965, + "step": 1537 + }, + { + "epoch": 0.1889667035262317, + "grad_norm": 1.829616935600136, + "learning_rate": 2.9282650891151844e-05, + "loss": 0.7593, + "step": 1538 + }, + { + "epoch": 0.18908956874308883, + "grad_norm": 1.520531927883508, + "learning_rate": 2.9280683898058608e-05, + "loss": 0.7871, + "step": 1539 + }, + { + "epoch": 0.18921243395994594, + "grad_norm": 1.619066654880713, + "learning_rate": 2.9278714278126182e-05, + "loss": 0.6959, + "step": 1540 + }, + { + "epoch": 0.18933529917680306, + "grad_norm": 1.4416485848040632, + "learning_rate": 2.9276742031716866e-05, + "loss": 0.6978, + "step": 1541 + }, + { + "epoch": 0.18945816439366014, + "grad_norm": 1.5184159629377585, + "learning_rate": 2.9274767159193438e-05, + "loss": 0.7382, + "step": 1542 + }, + { + "epoch": 0.18958102961051726, + "grad_norm": 1.4090159140056246, + "learning_rate": 2.927278966091917e-05, + "loss": 0.6666, + "step": 1543 + }, + { + "epoch": 0.18970389482737438, + "grad_norm": 1.3951599685413352, + "learning_rate": 2.9270809537257805e-05, + "loss": 0.6848, + "step": 1544 + }, + { + "epoch": 0.1898267600442315, + "grad_norm": 1.36127163797329, + "learning_rate": 2.926882678857358e-05, + "loss": 0.6937, + "step": 1545 + }, + { + "epoch": 0.18994962526108858, + "grad_norm": 1.5991557482790169, + "learning_rate": 2.92668414152312e-05, + "loss": 0.7124, + "step": 1546 + }, + { + "epoch": 0.1900724904779457, + "grad_norm": 1.3625166879810149, + "learning_rate": 2.926485341759586e-05, + "loss": 0.6658, + "step": 1547 + }, + { + "epoch": 0.1901953556948028, + "grad_norm": 1.5706104814715964, + "learning_rate": 2.926286279603325e-05, + "loss": 0.6403, + "step": 1548 + }, + { + "epoch": 0.1903182209116599, + "grad_norm": 1.6422892295033031, + "learning_rate": 2.9260869550909526e-05, + "loss": 0.5654, + "step": 1549 + }, + { + "epoch": 0.190441086128517, + "grad_norm": 1.5222071453972204, + "learning_rate": 2.9258873682591334e-05, + "loss": 0.5574, + "step": 1550 + }, + { + "epoch": 0.19056395134537413, + "grad_norm": 1.6156389559199866, + "learning_rate": 2.9256875191445797e-05, + "loss": 0.7303, + "step": 1551 + }, + { + "epoch": 0.19068681656223124, + "grad_norm": 1.5947713682173652, + "learning_rate": 2.925487407784053e-05, + "loss": 0.6444, + "step": 1552 + }, + { + "epoch": 0.19080968177908833, + "grad_norm": 1.6707565948408543, + "learning_rate": 2.925287034214362e-05, + "loss": 0.6571, + "step": 1553 + }, + { + "epoch": 0.19093254699594545, + "grad_norm": 1.4505669503151828, + "learning_rate": 2.925086398472365e-05, + "loss": 0.6501, + "step": 1554 + }, + { + "epoch": 0.19105541221280256, + "grad_norm": 1.7156793882199235, + "learning_rate": 
2.9248855005949665e-05, + "loss": 0.6766, + "step": 1555 + }, + { + "epoch": 0.19117827742965968, + "grad_norm": 1.4264438589909567, + "learning_rate": 2.924684340619121e-05, + "loss": 0.6399, + "step": 1556 + }, + { + "epoch": 0.19130114264651676, + "grad_norm": 1.5491430630933203, + "learning_rate": 2.92448291858183e-05, + "loss": 0.6539, + "step": 1557 + }, + { + "epoch": 0.19142400786337388, + "grad_norm": 1.4042897266093388, + "learning_rate": 2.924281234520145e-05, + "loss": 0.7379, + "step": 1558 + }, + { + "epoch": 0.191546873080231, + "grad_norm": 1.367344640663153, + "learning_rate": 2.924079288471163e-05, + "loss": 0.7131, + "step": 1559 + }, + { + "epoch": 0.19166973829708808, + "grad_norm": 1.6183571897181392, + "learning_rate": 2.9238770804720318e-05, + "loss": 0.6521, + "step": 1560 + }, + { + "epoch": 0.1917926035139452, + "grad_norm": 1.4065130121967313, + "learning_rate": 2.923674610559946e-05, + "loss": 0.5831, + "step": 1561 + }, + { + "epoch": 0.1919154687308023, + "grad_norm": 1.4576924300441743, + "learning_rate": 2.9234718787721477e-05, + "loss": 0.7174, + "step": 1562 + }, + { + "epoch": 0.19203833394765943, + "grad_norm": 1.6217778913744338, + "learning_rate": 2.9232688851459293e-05, + "loss": 0.665, + "step": 1563 + }, + { + "epoch": 0.19216119916451652, + "grad_norm": 1.3824628027346642, + "learning_rate": 2.9230656297186298e-05, + "loss": 0.6806, + "step": 1564 + }, + { + "epoch": 0.19228406438137363, + "grad_norm": 1.5832668683858522, + "learning_rate": 2.9228621125276363e-05, + "loss": 0.6586, + "step": 1565 + }, + { + "epoch": 0.19240692959823075, + "grad_norm": 1.8482722821301765, + "learning_rate": 2.9226583336103855e-05, + "loss": 0.7928, + "step": 1566 + }, + { + "epoch": 0.19252979481508786, + "grad_norm": 1.4563549945831427, + "learning_rate": 2.9224542930043595e-05, + "loss": 0.6073, + "step": 1567 + }, + { + "epoch": 0.19265266003194495, + "grad_norm": 1.5434520062285009, + "learning_rate": 2.9222499907470917e-05, + "loss": 0.6018, + "step": 1568 + }, + { + "epoch": 0.19277552524880207, + "grad_norm": 1.379709637897164, + "learning_rate": 2.922045426876162e-05, + "loss": 0.7192, + "step": 1569 + }, + { + "epoch": 0.19289839046565918, + "grad_norm": 1.5353816038853487, + "learning_rate": 2.921840601429198e-05, + "loss": 0.5746, + "step": 1570 + }, + { + "epoch": 0.19302125568251627, + "grad_norm": 1.531225158293345, + "learning_rate": 2.9216355144438766e-05, + "loss": 0.651, + "step": 1571 + }, + { + "epoch": 0.19314412089937338, + "grad_norm": 1.49018278226954, + "learning_rate": 2.9214301659579218e-05, + "loss": 0.7497, + "step": 1572 + }, + { + "epoch": 0.1932669861162305, + "grad_norm": 1.3912687312973346, + "learning_rate": 2.921224556009106e-05, + "loss": 0.7307, + "step": 1573 + }, + { + "epoch": 0.19338985133308761, + "grad_norm": 1.454294378763509, + "learning_rate": 2.9210186846352504e-05, + "loss": 0.6316, + "step": 1574 + }, + { + "epoch": 0.1935127165499447, + "grad_norm": 1.4811443958039792, + "learning_rate": 2.9208125518742232e-05, + "loss": 0.6694, + "step": 1575 + }, + { + "epoch": 0.19363558176680182, + "grad_norm": 1.313593747041495, + "learning_rate": 2.9206061577639415e-05, + "loss": 0.6968, + "step": 1576 + }, + { + "epoch": 0.19375844698365893, + "grad_norm": 1.297190236562062, + "learning_rate": 2.9203995023423697e-05, + "loss": 0.7406, + "step": 1577 + }, + { + "epoch": 0.19388131220051602, + "grad_norm": 1.336160829884966, + "learning_rate": 2.9201925856475214e-05, + "loss": 0.6216, + "step": 1578 + }, + { + "epoch": 
0.19400417741737314, + "grad_norm": 1.196270908681761, + "learning_rate": 2.9199854077174573e-05, + "loss": 0.6768, + "step": 1579 + }, + { + "epoch": 0.19412704263423025, + "grad_norm": 1.5209644363959263, + "learning_rate": 2.9197779685902862e-05, + "loss": 0.7167, + "step": 1580 + }, + { + "epoch": 0.19424990785108737, + "grad_norm": 1.5330637943422534, + "learning_rate": 2.9195702683041657e-05, + "loss": 0.8041, + "step": 1581 + }, + { + "epoch": 0.19437277306794445, + "grad_norm": 1.545754629473385, + "learning_rate": 2.9193623068973003e-05, + "loss": 0.745, + "step": 1582 + }, + { + "epoch": 0.19449563828480157, + "grad_norm": 1.514294344831554, + "learning_rate": 2.919154084407943e-05, + "loss": 0.6673, + "step": 1583 + }, + { + "epoch": 0.19461850350165869, + "grad_norm": 1.4557884006798436, + "learning_rate": 2.9189456008743964e-05, + "loss": 0.6302, + "step": 1584 + }, + { + "epoch": 0.1947413687185158, + "grad_norm": 1.554529455103597, + "learning_rate": 2.918736856335008e-05, + "loss": 0.6605, + "step": 1585 + }, + { + "epoch": 0.1948642339353729, + "grad_norm": 1.4549208583768032, + "learning_rate": 2.9185278508281757e-05, + "loss": 0.6651, + "step": 1586 + }, + { + "epoch": 0.19498709915223, + "grad_norm": 1.3396714158847536, + "learning_rate": 2.9183185843923446e-05, + "loss": 0.6812, + "step": 1587 + }, + { + "epoch": 0.19510996436908712, + "grad_norm": 1.4205516359365997, + "learning_rate": 2.9181090570660086e-05, + "loss": 0.6601, + "step": 1588 + }, + { + "epoch": 0.1952328295859442, + "grad_norm": 1.535752433115311, + "learning_rate": 2.917899268887708e-05, + "loss": 0.7288, + "step": 1589 + }, + { + "epoch": 0.19535569480280132, + "grad_norm": 1.4555939672481808, + "learning_rate": 2.9176892198960324e-05, + "loss": 0.616, + "step": 1590 + }, + { + "epoch": 0.19547856001965844, + "grad_norm": 1.6822131490240755, + "learning_rate": 2.9174789101296186e-05, + "loss": 0.6132, + "step": 1591 + }, + { + "epoch": 0.19560142523651555, + "grad_norm": 1.5449470743658773, + "learning_rate": 2.9172683396271523e-05, + "loss": 0.6089, + "step": 1592 + }, + { + "epoch": 0.19572429045337264, + "grad_norm": 1.3445564784788413, + "learning_rate": 2.917057508427366e-05, + "loss": 0.6428, + "step": 1593 + }, + { + "epoch": 0.19584715567022976, + "grad_norm": 1.5500281690320337, + "learning_rate": 2.916846416569041e-05, + "loss": 0.6773, + "step": 1594 + }, + { + "epoch": 0.19597002088708687, + "grad_norm": 1.4891458988894621, + "learning_rate": 2.916635064091006e-05, + "loss": 0.6301, + "step": 1595 + }, + { + "epoch": 0.196092886103944, + "grad_norm": 1.593426660361374, + "learning_rate": 2.9164234510321387e-05, + "loss": 0.7132, + "step": 1596 + }, + { + "epoch": 0.19621575132080107, + "grad_norm": 1.6631861828897245, + "learning_rate": 2.9162115774313628e-05, + "loss": 0.6467, + "step": 1597 + }, + { + "epoch": 0.1963386165376582, + "grad_norm": 1.5597530037909644, + "learning_rate": 2.9159994433276525e-05, + "loss": 0.7454, + "step": 1598 + }, + { + "epoch": 0.1964614817545153, + "grad_norm": 1.6225931338152804, + "learning_rate": 2.9157870487600268e-05, + "loss": 0.6913, + "step": 1599 + }, + { + "epoch": 0.1965843469713724, + "grad_norm": 1.4108804250856053, + "learning_rate": 2.9155743937675556e-05, + "loss": 0.7428, + "step": 1600 + }, + { + "epoch": 0.1967072121882295, + "grad_norm": 1.5127448153613658, + "learning_rate": 2.915361478389355e-05, + "loss": 0.701, + "step": 1601 + }, + { + "epoch": 0.19683007740508662, + "grad_norm": 1.8286886070251531, + "learning_rate": 
2.9151483026645895e-05, + "loss": 0.6531, + "step": 1602 + }, + { + "epoch": 0.19695294262194374, + "grad_norm": 1.5240621458511348, + "learning_rate": 2.914934866632471e-05, + "loss": 0.6708, + "step": 1603 + }, + { + "epoch": 0.19707580783880083, + "grad_norm": 1.3688855262444584, + "learning_rate": 2.91472117033226e-05, + "loss": 0.7017, + "step": 1604 + }, + { + "epoch": 0.19719867305565794, + "grad_norm": 2.1590638973241187, + "learning_rate": 2.9145072138032648e-05, + "loss": 0.7431, + "step": 1605 + }, + { + "epoch": 0.19732153827251506, + "grad_norm": 1.8660765371331238, + "learning_rate": 2.9142929970848406e-05, + "loss": 0.706, + "step": 1606 + }, + { + "epoch": 0.19744440348937217, + "grad_norm": 1.5193081379209106, + "learning_rate": 2.9140785202163918e-05, + "loss": 0.6693, + "step": 1607 + }, + { + "epoch": 0.19756726870622926, + "grad_norm": 1.3984267071154053, + "learning_rate": 2.91386378323737e-05, + "loss": 0.5885, + "step": 1608 + }, + { + "epoch": 0.19769013392308638, + "grad_norm": 1.730707603972452, + "learning_rate": 2.9136487861872743e-05, + "loss": 0.6696, + "step": 1609 + }, + { + "epoch": 0.1978129991399435, + "grad_norm": 1.794023298084845, + "learning_rate": 2.9134335291056522e-05, + "loss": 0.7202, + "step": 1610 + }, + { + "epoch": 0.19793586435680058, + "grad_norm": 1.6472503209427598, + "learning_rate": 2.9132180120320987e-05, + "loss": 0.757, + "step": 1611 + }, + { + "epoch": 0.1980587295736577, + "grad_norm": 1.6638515515381833, + "learning_rate": 2.9130022350062573e-05, + "loss": 0.6791, + "step": 1612 + }, + { + "epoch": 0.1981815947905148, + "grad_norm": 1.7607942925792015, + "learning_rate": 2.9127861980678185e-05, + "loss": 0.7019, + "step": 1613 + }, + { + "epoch": 0.19830446000737192, + "grad_norm": 1.4053922742222311, + "learning_rate": 2.9125699012565204e-05, + "loss": 0.7324, + "step": 1614 + }, + { + "epoch": 0.198427325224229, + "grad_norm": 1.4855802277836725, + "learning_rate": 2.91235334461215e-05, + "loss": 0.7791, + "step": 1615 + }, + { + "epoch": 0.19855019044108613, + "grad_norm": 1.601259286498872, + "learning_rate": 2.912136528174541e-05, + "loss": 0.7129, + "step": 1616 + }, + { + "epoch": 0.19867305565794324, + "grad_norm": 1.3942407270162824, + "learning_rate": 2.9119194519835762e-05, + "loss": 0.6473, + "step": 1617 + }, + { + "epoch": 0.19879592087480033, + "grad_norm": 1.4677964964170078, + "learning_rate": 2.9117021160791844e-05, + "loss": 0.6139, + "step": 1618 + }, + { + "epoch": 0.19891878609165745, + "grad_norm": 1.4957720312259364, + "learning_rate": 2.9114845205013436e-05, + "loss": 0.7522, + "step": 1619 + }, + { + "epoch": 0.19904165130851456, + "grad_norm": 1.4341689364412344, + "learning_rate": 2.9112666652900796e-05, + "loss": 0.7148, + "step": 1620 + }, + { + "epoch": 0.19916451652537168, + "grad_norm": 1.7839646142469772, + "learning_rate": 2.9110485504854643e-05, + "loss": 0.8059, + "step": 1621 + }, + { + "epoch": 0.19928738174222876, + "grad_norm": 1.5799980431649903, + "learning_rate": 2.9108301761276194e-05, + "loss": 0.6312, + "step": 1622 + }, + { + "epoch": 0.19941024695908588, + "grad_norm": 1.5074495787454885, + "learning_rate": 2.910611542256713e-05, + "loss": 0.7504, + "step": 1623 + }, + { + "epoch": 0.199533112175943, + "grad_norm": 1.3902197503306994, + "learning_rate": 2.9103926489129616e-05, + "loss": 0.7071, + "step": 1624 + }, + { + "epoch": 0.1996559773928001, + "grad_norm": 1.389046690626052, + "learning_rate": 2.910173496136629e-05, + "loss": 0.6806, + "step": 1625 + }, + { + "epoch": 
0.1997788426096572, + "grad_norm": 1.7108880435381788, + "learning_rate": 2.9099540839680272e-05, + "loss": 0.7894, + "step": 1626 + }, + { + "epoch": 0.1999017078265143, + "grad_norm": 1.921771248252148, + "learning_rate": 2.9097344124475155e-05, + "loss": 0.8078, + "step": 1627 + }, + { + "epoch": 0.20002457304337143, + "grad_norm": 1.8101104696472938, + "learning_rate": 2.909514481615501e-05, + "loss": 0.7091, + "step": 1628 + }, + { + "epoch": 0.20014743826022852, + "grad_norm": 1.8057931076620009, + "learning_rate": 2.9092942915124386e-05, + "loss": 0.7527, + "step": 1629 + }, + { + "epoch": 0.20027030347708563, + "grad_norm": 1.4369971982417507, + "learning_rate": 2.909073842178831e-05, + "loss": 0.7754, + "step": 1630 + }, + { + "epoch": 0.20039316869394275, + "grad_norm": 1.6744242000126663, + "learning_rate": 2.9088531336552285e-05, + "loss": 0.7873, + "step": 1631 + }, + { + "epoch": 0.20051603391079986, + "grad_norm": 1.5133157883872566, + "learning_rate": 2.9086321659822285e-05, + "loss": 0.6318, + "step": 1632 + }, + { + "epoch": 0.20063889912765695, + "grad_norm": 1.6404257409183727, + "learning_rate": 2.908410939200477e-05, + "loss": 0.6416, + "step": 1633 + }, + { + "epoch": 0.20076176434451407, + "grad_norm": 1.6374959631074553, + "learning_rate": 2.908189453350667e-05, + "loss": 0.7259, + "step": 1634 + }, + { + "epoch": 0.20088462956137118, + "grad_norm": 1.4474412863344543, + "learning_rate": 2.9079677084735396e-05, + "loss": 0.6849, + "step": 1635 + }, + { + "epoch": 0.2010074947782283, + "grad_norm": 1.4721500781791519, + "learning_rate": 2.9077457046098833e-05, + "loss": 0.7043, + "step": 1636 + }, + { + "epoch": 0.20113035999508538, + "grad_norm": 1.3680991355322443, + "learning_rate": 2.9075234418005344e-05, + "loss": 0.7373, + "step": 1637 + }, + { + "epoch": 0.2012532252119425, + "grad_norm": 1.487452804266133, + "learning_rate": 2.907300920086376e-05, + "loss": 0.617, + "step": 1638 + }, + { + "epoch": 0.20137609042879961, + "grad_norm": 1.309122641263006, + "learning_rate": 2.90707813950834e-05, + "loss": 0.7302, + "step": 1639 + }, + { + "epoch": 0.2014989556456567, + "grad_norm": 1.6229428895960165, + "learning_rate": 2.906855100107406e-05, + "loss": 0.745, + "step": 1640 + }, + { + "epoch": 0.20162182086251382, + "grad_norm": 1.3879452780343724, + "learning_rate": 2.9066318019245994e-05, + "loss": 0.7327, + "step": 1641 + }, + { + "epoch": 0.20174468607937093, + "grad_norm": 1.3463820755968419, + "learning_rate": 2.9064082450009956e-05, + "loss": 0.6063, + "step": 1642 + }, + { + "epoch": 0.20186755129622805, + "grad_norm": 1.7694458731699079, + "learning_rate": 2.9061844293777156e-05, + "loss": 0.8044, + "step": 1643 + }, + { + "epoch": 0.20199041651308514, + "grad_norm": 1.3381725169774552, + "learning_rate": 2.9059603550959296e-05, + "loss": 0.6198, + "step": 1644 + }, + { + "epoch": 0.20211328172994225, + "grad_norm": 1.583569908139625, + "learning_rate": 2.9057360221968546e-05, + "loss": 0.7266, + "step": 1645 + }, + { + "epoch": 0.20223614694679937, + "grad_norm": 1.5902436387506211, + "learning_rate": 2.9055114307217543e-05, + "loss": 0.6147, + "step": 1646 + }, + { + "epoch": 0.20235901216365648, + "grad_norm": 1.2384204233225307, + "learning_rate": 2.9052865807119415e-05, + "loss": 0.6601, + "step": 1647 + }, + { + "epoch": 0.20248187738051357, + "grad_norm": 1.283595881513416, + "learning_rate": 2.905061472208776e-05, + "loss": 0.6981, + "step": 1648 + }, + { + "epoch": 0.20260474259737069, + "grad_norm": 1.6211949007645106, + "learning_rate": 
2.9048361052536644e-05, + "loss": 0.7928, + "step": 1649 + }, + { + "epoch": 0.2027276078142278, + "grad_norm": 1.7813330700334447, + "learning_rate": 2.904610479888062e-05, + "loss": 0.6544, + "step": 1650 + }, + { + "epoch": 0.2028504730310849, + "grad_norm": 1.4553175051762477, + "learning_rate": 2.9043845961534713e-05, + "loss": 0.6717, + "step": 1651 + }, + { + "epoch": 0.202973338247942, + "grad_norm": 1.4379750392426947, + "learning_rate": 2.904158454091442e-05, + "loss": 0.6424, + "step": 1652 + }, + { + "epoch": 0.20309620346479912, + "grad_norm": 1.3874089102540252, + "learning_rate": 2.9039320537435706e-05, + "loss": 0.6969, + "step": 1653 + }, + { + "epoch": 0.20321906868165623, + "grad_norm": 1.4189976041542967, + "learning_rate": 2.9037053951515036e-05, + "loss": 0.6535, + "step": 1654 + }, + { + "epoch": 0.20334193389851332, + "grad_norm": 1.3806059748873656, + "learning_rate": 2.9034784783569324e-05, + "loss": 0.7327, + "step": 1655 + }, + { + "epoch": 0.20346479911537044, + "grad_norm": 1.4983034351050566, + "learning_rate": 2.9032513034015965e-05, + "loss": 0.7382, + "step": 1656 + }, + { + "epoch": 0.20358766433222755, + "grad_norm": 1.5704116166851976, + "learning_rate": 2.903023870327284e-05, + "loss": 0.73, + "step": 1657 + }, + { + "epoch": 0.20371052954908467, + "grad_norm": 1.6151523676181114, + "learning_rate": 2.90279617917583e-05, + "loss": 0.7412, + "step": 1658 + }, + { + "epoch": 0.20383339476594176, + "grad_norm": 1.4303070173605275, + "learning_rate": 2.9025682299891154e-05, + "loss": 0.6355, + "step": 1659 + }, + { + "epoch": 0.20395625998279887, + "grad_norm": 1.5155897888626866, + "learning_rate": 2.902340022809071e-05, + "loss": 0.7118, + "step": 1660 + }, + { + "epoch": 0.204079125199656, + "grad_norm": 1.1877163078645256, + "learning_rate": 2.9021115576776745e-05, + "loss": 0.7047, + "step": 1661 + }, + { + "epoch": 0.20420199041651307, + "grad_norm": 1.6650373555078317, + "learning_rate": 2.9018828346369496e-05, + "loss": 0.6151, + "step": 1662 + }, + { + "epoch": 0.2043248556333702, + "grad_norm": 1.6152373801219506, + "learning_rate": 2.9016538537289688e-05, + "loss": 0.7038, + "step": 1663 + }, + { + "epoch": 0.2044477208502273, + "grad_norm": 1.648680790409684, + "learning_rate": 2.901424614995852e-05, + "loss": 0.6862, + "step": 1664 + }, + { + "epoch": 0.20457058606708442, + "grad_norm": 1.5298108437265414, + "learning_rate": 2.901195118479765e-05, + "loss": 0.7646, + "step": 1665 + }, + { + "epoch": 0.2046934512839415, + "grad_norm": 1.3332001841817516, + "learning_rate": 2.900965364222924e-05, + "loss": 0.7028, + "step": 1666 + }, + { + "epoch": 0.20481631650079862, + "grad_norm": 1.520768631073332, + "learning_rate": 2.900735352267589e-05, + "loss": 0.7101, + "step": 1667 + }, + { + "epoch": 0.20493918171765574, + "grad_norm": 1.3152825982927783, + "learning_rate": 2.9005050826560704e-05, + "loss": 0.6148, + "step": 1668 + }, + { + "epoch": 0.20506204693451283, + "grad_norm": 1.5678536764861069, + "learning_rate": 2.9002745554307247e-05, + "loss": 0.6781, + "step": 1669 + }, + { + "epoch": 0.20518491215136994, + "grad_norm": 1.3813906838666152, + "learning_rate": 2.900043770633955e-05, + "loss": 0.6862, + "step": 1670 + }, + { + "epoch": 0.20530777736822706, + "grad_norm": 1.3079929022763197, + "learning_rate": 2.8998127283082138e-05, + "loss": 0.6354, + "step": 1671 + }, + { + "epoch": 0.20543064258508417, + "grad_norm": 1.4832568552529857, + "learning_rate": 2.8995814284959992e-05, + "loss": 0.6236, + "step": 1672 + }, + { + "epoch": 
0.20555350780194126, + "grad_norm": 1.6295987795734126, + "learning_rate": 2.8993498712398575e-05, + "loss": 0.6912, + "step": 1673 + }, + { + "epoch": 0.20567637301879838, + "grad_norm": 1.4125495836035356, + "learning_rate": 2.8991180565823823e-05, + "loss": 0.6692, + "step": 1674 + }, + { + "epoch": 0.2057992382356555, + "grad_norm": 1.3545420821753, + "learning_rate": 2.8988859845662137e-05, + "loss": 0.6491, + "step": 1675 + }, + { + "epoch": 0.2059221034525126, + "grad_norm": 1.5546704341036024, + "learning_rate": 2.8986536552340406e-05, + "loss": 0.6654, + "step": 1676 + }, + { + "epoch": 0.2060449686693697, + "grad_norm": 1.492660865361205, + "learning_rate": 2.8984210686285982e-05, + "loss": 0.784, + "step": 1677 + }, + { + "epoch": 0.2061678338862268, + "grad_norm": 1.5339453008430162, + "learning_rate": 2.8981882247926695e-05, + "loss": 0.6454, + "step": 1678 + }, + { + "epoch": 0.20629069910308392, + "grad_norm": 1.6822679833947132, + "learning_rate": 2.897955123769084e-05, + "loss": 0.6576, + "step": 1679 + }, + { + "epoch": 0.206413564319941, + "grad_norm": 1.4328013377571083, + "learning_rate": 2.8977217656007198e-05, + "loss": 0.708, + "step": 1680 + }, + { + "epoch": 0.20653642953679813, + "grad_norm": 1.5427939691784718, + "learning_rate": 2.897488150330501e-05, + "loss": 0.5728, + "step": 1681 + }, + { + "epoch": 0.20665929475365524, + "grad_norm": 1.6820652315736742, + "learning_rate": 2.8972542780014002e-05, + "loss": 0.7099, + "step": 1682 + }, + { + "epoch": 0.20678215997051236, + "grad_norm": 1.6925280981348614, + "learning_rate": 2.8970201486564367e-05, + "loss": 0.7588, + "step": 1683 + }, + { + "epoch": 0.20690502518736945, + "grad_norm": 1.6081883107423713, + "learning_rate": 2.896785762338677e-05, + "loss": 0.6657, + "step": 1684 + }, + { + "epoch": 0.20702789040422656, + "grad_norm": 1.3206277129869408, + "learning_rate": 2.8965511190912342e-05, + "loss": 0.6172, + "step": 1685 + }, + { + "epoch": 0.20715075562108368, + "grad_norm": 1.6377462720759686, + "learning_rate": 2.89631621895727e-05, + "loss": 0.7675, + "step": 1686 + }, + { + "epoch": 0.2072736208379408, + "grad_norm": 1.3512720596998788, + "learning_rate": 2.8960810619799933e-05, + "loss": 0.8258, + "step": 1687 + }, + { + "epoch": 0.20739648605479788, + "grad_norm": 1.3218751814536698, + "learning_rate": 2.8958456482026586e-05, + "loss": 0.6191, + "step": 1688 + }, + { + "epoch": 0.207519351271655, + "grad_norm": 1.8032349722544163, + "learning_rate": 2.8956099776685695e-05, + "loss": 0.7192, + "step": 1689 + }, + { + "epoch": 0.2076422164885121, + "grad_norm": 1.4750747974460565, + "learning_rate": 2.895374050421076e-05, + "loss": 0.6765, + "step": 1690 + }, + { + "epoch": 0.2077650817053692, + "grad_norm": 1.5287990582693514, + "learning_rate": 2.8951378665035754e-05, + "loss": 0.6421, + "step": 1691 + }, + { + "epoch": 0.2078879469222263, + "grad_norm": 1.2977201443061555, + "learning_rate": 2.894901425959512e-05, + "loss": 0.5847, + "step": 1692 + }, + { + "epoch": 0.20801081213908343, + "grad_norm": 1.7088501750159508, + "learning_rate": 2.894664728832377e-05, + "loss": 0.7014, + "step": 1693 + }, + { + "epoch": 0.20813367735594054, + "grad_norm": 1.6431184004192918, + "learning_rate": 2.8944277751657106e-05, + "loss": 0.5736, + "step": 1694 + }, + { + "epoch": 0.20825654257279763, + "grad_norm": 1.2948597164135192, + "learning_rate": 2.894190565003097e-05, + "loss": 0.6285, + "step": 1695 + }, + { + "epoch": 0.20837940778965475, + "grad_norm": 1.4769031727677895, + "learning_rate": 
2.893953098388172e-05, + "loss": 0.6046, + "step": 1696 + }, + { + "epoch": 0.20850227300651186, + "grad_norm": 1.4780181675878536, + "learning_rate": 2.8937153753646138e-05, + "loss": 0.6234, + "step": 1697 + }, + { + "epoch": 0.20862513822336898, + "grad_norm": 1.3846827088364708, + "learning_rate": 2.8934773959761512e-05, + "loss": 0.6055, + "step": 1698 + }, + { + "epoch": 0.20874800344022607, + "grad_norm": 1.411126940256544, + "learning_rate": 2.8932391602665585e-05, + "loss": 0.8787, + "step": 1699 + }, + { + "epoch": 0.20887086865708318, + "grad_norm": 1.3921815278284932, + "learning_rate": 2.8930006682796578e-05, + "loss": 0.6707, + "step": 1700 + }, + { + "epoch": 0.2089937338739403, + "grad_norm": 1.5844157918780364, + "learning_rate": 2.892761920059318e-05, + "loss": 0.6061, + "step": 1701 + }, + { + "epoch": 0.20911659909079738, + "grad_norm": 1.4600790001930777, + "learning_rate": 2.8925229156494553e-05, + "loss": 0.6653, + "step": 1702 + }, + { + "epoch": 0.2092394643076545, + "grad_norm": 1.6150312724381297, + "learning_rate": 2.892283655094033e-05, + "loss": 0.731, + "step": 1703 + }, + { + "epoch": 0.20936232952451161, + "grad_norm": 1.4590384021555642, + "learning_rate": 2.892044138437062e-05, + "loss": 0.6231, + "step": 1704 + }, + { + "epoch": 0.20948519474136873, + "grad_norm": 1.4312947639080624, + "learning_rate": 2.8918043657225994e-05, + "loss": 0.7578, + "step": 1705 + }, + { + "epoch": 0.20960805995822582, + "grad_norm": 1.615633210017766, + "learning_rate": 2.8915643369947497e-05, + "loss": 0.6215, + "step": 1706 + }, + { + "epoch": 0.20973092517508293, + "grad_norm": 1.653993153982696, + "learning_rate": 2.891324052297665e-05, + "loss": 0.6563, + "step": 1707 + }, + { + "epoch": 0.20985379039194005, + "grad_norm": 1.3667664419908012, + "learning_rate": 2.891083511675544e-05, + "loss": 0.6844, + "step": 1708 + }, + { + "epoch": 0.20997665560879714, + "grad_norm": 1.6737442186169018, + "learning_rate": 2.8908427151726325e-05, + "loss": 0.7301, + "step": 1709 + }, + { + "epoch": 0.21009952082565425, + "grad_norm": 1.4057509236606869, + "learning_rate": 2.8906016628332233e-05, + "loss": 0.6721, + "step": 1710 + }, + { + "epoch": 0.21022238604251137, + "grad_norm": 1.507623090861582, + "learning_rate": 2.8903603547016565e-05, + "loss": 0.6401, + "step": 1711 + }, + { + "epoch": 0.21034525125936848, + "grad_norm": 1.5914086023911689, + "learning_rate": 2.8901187908223195e-05, + "loss": 0.6029, + "step": 1712 + }, + { + "epoch": 0.21046811647622557, + "grad_norm": 1.6610035575577686, + "learning_rate": 2.8898769712396458e-05, + "loss": 0.8083, + "step": 1713 + }, + { + "epoch": 0.21059098169308269, + "grad_norm": 1.6287325315481873, + "learning_rate": 2.8896348959981173e-05, + "loss": 0.6621, + "step": 1714 + }, + { + "epoch": 0.2107138469099398, + "grad_norm": 1.3682318121893053, + "learning_rate": 2.8893925651422614e-05, + "loss": 0.6496, + "step": 1715 + }, + { + "epoch": 0.21083671212679692, + "grad_norm": 1.3551051674257304, + "learning_rate": 2.8891499787166542e-05, + "loss": 0.7096, + "step": 1716 + }, + { + "epoch": 0.210959577343654, + "grad_norm": 1.4646539105025689, + "learning_rate": 2.8889071367659172e-05, + "loss": 0.6574, + "step": 1717 + }, + { + "epoch": 0.21108244256051112, + "grad_norm": 1.5297321968854536, + "learning_rate": 2.8886640393347195e-05, + "loss": 0.6508, + "step": 1718 + }, + { + "epoch": 0.21120530777736823, + "grad_norm": 1.444121528999513, + "learning_rate": 2.888420686467778e-05, + "loss": 0.7345, + "step": 1719 + }, + { + "epoch": 
0.21132817299422532, + "grad_norm": 1.359755292370642, + "learning_rate": 2.8881770782098547e-05, + "loss": 0.6974, + "step": 1720 + }, + { + "epoch": 0.21145103821108244, + "grad_norm": 1.390170937914581, + "learning_rate": 2.8879332146057612e-05, + "loss": 0.6322, + "step": 1721 + }, + { + "epoch": 0.21157390342793955, + "grad_norm": 1.386463572232355, + "learning_rate": 2.887689095700354e-05, + "loss": 0.696, + "step": 1722 + }, + { + "epoch": 0.21169676864479667, + "grad_norm": 1.38407009288635, + "learning_rate": 2.8874447215385365e-05, + "loss": 0.7194, + "step": 1723 + }, + { + "epoch": 0.21181963386165376, + "grad_norm": 1.2854769170260811, + "learning_rate": 2.8872000921652607e-05, + "loss": 0.5832, + "step": 1724 + }, + { + "epoch": 0.21194249907851087, + "grad_norm": 1.2962323461402037, + "learning_rate": 2.8869552076255243e-05, + "loss": 0.6037, + "step": 1725 + }, + { + "epoch": 0.212065364295368, + "grad_norm": 1.5511157855954367, + "learning_rate": 2.886710067964372e-05, + "loss": 0.6841, + "step": 1726 + }, + { + "epoch": 0.2121882295122251, + "grad_norm": 1.6900615186355474, + "learning_rate": 2.8864646732268962e-05, + "loss": 0.7129, + "step": 1727 + }, + { + "epoch": 0.2123110947290822, + "grad_norm": 1.470829333081114, + "learning_rate": 2.8862190234582348e-05, + "loss": 0.7278, + "step": 1728 + }, + { + "epoch": 0.2124339599459393, + "grad_norm": 1.5819966706250743, + "learning_rate": 2.8859731187035746e-05, + "loss": 0.8149, + "step": 1729 + }, + { + "epoch": 0.21255682516279642, + "grad_norm": 1.4206511652040512, + "learning_rate": 2.8857269590081472e-05, + "loss": 0.5662, + "step": 1730 + }, + { + "epoch": 0.2126796903796535, + "grad_norm": 1.4799409179640088, + "learning_rate": 2.885480544417232e-05, + "loss": 0.6105, + "step": 1731 + }, + { + "epoch": 0.21280255559651062, + "grad_norm": 1.5124468947449239, + "learning_rate": 2.8852338749761566e-05, + "loss": 0.7432, + "step": 1732 + }, + { + "epoch": 0.21292542081336774, + "grad_norm": 1.6110618134312789, + "learning_rate": 2.884986950730293e-05, + "loss": 0.7073, + "step": 1733 + }, + { + "epoch": 0.21304828603022485, + "grad_norm": 1.494918466398621, + "learning_rate": 2.884739771725062e-05, + "loss": 0.701, + "step": 1734 + }, + { + "epoch": 0.21317115124708194, + "grad_norm": 1.5218926694626034, + "learning_rate": 2.88449233800593e-05, + "loss": 0.6231, + "step": 1735 + }, + { + "epoch": 0.21329401646393906, + "grad_norm": 1.653000812316923, + "learning_rate": 2.8842446496184114e-05, + "loss": 0.7462, + "step": 1736 + }, + { + "epoch": 0.21341688168079617, + "grad_norm": 1.3626794496716919, + "learning_rate": 2.883996706608067e-05, + "loss": 0.6156, + "step": 1737 + }, + { + "epoch": 0.2135397468976533, + "grad_norm": 1.4132252004912156, + "learning_rate": 2.8837485090205033e-05, + "loss": 0.6293, + "step": 1738 + }, + { + "epoch": 0.21366261211451038, + "grad_norm": 1.644794069283531, + "learning_rate": 2.883500056901376e-05, + "loss": 0.7024, + "step": 1739 + }, + { + "epoch": 0.2137854773313675, + "grad_norm": 1.408983802114771, + "learning_rate": 2.883251350296385e-05, + "loss": 0.717, + "step": 1740 + }, + { + "epoch": 0.2139083425482246, + "grad_norm": 1.4665305687447852, + "learning_rate": 2.8830023892512792e-05, + "loss": 0.7488, + "step": 1741 + }, + { + "epoch": 0.2140312077650817, + "grad_norm": 1.3830222138380757, + "learning_rate": 2.8827531738118526e-05, + "loss": 0.6517, + "step": 1742 + }, + { + "epoch": 0.2141540729819388, + "grad_norm": 1.4663141184833655, + "learning_rate": 
2.8825037040239473e-05, + "loss": 0.6005, + "step": 1743 + }, + { + "epoch": 0.21427693819879592, + "grad_norm": 1.6544962013607898, + "learning_rate": 2.8822539799334513e-05, + "loss": 0.6614, + "step": 1744 + }, + { + "epoch": 0.21439980341565304, + "grad_norm": 1.7495929272576076, + "learning_rate": 2.8820040015863e-05, + "loss": 0.7829, + "step": 1745 + }, + { + "epoch": 0.21452266863251013, + "grad_norm": 1.4634542962181936, + "learning_rate": 2.8817537690284755e-05, + "loss": 0.5953, + "step": 1746 + }, + { + "epoch": 0.21464553384936724, + "grad_norm": 1.5928780033757222, + "learning_rate": 2.881503282306006e-05, + "loss": 0.7012, + "step": 1747 + }, + { + "epoch": 0.21476839906622436, + "grad_norm": 1.7188477888120064, + "learning_rate": 2.881252541464967e-05, + "loss": 0.6554, + "step": 1748 + }, + { + "epoch": 0.21489126428308147, + "grad_norm": 1.6118413090895642, + "learning_rate": 2.8810015465514808e-05, + "loss": 0.7479, + "step": 1749 + }, + { + "epoch": 0.21501412949993856, + "grad_norm": 1.7470599753442921, + "learning_rate": 2.880750297611716e-05, + "loss": 0.6362, + "step": 1750 + }, + { + "epoch": 0.21513699471679568, + "grad_norm": 1.4151079133492444, + "learning_rate": 2.8804987946918888e-05, + "loss": 0.6841, + "step": 1751 + }, + { + "epoch": 0.2152598599336528, + "grad_norm": 1.6180307610219546, + "learning_rate": 2.880247037838261e-05, + "loss": 0.6621, + "step": 1752 + }, + { + "epoch": 0.21538272515050988, + "grad_norm": 1.4359963924404517, + "learning_rate": 2.879995027097142e-05, + "loss": 0.6427, + "step": 1753 + }, + { + "epoch": 0.215505590367367, + "grad_norm": 1.3586896565007869, + "learning_rate": 2.8797427625148872e-05, + "loss": 0.6621, + "step": 1754 + }, + { + "epoch": 0.2156284555842241, + "grad_norm": 1.6493736411929218, + "learning_rate": 2.8794902441378992e-05, + "loss": 0.6716, + "step": 1755 + }, + { + "epoch": 0.21575132080108123, + "grad_norm": 1.6452055558898349, + "learning_rate": 2.8792374720126268e-05, + "loss": 0.6743, + "step": 1756 + }, + { + "epoch": 0.2158741860179383, + "grad_norm": 1.8291232347196271, + "learning_rate": 2.8789844461855665e-05, + "loss": 0.7119, + "step": 1757 + }, + { + "epoch": 0.21599705123479543, + "grad_norm": 1.5352373294254873, + "learning_rate": 2.8787311667032603e-05, + "loss": 0.7149, + "step": 1758 + }, + { + "epoch": 0.21611991645165254, + "grad_norm": 1.5047466091176325, + "learning_rate": 2.8784776336122975e-05, + "loss": 0.7167, + "step": 1759 + }, + { + "epoch": 0.21624278166850963, + "grad_norm": 1.4292318707631428, + "learning_rate": 2.8782238469593136e-05, + "loss": 0.7608, + "step": 1760 + }, + { + "epoch": 0.21636564688536675, + "grad_norm": 1.4982783802237056, + "learning_rate": 2.8779698067909907e-05, + "loss": 0.6563, + "step": 1761 + }, + { + "epoch": 0.21648851210222386, + "grad_norm": 1.6649616664077795, + "learning_rate": 2.8777155131540588e-05, + "loss": 0.601, + "step": 1762 + }, + { + "epoch": 0.21661137731908098, + "grad_norm": 1.2412184310057395, + "learning_rate": 2.877460966095293e-05, + "loss": 0.6617, + "step": 1763 + }, + { + "epoch": 0.21673424253593807, + "grad_norm": 1.2961334045033739, + "learning_rate": 2.8772061656615155e-05, + "loss": 0.6643, + "step": 1764 + }, + { + "epoch": 0.21685710775279518, + "grad_norm": 1.3738017224088097, + "learning_rate": 2.8769511118995955e-05, + "loss": 0.6603, + "step": 1765 + }, + { + "epoch": 0.2169799729696523, + "grad_norm": 1.4864887850673314, + "learning_rate": 2.8766958048564477e-05, + "loss": 0.6948, + "step": 1766 + }, + { + 
"epoch": 0.2171028381865094, + "grad_norm": 1.3171959692020287, + "learning_rate": 2.8764402445790358e-05, + "loss": 0.7002, + "step": 1767 + }, + { + "epoch": 0.2172257034033665, + "grad_norm": 1.3897416607890123, + "learning_rate": 2.8761844311143665e-05, + "loss": 0.7061, + "step": 1768 + }, + { + "epoch": 0.21734856862022361, + "grad_norm": 1.7659557145356548, + "learning_rate": 2.8759283645094964e-05, + "loss": 0.7633, + "step": 1769 + }, + { + "epoch": 0.21747143383708073, + "grad_norm": 1.5495038265499104, + "learning_rate": 2.875672044811527e-05, + "loss": 0.6362, + "step": 1770 + }, + { + "epoch": 0.21759429905393782, + "grad_norm": 1.4035436606901, + "learning_rate": 2.8754154720676063e-05, + "loss": 0.63, + "step": 1771 + }, + { + "epoch": 0.21771716427079493, + "grad_norm": 1.4889221156055759, + "learning_rate": 2.8751586463249294e-05, + "loss": 0.6905, + "step": 1772 + }, + { + "epoch": 0.21784002948765205, + "grad_norm": 1.696148356290709, + "learning_rate": 2.8749015676307378e-05, + "loss": 0.6379, + "step": 1773 + }, + { + "epoch": 0.21796289470450916, + "grad_norm": 1.2936723184721173, + "learning_rate": 2.8746442360323192e-05, + "loss": 0.6278, + "step": 1774 + }, + { + "epoch": 0.21808575992136625, + "grad_norm": 1.9392152732515384, + "learning_rate": 2.8743866515770083e-05, + "loss": 0.7102, + "step": 1775 + }, + { + "epoch": 0.21820862513822337, + "grad_norm": 2.063666802902862, + "learning_rate": 2.8741288143121862e-05, + "loss": 0.7141, + "step": 1776 + }, + { + "epoch": 0.21833149035508048, + "grad_norm": 1.456365443379661, + "learning_rate": 2.8738707242852802e-05, + "loss": 0.5908, + "step": 1777 + }, + { + "epoch": 0.2184543555719376, + "grad_norm": 1.7810228877686733, + "learning_rate": 2.8736123815437646e-05, + "loss": 0.6452, + "step": 1778 + }, + { + "epoch": 0.21857722078879468, + "grad_norm": 1.7681707080122373, + "learning_rate": 2.8733537861351592e-05, + "loss": 0.6617, + "step": 1779 + }, + { + "epoch": 0.2187000860056518, + "grad_norm": 1.612447661769684, + "learning_rate": 2.8730949381070315e-05, + "loss": 0.5422, + "step": 1780 + }, + { + "epoch": 0.21882295122250892, + "grad_norm": 1.6082381294024288, + "learning_rate": 2.8728358375069946e-05, + "loss": 0.5493, + "step": 1781 + }, + { + "epoch": 0.218945816439366, + "grad_norm": 1.622571206700607, + "learning_rate": 2.8725764843827087e-05, + "loss": 0.7665, + "step": 1782 + }, + { + "epoch": 0.21906868165622312, + "grad_norm": 1.4748839329593193, + "learning_rate": 2.8723168787818804e-05, + "loss": 0.6619, + "step": 1783 + }, + { + "epoch": 0.21919154687308023, + "grad_norm": 1.4807385238567103, + "learning_rate": 2.8720570207522613e-05, + "loss": 0.6154, + "step": 1784 + }, + { + "epoch": 0.21931441208993735, + "grad_norm": 1.3288500790531856, + "learning_rate": 2.8717969103416516e-05, + "loss": 0.6564, + "step": 1785 + }, + { + "epoch": 0.21943727730679444, + "grad_norm": 1.2675067733686276, + "learning_rate": 2.871536547597897e-05, + "loss": 0.7659, + "step": 1786 + }, + { + "epoch": 0.21956014252365155, + "grad_norm": 1.4956245316524286, + "learning_rate": 2.871275932568889e-05, + "loss": 0.6505, + "step": 1787 + }, + { + "epoch": 0.21968300774050867, + "grad_norm": 1.4780631890590037, + "learning_rate": 2.8710150653025656e-05, + "loss": 0.6954, + "step": 1788 + }, + { + "epoch": 0.21980587295736578, + "grad_norm": 1.3506483466718562, + "learning_rate": 2.870753945846913e-05, + "loss": 0.7045, + "step": 1789 + }, + { + "epoch": 0.21992873817422287, + "grad_norm": 1.4965637481819267, + 
"learning_rate": 2.8704925742499614e-05, + "loss": 0.6832, + "step": 1790 + }, + { + "epoch": 0.22005160339108, + "grad_norm": 1.4411701957827523, + "learning_rate": 2.870230950559789e-05, + "loss": 0.7041, + "step": 1791 + }, + { + "epoch": 0.2201744686079371, + "grad_norm": 1.5752583301358818, + "learning_rate": 2.8699690748245194e-05, + "loss": 0.5988, + "step": 1792 + }, + { + "epoch": 0.2202973338247942, + "grad_norm": 1.8518471474217646, + "learning_rate": 2.8697069470923233e-05, + "loss": 0.617, + "step": 1793 + }, + { + "epoch": 0.2204201990416513, + "grad_norm": 1.5954333430800083, + "learning_rate": 2.8694445674114163e-05, + "loss": 0.6451, + "step": 1794 + }, + { + "epoch": 0.22054306425850842, + "grad_norm": 1.340741659774446, + "learning_rate": 2.8691819358300633e-05, + "loss": 0.617, + "step": 1795 + }, + { + "epoch": 0.22066592947536554, + "grad_norm": 1.6268783840507084, + "learning_rate": 2.868919052396572e-05, + "loss": 0.6857, + "step": 1796 + }, + { + "epoch": 0.22078879469222262, + "grad_norm": 1.5431271528733617, + "learning_rate": 2.8686559171592987e-05, + "loss": 0.7879, + "step": 1797 + }, + { + "epoch": 0.22091165990907974, + "grad_norm": 1.3544678277513882, + "learning_rate": 2.868392530166646e-05, + "loss": 0.7673, + "step": 1798 + }, + { + "epoch": 0.22103452512593685, + "grad_norm": 1.6520752541366406, + "learning_rate": 2.8681288914670615e-05, + "loss": 0.6755, + "step": 1799 + }, + { + "epoch": 0.22115739034279394, + "grad_norm": 1.3953205395508201, + "learning_rate": 2.86786500110904e-05, + "loss": 0.6975, + "step": 1800 + }, + { + "epoch": 0.22128025555965106, + "grad_norm": 1.4240899850580566, + "learning_rate": 2.867600859141122e-05, + "loss": 0.7133, + "step": 1801 + }, + { + "epoch": 0.22140312077650817, + "grad_norm": 1.4606511771391453, + "learning_rate": 2.8673364656118962e-05, + "loss": 0.6106, + "step": 1802 + }, + { + "epoch": 0.2215259859933653, + "grad_norm": 1.507556908392169, + "learning_rate": 2.8670718205699944e-05, + "loss": 0.6209, + "step": 1803 + }, + { + "epoch": 0.22164885121022238, + "grad_norm": 1.560981927890189, + "learning_rate": 2.866806924064097e-05, + "loss": 0.6045, + "step": 1804 + }, + { + "epoch": 0.2217717164270795, + "grad_norm": 1.273827898829418, + "learning_rate": 2.86654177614293e-05, + "loss": 0.6268, + "step": 1805 + }, + { + "epoch": 0.2218945816439366, + "grad_norm": 1.3727638870448748, + "learning_rate": 2.8662763768552656e-05, + "loss": 0.6789, + "step": 1806 + }, + { + "epoch": 0.22201744686079372, + "grad_norm": 2.965088387699349, + "learning_rate": 2.8660107262499223e-05, + "loss": 0.6154, + "step": 1807 + }, + { + "epoch": 0.2221403120776508, + "grad_norm": 1.5320266932756215, + "learning_rate": 2.8657448243757646e-05, + "loss": 0.6275, + "step": 1808 + }, + { + "epoch": 0.22226317729450792, + "grad_norm": 1.4076417079411998, + "learning_rate": 2.865478671281704e-05, + "loss": 0.7576, + "step": 1809 + }, + { + "epoch": 0.22238604251136504, + "grad_norm": 1.3196260480798208, + "learning_rate": 2.865212267016697e-05, + "loss": 0.6821, + "step": 1810 + }, + { + "epoch": 0.22250890772822213, + "grad_norm": 1.4942395573439717, + "learning_rate": 2.8649456116297475e-05, + "loss": 0.6465, + "step": 1811 + }, + { + "epoch": 0.22263177294507924, + "grad_norm": 1.4094611287058796, + "learning_rate": 2.8646787051699045e-05, + "loss": 0.6689, + "step": 1812 + }, + { + "epoch": 0.22275463816193636, + "grad_norm": 1.2904676310116452, + "learning_rate": 2.8644115476862636e-05, + "loss": 0.7223, + "step": 1813 + }, + { + 
"epoch": 0.22287750337879347, + "grad_norm": 1.651307371565112, + "learning_rate": 2.8641441392279676e-05, + "loss": 0.7286, + "step": 1814 + }, + { + "epoch": 0.22300036859565056, + "grad_norm": 1.5714536274111897, + "learning_rate": 2.8638764798442037e-05, + "loss": 0.6185, + "step": 1815 + }, + { + "epoch": 0.22312323381250768, + "grad_norm": 1.5031820102894207, + "learning_rate": 2.8636085695842063e-05, + "loss": 0.7403, + "step": 1816 + }, + { + "epoch": 0.2232460990293648, + "grad_norm": 1.399536515435283, + "learning_rate": 2.863340408497256e-05, + "loss": 0.6991, + "step": 1817 + }, + { + "epoch": 0.2233689642462219, + "grad_norm": 1.3094755651613494, + "learning_rate": 2.8630719966326793e-05, + "loss": 0.7518, + "step": 1818 + }, + { + "epoch": 0.223491829463079, + "grad_norm": 1.453795310909441, + "learning_rate": 2.8628033340398484e-05, + "loss": 0.6327, + "step": 1819 + }, + { + "epoch": 0.2236146946799361, + "grad_norm": 1.5432237232195412, + "learning_rate": 2.8625344207681822e-05, + "loss": 0.6831, + "step": 1820 + }, + { + "epoch": 0.22373755989679323, + "grad_norm": 2.335319782247208, + "learning_rate": 2.8622652568671458e-05, + "loss": 0.6936, + "step": 1821 + }, + { + "epoch": 0.2238604251136503, + "grad_norm": 1.4490215956993102, + "learning_rate": 2.86199584238625e-05, + "loss": 0.627, + "step": 1822 + }, + { + "epoch": 0.22398329033050743, + "grad_norm": 1.4626782776332612, + "learning_rate": 2.861726177375052e-05, + "loss": 0.7695, + "step": 1823 + }, + { + "epoch": 0.22410615554736454, + "grad_norm": 1.6923764823402485, + "learning_rate": 2.8614562618831543e-05, + "loss": 0.632, + "step": 1824 + }, + { + "epoch": 0.22422902076422166, + "grad_norm": 1.3686704539437728, + "learning_rate": 2.861186095960207e-05, + "loss": 0.6189, + "step": 1825 + }, + { + "epoch": 0.22435188598107875, + "grad_norm": 1.4344424061312906, + "learning_rate": 2.860915679655905e-05, + "loss": 0.5722, + "step": 1826 + }, + { + "epoch": 0.22447475119793586, + "grad_norm": 1.5240052673499629, + "learning_rate": 2.8606450130199895e-05, + "loss": 0.7051, + "step": 1827 + }, + { + "epoch": 0.22459761641479298, + "grad_norm": 1.3927712366216236, + "learning_rate": 2.8603740961022475e-05, + "loss": 0.6596, + "step": 1828 + }, + { + "epoch": 0.2247204816316501, + "grad_norm": 1.5471873600885089, + "learning_rate": 2.8601029289525133e-05, + "loss": 0.6439, + "step": 1829 + }, + { + "epoch": 0.22484334684850718, + "grad_norm": 1.4924486996742614, + "learning_rate": 2.8598315116206657e-05, + "loss": 0.7835, + "step": 1830 + }, + { + "epoch": 0.2249662120653643, + "grad_norm": 1.506633874583061, + "learning_rate": 2.8595598441566304e-05, + "loss": 0.6586, + "step": 1831 + }, + { + "epoch": 0.2250890772822214, + "grad_norm": 1.4699631897961438, + "learning_rate": 2.859287926610379e-05, + "loss": 0.6096, + "step": 1832 + }, + { + "epoch": 0.2252119424990785, + "grad_norm": 1.6905102808049715, + "learning_rate": 2.859015759031929e-05, + "loss": 0.6395, + "step": 1833 + }, + { + "epoch": 0.22533480771593561, + "grad_norm": 1.6741314167365013, + "learning_rate": 2.8587433414713433e-05, + "loss": 0.6452, + "step": 1834 + }, + { + "epoch": 0.22545767293279273, + "grad_norm": 1.397473365256371, + "learning_rate": 2.8584706739787315e-05, + "loss": 0.7386, + "step": 1835 + }, + { + "epoch": 0.22558053814964985, + "grad_norm": 1.6106582101490519, + "learning_rate": 2.8581977566042495e-05, + "loss": 0.5945, + "step": 1836 + }, + { + "epoch": 0.22570340336650693, + "grad_norm": 1.5356480867436941, + "learning_rate": 
2.8579245893980984e-05, + "loss": 0.6676, + "step": 1837 + }, + { + "epoch": 0.22582626858336405, + "grad_norm": 1.3130595771816067, + "learning_rate": 2.8576511724105255e-05, + "loss": 0.711, + "step": 1838 + }, + { + "epoch": 0.22594913380022116, + "grad_norm": 1.5168601136604052, + "learning_rate": 2.857377505691824e-05, + "loss": 0.6571, + "step": 1839 + }, + { + "epoch": 0.22607199901707828, + "grad_norm": 1.6261293011707636, + "learning_rate": 2.8571035892923333e-05, + "loss": 0.6324, + "step": 1840 + }, + { + "epoch": 0.22619486423393537, + "grad_norm": 1.5098840129023414, + "learning_rate": 2.8568294232624384e-05, + "loss": 0.5987, + "step": 1841 + }, + { + "epoch": 0.22631772945079248, + "grad_norm": 1.4585375332168942, + "learning_rate": 2.856555007652571e-05, + "loss": 0.7053, + "step": 1842 + }, + { + "epoch": 0.2264405946676496, + "grad_norm": 1.6290144906544173, + "learning_rate": 2.856280342513207e-05, + "loss": 0.6671, + "step": 1843 + }, + { + "epoch": 0.22656345988450668, + "grad_norm": 1.5708948869055401, + "learning_rate": 2.8560054278948694e-05, + "loss": 0.7438, + "step": 1844 + }, + { + "epoch": 0.2266863251013638, + "grad_norm": 1.5181812268096109, + "learning_rate": 2.855730263848128e-05, + "loss": 0.7283, + "step": 1845 + }, + { + "epoch": 0.22680919031822092, + "grad_norm": 1.5222937630607194, + "learning_rate": 2.8554548504235963e-05, + "loss": 0.6848, + "step": 1846 + }, + { + "epoch": 0.22693205553507803, + "grad_norm": 1.63717186695256, + "learning_rate": 2.8551791876719357e-05, + "loss": 0.6274, + "step": 1847 + }, + { + "epoch": 0.22705492075193512, + "grad_norm": 1.4794082949391054, + "learning_rate": 2.8549032756438523e-05, + "loss": 0.7562, + "step": 1848 + }, + { + "epoch": 0.22717778596879223, + "grad_norm": 1.3567159547460232, + "learning_rate": 2.8546271143900976e-05, + "loss": 0.6093, + "step": 1849 + }, + { + "epoch": 0.22730065118564935, + "grad_norm": 1.4085643840451179, + "learning_rate": 2.8543507039614706e-05, + "loss": 0.6321, + "step": 1850 + }, + { + "epoch": 0.22742351640250644, + "grad_norm": 1.2530998619493356, + "learning_rate": 2.8540740444088148e-05, + "loss": 0.6336, + "step": 1851 + }, + { + "epoch": 0.22754638161936355, + "grad_norm": 1.8706096307005218, + "learning_rate": 2.8537971357830198e-05, + "loss": 0.7, + "step": 1852 + }, + { + "epoch": 0.22766924683622067, + "grad_norm": 1.4380390698268792, + "learning_rate": 2.853519978135022e-05, + "loss": 0.7924, + "step": 1853 + }, + { + "epoch": 0.22779211205307778, + "grad_norm": 1.4996711325471876, + "learning_rate": 2.8532425715158018e-05, + "loss": 0.665, + "step": 1854 + }, + { + "epoch": 0.22791497726993487, + "grad_norm": 1.5077874358423065, + "learning_rate": 2.8529649159763868e-05, + "loss": 0.5998, + "step": 1855 + }, + { + "epoch": 0.228037842486792, + "grad_norm": 1.4531640965162798, + "learning_rate": 2.852687011567849e-05, + "loss": 0.6146, + "step": 1856 + }, + { + "epoch": 0.2281607077036491, + "grad_norm": 1.659888400424189, + "learning_rate": 2.852408858341309e-05, + "loss": 0.774, + "step": 1857 + }, + { + "epoch": 0.22828357292050622, + "grad_norm": 1.4286494053277208, + "learning_rate": 2.8521304563479295e-05, + "loss": 0.6878, + "step": 1858 + }, + { + "epoch": 0.2284064381373633, + "grad_norm": 1.4015496633660385, + "learning_rate": 2.8518518056389217e-05, + "loss": 0.562, + "step": 1859 + }, + { + "epoch": 0.22852930335422042, + "grad_norm": 1.4366999349835166, + "learning_rate": 2.851572906265541e-05, + "loss": 0.7344, + "step": 1860 + }, + { + "epoch": 
0.22865216857107754, + "grad_norm": 1.2157170599344538, + "learning_rate": 2.8512937582790896e-05, + "loss": 0.6858, + "step": 1861 + }, + { + "epoch": 0.22877503378793462, + "grad_norm": 1.5036358484362105, + "learning_rate": 2.851014361730915e-05, + "loss": 0.6228, + "step": 1862 + }, + { + "epoch": 0.22889789900479174, + "grad_norm": 1.3508153229437634, + "learning_rate": 2.85073471667241e-05, + "loss": 0.7308, + "step": 1863 + }, + { + "epoch": 0.22902076422164885, + "grad_norm": 2.001244753692986, + "learning_rate": 2.8504548231550143e-05, + "loss": 0.8058, + "step": 1864 + }, + { + "epoch": 0.22914362943850597, + "grad_norm": 1.2961830146593696, + "learning_rate": 2.850174681230211e-05, + "loss": 0.6452, + "step": 1865 + }, + { + "epoch": 0.22926649465536306, + "grad_norm": 1.516515852490975, + "learning_rate": 2.8498942909495316e-05, + "loss": 0.5926, + "step": 1866 + }, + { + "epoch": 0.22938935987222017, + "grad_norm": 2.5403673589903, + "learning_rate": 2.849613652364552e-05, + "loss": 0.8644, + "step": 1867 + }, + { + "epoch": 0.2295122250890773, + "grad_norm": 1.2829369270450584, + "learning_rate": 2.8493327655268934e-05, + "loss": 0.6448, + "step": 1868 + }, + { + "epoch": 0.2296350903059344, + "grad_norm": 1.6178841583904429, + "learning_rate": 2.8490516304882233e-05, + "loss": 0.809, + "step": 1869 + }, + { + "epoch": 0.2297579555227915, + "grad_norm": 1.510433625584689, + "learning_rate": 2.8487702473002548e-05, + "loss": 0.755, + "step": 1870 + }, + { + "epoch": 0.2298808207396486, + "grad_norm": 4.370268235241678, + "learning_rate": 2.8484886160147463e-05, + "loss": 0.7278, + "step": 1871 + }, + { + "epoch": 0.23000368595650572, + "grad_norm": 1.4465108565121152, + "learning_rate": 2.8482067366835017e-05, + "loss": 0.6684, + "step": 1872 + }, + { + "epoch": 0.2301265511733628, + "grad_norm": 1.887210192886744, + "learning_rate": 2.847924609358372e-05, + "loss": 0.5999, + "step": 1873 + }, + { + "epoch": 0.23024941639021992, + "grad_norm": 1.4057346451767663, + "learning_rate": 2.8476422340912517e-05, + "loss": 0.6011, + "step": 1874 + }, + { + "epoch": 0.23037228160707704, + "grad_norm": 1.7446034416496725, + "learning_rate": 2.8473596109340824e-05, + "loss": 0.7953, + "step": 1875 + }, + { + "epoch": 0.23049514682393415, + "grad_norm": 1.3633189839077127, + "learning_rate": 2.8470767399388505e-05, + "loss": 0.7228, + "step": 1876 + }, + { + "epoch": 0.23061801204079124, + "grad_norm": 1.7754713428117912, + "learning_rate": 2.846793621157588e-05, + "loss": 0.8221, + "step": 1877 + }, + { + "epoch": 0.23074087725764836, + "grad_norm": 2.2716535844899033, + "learning_rate": 2.8465102546423734e-05, + "loss": 0.6776, + "step": 1878 + }, + { + "epoch": 0.23086374247450547, + "grad_norm": 1.7960038709897752, + "learning_rate": 2.84622664044533e-05, + "loss": 0.6547, + "step": 1879 + }, + { + "epoch": 0.2309866076913626, + "grad_norm": 1.496169058963142, + "learning_rate": 2.845942778618627e-05, + "loss": 0.5122, + "step": 1880 + }, + { + "epoch": 0.23110947290821968, + "grad_norm": 1.287323081903452, + "learning_rate": 2.8456586692144783e-05, + "loss": 0.6124, + "step": 1881 + }, + { + "epoch": 0.2312323381250768, + "grad_norm": 1.4827077860582607, + "learning_rate": 2.845374312285144e-05, + "loss": 0.7361, + "step": 1882 + }, + { + "epoch": 0.2313552033419339, + "grad_norm": 1.6143459011730577, + "learning_rate": 2.8450897078829305e-05, + "loss": 0.7147, + "step": 1883 + }, + { + "epoch": 0.231478068558791, + "grad_norm": 1.6004899107365949, + "learning_rate": 
2.8448048560601882e-05, + "loss": 0.6763, + "step": 1884 + }, + { + "epoch": 0.2316009337756481, + "grad_norm": 1.2148936782904822, + "learning_rate": 2.844519756869314e-05, + "loss": 0.6314, + "step": 1885 + }, + { + "epoch": 0.23172379899250523, + "grad_norm": 1.329826147481339, + "learning_rate": 2.8442344103627502e-05, + "loss": 0.6754, + "step": 1886 + }, + { + "epoch": 0.23184666420936234, + "grad_norm": 1.3399048439164958, + "learning_rate": 2.843948816592984e-05, + "loss": 0.5972, + "step": 1887 + }, + { + "epoch": 0.23196952942621943, + "grad_norm": 1.7135085008999384, + "learning_rate": 2.8436629756125486e-05, + "loss": 0.6374, + "step": 1888 + }, + { + "epoch": 0.23209239464307654, + "grad_norm": 1.258186454586413, + "learning_rate": 2.8433768874740236e-05, + "loss": 0.6523, + "step": 1889 + }, + { + "epoch": 0.23221525985993366, + "grad_norm": 1.5234352936862412, + "learning_rate": 2.843090552230032e-05, + "loss": 0.6393, + "step": 1890 + }, + { + "epoch": 0.23233812507679075, + "grad_norm": 1.4465023307084601, + "learning_rate": 2.8428039699332427e-05, + "loss": 0.6297, + "step": 1891 + }, + { + "epoch": 0.23246099029364786, + "grad_norm": 1.2853270720512924, + "learning_rate": 2.8425171406363722e-05, + "loss": 0.5533, + "step": 1892 + }, + { + "epoch": 0.23258385551050498, + "grad_norm": 1.8448448951737586, + "learning_rate": 2.8422300643921806e-05, + "loss": 0.7058, + "step": 1893 + }, + { + "epoch": 0.2327067207273621, + "grad_norm": 1.4982710264026378, + "learning_rate": 2.8419427412534727e-05, + "loss": 0.6527, + "step": 1894 + }, + { + "epoch": 0.23282958594421918, + "grad_norm": 1.2485865223947104, + "learning_rate": 2.841655171273101e-05, + "loss": 0.6914, + "step": 1895 + }, + { + "epoch": 0.2329524511610763, + "grad_norm": 1.3923614941702425, + "learning_rate": 2.8413673545039608e-05, + "loss": 0.6072, + "step": 1896 + }, + { + "epoch": 0.2330753163779334, + "grad_norm": 1.404476769690423, + "learning_rate": 2.841079290998995e-05, + "loss": 0.7326, + "step": 1897 + }, + { + "epoch": 0.23319818159479053, + "grad_norm": 1.2852237261488053, + "learning_rate": 2.840790980811191e-05, + "loss": 0.6571, + "step": 1898 + }, + { + "epoch": 0.23332104681164761, + "grad_norm": 1.3972692087599041, + "learning_rate": 2.8405024239935813e-05, + "loss": 0.729, + "step": 1899 + }, + { + "epoch": 0.23344391202850473, + "grad_norm": 1.4864621357061747, + "learning_rate": 2.840213620599244e-05, + "loss": 0.5903, + "step": 1900 + }, + { + "epoch": 0.23356677724536185, + "grad_norm": 1.3152534089536692, + "learning_rate": 2.839924570681303e-05, + "loss": 0.5965, + "step": 1901 + }, + { + "epoch": 0.23368964246221893, + "grad_norm": 1.485620835511369, + "learning_rate": 2.839635274292927e-05, + "loss": 0.7124, + "step": 1902 + }, + { + "epoch": 0.23381250767907605, + "grad_norm": 1.551200356633622, + "learning_rate": 2.83934573148733e-05, + "loss": 0.665, + "step": 1903 + }, + { + "epoch": 0.23393537289593316, + "grad_norm": 1.4198346625122482, + "learning_rate": 2.8390559423177718e-05, + "loss": 0.6506, + "step": 1904 + }, + { + "epoch": 0.23405823811279028, + "grad_norm": 1.4235842540004153, + "learning_rate": 2.8387659068375566e-05, + "loss": 0.7081, + "step": 1905 + }, + { + "epoch": 0.23418110332964737, + "grad_norm": 1.4344891158873314, + "learning_rate": 2.8384756251000354e-05, + "loss": 0.6662, + "step": 1906 + }, + { + "epoch": 0.23430396854650448, + "grad_norm": 1.5332555586173227, + "learning_rate": 2.8381850971586035e-05, + "loss": 0.66, + "step": 1907 + }, + { + "epoch": 
0.2344268337633616, + "grad_norm": 1.2967297594161404, + "learning_rate": 2.8378943230667012e-05, + "loss": 0.6624, + "step": 1908 + }, + { + "epoch": 0.2345496989802187, + "grad_norm": 1.359915295372671, + "learning_rate": 2.8376033028778145e-05, + "loss": 0.6995, + "step": 1909 + }, + { + "epoch": 0.2346725641970758, + "grad_norm": 1.3942555135273065, + "learning_rate": 2.8373120366454753e-05, + "loss": 0.6181, + "step": 1910 + }, + { + "epoch": 0.23479542941393292, + "grad_norm": 1.269484709785803, + "learning_rate": 2.8370205244232598e-05, + "loss": 0.5707, + "step": 1911 + }, + { + "epoch": 0.23491829463079003, + "grad_norm": 1.3514178140455484, + "learning_rate": 2.8367287662647894e-05, + "loss": 0.6359, + "step": 1912 + }, + { + "epoch": 0.23504115984764712, + "grad_norm": 1.3466151119197496, + "learning_rate": 2.8364367622237314e-05, + "loss": 0.6952, + "step": 1913 + }, + { + "epoch": 0.23516402506450423, + "grad_norm": 1.4843714452836088, + "learning_rate": 2.8361445123537982e-05, + "loss": 0.5425, + "step": 1914 + }, + { + "epoch": 0.23528689028136135, + "grad_norm": 1.4956555371305023, + "learning_rate": 2.8358520167087472e-05, + "loss": 0.646, + "step": 1915 + }, + { + "epoch": 0.23540975549821846, + "grad_norm": 1.4511761513252135, + "learning_rate": 2.835559275342381e-05, + "loss": 0.6671, + "step": 1916 + }, + { + "epoch": 0.23553262071507555, + "grad_norm": 1.4228126773926266, + "learning_rate": 2.8352662883085475e-05, + "loss": 0.7286, + "step": 1917 + }, + { + "epoch": 0.23565548593193267, + "grad_norm": 1.4820952365994307, + "learning_rate": 2.8349730556611394e-05, + "loss": 0.6744, + "step": 1918 + }, + { + "epoch": 0.23577835114878978, + "grad_norm": 1.5351919469200723, + "learning_rate": 2.8346795774540958e-05, + "loss": 0.6479, + "step": 1919 + }, + { + "epoch": 0.2359012163656469, + "grad_norm": 1.37057691539937, + "learning_rate": 2.8343858537414e-05, + "loss": 0.6084, + "step": 1920 + }, + { + "epoch": 0.236024081582504, + "grad_norm": 1.3661348280550993, + "learning_rate": 2.8340918845770795e-05, + "loss": 0.7809, + "step": 1921 + }, + { + "epoch": 0.2361469467993611, + "grad_norm": 1.322116260135962, + "learning_rate": 2.8337976700152087e-05, + "loss": 0.6246, + "step": 1922 + }, + { + "epoch": 0.23626981201621822, + "grad_norm": 1.381105141645205, + "learning_rate": 2.833503210109907e-05, + "loss": 0.6137, + "step": 1923 + }, + { + "epoch": 0.2363926772330753, + "grad_norm": 1.208097543212404, + "learning_rate": 2.8332085049153374e-05, + "loss": 0.6599, + "step": 1924 + }, + { + "epoch": 0.23651554244993242, + "grad_norm": 1.5428124159588716, + "learning_rate": 2.8329135544857096e-05, + "loss": 0.6494, + "step": 1925 + }, + { + "epoch": 0.23663840766678954, + "grad_norm": 1.313583034498013, + "learning_rate": 2.8326183588752778e-05, + "loss": 0.582, + "step": 1926 + }, + { + "epoch": 0.23676127288364665, + "grad_norm": 1.5473375659027344, + "learning_rate": 2.832322918138341e-05, + "loss": 0.598, + "step": 1927 + }, + { + "epoch": 0.23688413810050374, + "grad_norm": 1.7003842564348794, + "learning_rate": 2.8320272323292443e-05, + "loss": 0.6766, + "step": 1928 + }, + { + "epoch": 0.23700700331736085, + "grad_norm": 1.4806989926966707, + "learning_rate": 2.8317313015023762e-05, + "loss": 0.6477, + "step": 1929 + }, + { + "epoch": 0.23712986853421797, + "grad_norm": 1.853546081859132, + "learning_rate": 2.8314351257121724e-05, + "loss": 0.7146, + "step": 1930 + }, + { + "epoch": 0.23725273375107508, + "grad_norm": 1.558019557252877, + "learning_rate": 
2.8311387050131112e-05, + "loss": 0.6598, + "step": 1931 + }, + { + "epoch": 0.23737559896793217, + "grad_norm": 1.4023278675571245, + "learning_rate": 2.830842039459718e-05, + "loss": 0.7657, + "step": 1932 + }, + { + "epoch": 0.2374984641847893, + "grad_norm": 1.468190300270811, + "learning_rate": 2.830545129106563e-05, + "loss": 0.6496, + "step": 1933 + }, + { + "epoch": 0.2376213294016464, + "grad_norm": 1.379582875261381, + "learning_rate": 2.83024797400826e-05, + "loss": 0.5525, + "step": 1934 + }, + { + "epoch": 0.2377441946185035, + "grad_norm": 1.7511789748946138, + "learning_rate": 2.8299505742194693e-05, + "loss": 0.6082, + "step": 1935 + }, + { + "epoch": 0.2378670598353606, + "grad_norm": 1.5702784966862657, + "learning_rate": 2.829652929794895e-05, + "loss": 0.7082, + "step": 1936 + }, + { + "epoch": 0.23798992505221772, + "grad_norm": 1.4967315420014118, + "learning_rate": 2.829355040789288e-05, + "loss": 0.7402, + "step": 1937 + }, + { + "epoch": 0.23811279026907484, + "grad_norm": 1.437117788317812, + "learning_rate": 2.829056907257442e-05, + "loss": 0.778, + "step": 1938 + }, + { + "epoch": 0.23823565548593192, + "grad_norm": 1.2239769140368264, + "learning_rate": 2.8287585292541977e-05, + "loss": 0.6213, + "step": 1939 + }, + { + "epoch": 0.23835852070278904, + "grad_norm": 1.4129269706960363, + "learning_rate": 2.828459906834439e-05, + "loss": 0.607, + "step": 1940 + }, + { + "epoch": 0.23848138591964615, + "grad_norm": 1.4626347584386912, + "learning_rate": 2.828161040053096e-05, + "loss": 0.7302, + "step": 1941 + }, + { + "epoch": 0.23860425113650324, + "grad_norm": 1.3770035681366322, + "learning_rate": 2.8278619289651423e-05, + "loss": 0.7243, + "step": 1942 + }, + { + "epoch": 0.23872711635336036, + "grad_norm": 1.3957130521888303, + "learning_rate": 2.827562573625599e-05, + "loss": 0.677, + "step": 1943 + }, + { + "epoch": 0.23884998157021747, + "grad_norm": 1.5511831974565842, + "learning_rate": 2.8272629740895294e-05, + "loss": 0.7573, + "step": 1944 + }, + { + "epoch": 0.2389728467870746, + "grad_norm": 1.519656144361857, + "learning_rate": 2.8269631304120433e-05, + "loss": 0.7558, + "step": 1945 + }, + { + "epoch": 0.23909571200393168, + "grad_norm": 1.2579960576983111, + "learning_rate": 2.8266630426482956e-05, + "loss": 0.6597, + "step": 1946 + }, + { + "epoch": 0.2392185772207888, + "grad_norm": 1.7786790968784365, + "learning_rate": 2.8263627108534843e-05, + "loss": 0.6336, + "step": 1947 + }, + { + "epoch": 0.2393414424376459, + "grad_norm": 1.4270768300179948, + "learning_rate": 2.826062135082854e-05, + "loss": 0.6777, + "step": 1948 + }, + { + "epoch": 0.23946430765450302, + "grad_norm": 1.466380685865264, + "learning_rate": 2.8257613153916946e-05, + "loss": 0.6341, + "step": 1949 + }, + { + "epoch": 0.2395871728713601, + "grad_norm": 1.3061907239609516, + "learning_rate": 2.8254602518353384e-05, + "loss": 0.66, + "step": 1950 + }, + { + "epoch": 0.23971003808821723, + "grad_norm": 1.2442035326962946, + "learning_rate": 2.825158944469165e-05, + "loss": 0.6808, + "step": 1951 + }, + { + "epoch": 0.23983290330507434, + "grad_norm": 1.6694452677742735, + "learning_rate": 2.8248573933485977e-05, + "loss": 0.6226, + "step": 1952 + }, + { + "epoch": 0.23995576852193143, + "grad_norm": 1.6442217807427004, + "learning_rate": 2.824555598529105e-05, + "loss": 0.6075, + "step": 1953 + }, + { + "epoch": 0.24007863373878854, + "grad_norm": 1.4546350857553723, + "learning_rate": 2.8242535600662e-05, + "loss": 0.5795, + "step": 1954 + }, + { + "epoch": 
0.24020149895564566, + "grad_norm": 1.2840767694102855, + "learning_rate": 2.8239512780154406e-05, + "loss": 0.6792, + "step": 1955 + }, + { + "epoch": 0.24032436417250277, + "grad_norm": 1.277044929046929, + "learning_rate": 2.8236487524324298e-05, + "loss": 0.6443, + "step": 1956 + }, + { + "epoch": 0.24044722938935986, + "grad_norm": 1.6442892528511088, + "learning_rate": 2.823345983372815e-05, + "loss": 0.8002, + "step": 1957 + }, + { + "epoch": 0.24057009460621698, + "grad_norm": 1.3335844209295504, + "learning_rate": 2.8230429708922886e-05, + "loss": 0.7032, + "step": 1958 + }, + { + "epoch": 0.2406929598230741, + "grad_norm": 1.5505960780571655, + "learning_rate": 2.8227397150465884e-05, + "loss": 0.6724, + "step": 1959 + }, + { + "epoch": 0.2408158250399312, + "grad_norm": 1.498695084337685, + "learning_rate": 2.8224362158914958e-05, + "loss": 0.7534, + "step": 1960 + }, + { + "epoch": 0.2409386902567883, + "grad_norm": 1.1470341279046108, + "learning_rate": 2.8221324734828376e-05, + "loss": 0.7165, + "step": 1961 + }, + { + "epoch": 0.2410615554736454, + "grad_norm": 1.475911707098852, + "learning_rate": 2.8218284878764848e-05, + "loss": 0.6775, + "step": 1962 + }, + { + "epoch": 0.24118442069050253, + "grad_norm": 1.3115133290906498, + "learning_rate": 2.8215242591283545e-05, + "loss": 0.676, + "step": 1963 + }, + { + "epoch": 0.24130728590735961, + "grad_norm": 1.2209383075535614, + "learning_rate": 2.8212197872944067e-05, + "loss": 0.7392, + "step": 1964 + }, + { + "epoch": 0.24143015112421673, + "grad_norm": 1.6089871437004832, + "learning_rate": 2.820915072430648e-05, + "loss": 0.7093, + "step": 1965 + }, + { + "epoch": 0.24155301634107385, + "grad_norm": 1.6515281515423277, + "learning_rate": 2.8206101145931275e-05, + "loss": 0.6113, + "step": 1966 + }, + { + "epoch": 0.24167588155793096, + "grad_norm": 1.4162345428042113, + "learning_rate": 2.8203049138379415e-05, + "loss": 0.5544, + "step": 1967 + }, + { + "epoch": 0.24179874677478805, + "grad_norm": 1.3971084905694233, + "learning_rate": 2.8199994702212287e-05, + "loss": 0.7639, + "step": 1968 + }, + { + "epoch": 0.24192161199164516, + "grad_norm": 1.3855702449642475, + "learning_rate": 2.819693783799174e-05, + "loss": 0.6513, + "step": 1969 + }, + { + "epoch": 0.24204447720850228, + "grad_norm": 2.190686437100372, + "learning_rate": 2.8193878546280067e-05, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.2421673424253594, + "grad_norm": 1.5240940782536632, + "learning_rate": 2.8190816827639994e-05, + "loss": 0.7127, + "step": 1971 + }, + { + "epoch": 0.24229020764221648, + "grad_norm": 1.438662761978373, + "learning_rate": 2.8187752682634715e-05, + "loss": 0.6844, + "step": 1972 + }, + { + "epoch": 0.2424130728590736, + "grad_norm": 1.5707932477258382, + "learning_rate": 2.818468611182785e-05, + "loss": 0.7183, + "step": 1973 + }, + { + "epoch": 0.2425359380759307, + "grad_norm": 1.3053956466734793, + "learning_rate": 2.8181617115783483e-05, + "loss": 0.5957, + "step": 1974 + }, + { + "epoch": 0.2426588032927878, + "grad_norm": 1.4710345098565927, + "learning_rate": 2.8178545695066137e-05, + "loss": 0.6912, + "step": 1975 + }, + { + "epoch": 0.24278166850964492, + "grad_norm": 1.7154406904606465, + "learning_rate": 2.817547185024077e-05, + "loss": 0.6374, + "step": 1976 + }, + { + "epoch": 0.24290453372650203, + "grad_norm": 1.5493212875778228, + "learning_rate": 2.8172395581872808e-05, + "loss": 0.6356, + "step": 1977 + }, + { + "epoch": 0.24302739894335915, + "grad_norm": 1.6100947857834758, + "learning_rate": 
2.81693168905281e-05, + "loss": 0.8758, + "step": 1978 + }, + { + "epoch": 0.24315026416021623, + "grad_norm": 1.399459702540571, + "learning_rate": 2.816623577677296e-05, + "loss": 0.6635, + "step": 1979 + }, + { + "epoch": 0.24327312937707335, + "grad_norm": 1.7494671086516875, + "learning_rate": 2.8163152241174133e-05, + "loss": 0.714, + "step": 1980 + }, + { + "epoch": 0.24339599459393046, + "grad_norm": 1.2564794400626866, + "learning_rate": 2.816006628429882e-05, + "loss": 0.5881, + "step": 1981 + }, + { + "epoch": 0.24351885981078755, + "grad_norm": 1.9919130720505156, + "learning_rate": 2.8156977906714657e-05, + "loss": 0.6083, + "step": 1982 + }, + { + "epoch": 0.24364172502764467, + "grad_norm": 1.3745146721708865, + "learning_rate": 2.8153887108989734e-05, + "loss": 0.6943, + "step": 1983 + }, + { + "epoch": 0.24376459024450178, + "grad_norm": 1.268344795727946, + "learning_rate": 2.8150793891692582e-05, + "loss": 0.6086, + "step": 1984 + }, + { + "epoch": 0.2438874554613589, + "grad_norm": 1.3449328304993349, + "learning_rate": 2.8147698255392183e-05, + "loss": 0.6589, + "step": 1985 + }, + { + "epoch": 0.244010320678216, + "grad_norm": 1.439757763112672, + "learning_rate": 2.8144600200657953e-05, + "loss": 0.641, + "step": 1986 + }, + { + "epoch": 0.2441331858950731, + "grad_norm": 1.762316814535619, + "learning_rate": 2.8141499728059765e-05, + "loss": 0.7405, + "step": 1987 + }, + { + "epoch": 0.24425605111193022, + "grad_norm": 1.25115716754034, + "learning_rate": 2.8138396838167925e-05, + "loss": 0.6958, + "step": 1988 + }, + { + "epoch": 0.24437891632878733, + "grad_norm": 1.6456221819877082, + "learning_rate": 2.8135291531553192e-05, + "loss": 0.6639, + "step": 1989 + }, + { + "epoch": 0.24450178154564442, + "grad_norm": 1.382825629217359, + "learning_rate": 2.8132183808786772e-05, + "loss": 0.5401, + "step": 1990 + }, + { + "epoch": 0.24462464676250154, + "grad_norm": 1.786367532191298, + "learning_rate": 2.8129073670440297e-05, + "loss": 0.7728, + "step": 1991 + }, + { + "epoch": 0.24474751197935865, + "grad_norm": 1.3163113255729968, + "learning_rate": 2.812596111708587e-05, + "loss": 0.7267, + "step": 1992 + }, + { + "epoch": 0.24487037719621574, + "grad_norm": 1.4706158448855873, + "learning_rate": 2.8122846149296025e-05, + "loss": 0.5809, + "step": 1993 + }, + { + "epoch": 0.24499324241307285, + "grad_norm": 1.4015651795879798, + "learning_rate": 2.8119728767643725e-05, + "loss": 0.6112, + "step": 1994 + }, + { + "epoch": 0.24511610762992997, + "grad_norm": 1.1622595536295528, + "learning_rate": 2.8116608972702414e-05, + "loss": 0.6915, + "step": 1995 + }, + { + "epoch": 0.24523897284678708, + "grad_norm": 1.3986387553198485, + "learning_rate": 2.811348676504594e-05, + "loss": 0.5903, + "step": 1996 + }, + { + "epoch": 0.24536183806364417, + "grad_norm": 1.3818424229418955, + "learning_rate": 2.8110362145248617e-05, + "loss": 0.6585, + "step": 1997 + }, + { + "epoch": 0.2454847032805013, + "grad_norm": 1.4204339916761488, + "learning_rate": 2.8107235113885206e-05, + "loss": 0.7314, + "step": 1998 + }, + { + "epoch": 0.2456075684973584, + "grad_norm": 1.3194562893888564, + "learning_rate": 2.81041056715309e-05, + "loss": 0.7335, + "step": 1999 + }, + { + "epoch": 0.24573043371421552, + "grad_norm": 1.3313819364191601, + "learning_rate": 2.8100973818761332e-05, + "loss": 0.6538, + "step": 2000 + }, + { + "epoch": 0.2458532989310726, + "grad_norm": 1.3209600194662847, + "learning_rate": 2.80978395561526e-05, + "loss": 0.566, + "step": 2001 + }, + { + "epoch": 
0.24597616414792972, + "grad_norm": 1.6123311151225557, + "learning_rate": 2.8094702884281224e-05, + "loss": 0.7222, + "step": 2002 + }, + { + "epoch": 0.24609902936478684, + "grad_norm": 1.6141413148952697, + "learning_rate": 2.8091563803724172e-05, + "loss": 0.7919, + "step": 2003 + }, + { + "epoch": 0.24622189458164392, + "grad_norm": 1.4805022083182833, + "learning_rate": 2.808842231505886e-05, + "loss": 0.742, + "step": 2004 + }, + { + "epoch": 0.24634475979850104, + "grad_norm": 1.3726328526200597, + "learning_rate": 2.8085278418863142e-05, + "loss": 0.6494, + "step": 2005 + }, + { + "epoch": 0.24646762501535815, + "grad_norm": 1.5902594275810606, + "learning_rate": 2.8082132115715323e-05, + "loss": 0.6231, + "step": 2006 + }, + { + "epoch": 0.24659049023221527, + "grad_norm": 1.4146670874059721, + "learning_rate": 2.8078983406194142e-05, + "loss": 0.6781, + "step": 2007 + }, + { + "epoch": 0.24671335544907236, + "grad_norm": 1.4037907866357373, + "learning_rate": 2.8075832290878782e-05, + "loss": 0.7053, + "step": 2008 + }, + { + "epoch": 0.24683622066592947, + "grad_norm": 1.3838576241359217, + "learning_rate": 2.8072678770348876e-05, + "loss": 0.6959, + "step": 2009 + }, + { + "epoch": 0.2469590858827866, + "grad_norm": 1.3256960909047104, + "learning_rate": 2.8069522845184484e-05, + "loss": 0.7072, + "step": 2010 + }, + { + "epoch": 0.2470819510996437, + "grad_norm": 1.6990816204286645, + "learning_rate": 2.8066364515966126e-05, + "loss": 0.5782, + "step": 2011 + }, + { + "epoch": 0.2472048163165008, + "grad_norm": 1.5760441863662125, + "learning_rate": 2.8063203783274755e-05, + "loss": 0.6522, + "step": 2012 + }, + { + "epoch": 0.2473276815333579, + "grad_norm": 1.3017445847462996, + "learning_rate": 2.8060040647691765e-05, + "loss": 0.7389, + "step": 2013 + }, + { + "epoch": 0.24745054675021502, + "grad_norm": 1.5322655558793852, + "learning_rate": 2.8056875109798994e-05, + "loss": 0.6701, + "step": 2014 + }, + { + "epoch": 0.2475734119670721, + "grad_norm": 1.3239277810471157, + "learning_rate": 2.8053707170178726e-05, + "loss": 0.5927, + "step": 2015 + }, + { + "epoch": 0.24769627718392923, + "grad_norm": 1.7122023756727283, + "learning_rate": 2.8050536829413676e-05, + "loss": 0.6685, + "step": 2016 + }, + { + "epoch": 0.24781914240078634, + "grad_norm": 1.379929756253379, + "learning_rate": 2.8047364088087015e-05, + "loss": 0.672, + "step": 2017 + }, + { + "epoch": 0.24794200761764346, + "grad_norm": 1.4672385508645174, + "learning_rate": 2.8044188946782344e-05, + "loss": 0.6875, + "step": 2018 + }, + { + "epoch": 0.24806487283450054, + "grad_norm": 1.2818332175213603, + "learning_rate": 2.804101140608371e-05, + "loss": 0.6817, + "step": 2019 + }, + { + "epoch": 0.24818773805135766, + "grad_norm": 1.3443159922526535, + "learning_rate": 2.8037831466575603e-05, + "loss": 0.6257, + "step": 2020 + }, + { + "epoch": 0.24831060326821477, + "grad_norm": 1.3684341436030807, + "learning_rate": 2.8034649128842952e-05, + "loss": 0.6237, + "step": 2021 + }, + { + "epoch": 0.2484334684850719, + "grad_norm": 1.521236477562804, + "learning_rate": 2.8031464393471126e-05, + "loss": 0.765, + "step": 2022 + }, + { + "epoch": 0.24855633370192898, + "grad_norm": 1.2700414537613245, + "learning_rate": 2.8028277261045933e-05, + "loss": 0.5835, + "step": 2023 + }, + { + "epoch": 0.2486791989187861, + "grad_norm": 1.3247446259924713, + "learning_rate": 2.8025087732153634e-05, + "loss": 0.6208, + "step": 2024 + }, + { + "epoch": 0.2488020641356432, + "grad_norm": 1.7381089541706063, + 
"learning_rate": 2.802189580738092e-05, + "loss": 0.6529, + "step": 2025 + }, + { + "epoch": 0.2489249293525003, + "grad_norm": 1.5512176696744036, + "learning_rate": 2.8018701487314917e-05, + "loss": 0.7041, + "step": 2026 + }, + { + "epoch": 0.2490477945693574, + "grad_norm": 1.1850654755163519, + "learning_rate": 2.8015504772543204e-05, + "loss": 0.5769, + "step": 2027 + }, + { + "epoch": 0.24917065978621453, + "grad_norm": 1.6554833465691456, + "learning_rate": 2.8012305663653797e-05, + "loss": 0.6996, + "step": 2028 + }, + { + "epoch": 0.24929352500307164, + "grad_norm": 1.319000723517553, + "learning_rate": 2.800910416123515e-05, + "loss": 0.6831, + "step": 2029 + }, + { + "epoch": 0.24941639021992873, + "grad_norm": 1.3765260986952075, + "learning_rate": 2.8005900265876167e-05, + "loss": 0.6499, + "step": 2030 + }, + { + "epoch": 0.24953925543678585, + "grad_norm": 1.4254504164566582, + "learning_rate": 2.8002693978166166e-05, + "loss": 0.6654, + "step": 2031 + }, + { + "epoch": 0.24966212065364296, + "grad_norm": 1.5581203637553405, + "learning_rate": 2.799948529869494e-05, + "loss": 0.6108, + "step": 2032 + }, + { + "epoch": 0.24978498587050005, + "grad_norm": 1.7270384715397098, + "learning_rate": 2.7996274228052698e-05, + "loss": 0.6546, + "step": 2033 + }, + { + "epoch": 0.24990785108735716, + "grad_norm": 1.5185268134010126, + "learning_rate": 2.7993060766830093e-05, + "loss": 0.703, + "step": 2034 + }, + { + "epoch": 0.25003071630421425, + "grad_norm": 1.4411454688133087, + "learning_rate": 2.7989844915618226e-05, + "loss": 0.6304, + "step": 2035 + }, + { + "epoch": 0.25015358152107137, + "grad_norm": 1.2515171408718433, + "learning_rate": 2.7986626675008625e-05, + "loss": 0.5689, + "step": 2036 + }, + { + "epoch": 0.2502764467379285, + "grad_norm": 1.448157319298629, + "learning_rate": 2.7983406045593273e-05, + "loss": 0.6073, + "step": 2037 + }, + { + "epoch": 0.2503993119547856, + "grad_norm": 1.3144484869998798, + "learning_rate": 2.7980183027964573e-05, + "loss": 0.6572, + "step": 2038 + }, + { + "epoch": 0.2505221771716427, + "grad_norm": 1.196309578451918, + "learning_rate": 2.797695762271539e-05, + "loss": 0.5874, + "step": 2039 + }, + { + "epoch": 0.25064504238849983, + "grad_norm": 1.2855448807610967, + "learning_rate": 2.7973729830439008e-05, + "loss": 0.6337, + "step": 2040 + }, + { + "epoch": 0.25076790760535694, + "grad_norm": 1.3613547160121815, + "learning_rate": 2.797049965172916e-05, + "loss": 0.6862, + "step": 2041 + }, + { + "epoch": 0.25089077282221406, + "grad_norm": 1.4575532276815764, + "learning_rate": 2.7967267087180018e-05, + "loss": 0.6878, + "step": 2042 + }, + { + "epoch": 0.2510136380390711, + "grad_norm": 1.3916449777980728, + "learning_rate": 2.7964032137386192e-05, + "loss": 0.6505, + "step": 2043 + }, + { + "epoch": 0.25113650325592823, + "grad_norm": 1.3487525980286055, + "learning_rate": 2.796079480294273e-05, + "loss": 0.6936, + "step": 2044 + }, + { + "epoch": 0.25125936847278535, + "grad_norm": 1.317822115541718, + "learning_rate": 2.7957555084445114e-05, + "loss": 0.5685, + "step": 2045 + }, + { + "epoch": 0.25138223368964246, + "grad_norm": 1.3056679803764912, + "learning_rate": 2.7954312982489278e-05, + "loss": 0.6463, + "step": 2046 + }, + { + "epoch": 0.2515050989064996, + "grad_norm": 1.3143736319303578, + "learning_rate": 2.7951068497671582e-05, + "loss": 0.6001, + "step": 2047 + }, + { + "epoch": 0.2516279641233567, + "grad_norm": 1.5302056586043324, + "learning_rate": 2.794782163058882e-05, + "loss": 0.7196, + "step": 2048 + }, 
+ { + "epoch": 0.2517508293402138, + "grad_norm": 1.4517710429408015, + "learning_rate": 2.794457238183824e-05, + "loss": 0.6671, + "step": 2049 + }, + { + "epoch": 0.25187369455707087, + "grad_norm": 1.4189706665823618, + "learning_rate": 2.7941320752017522e-05, + "loss": 0.5962, + "step": 2050 + }, + { + "epoch": 0.251996559773928, + "grad_norm": 1.409911880614214, + "learning_rate": 2.793806674172478e-05, + "loss": 0.6532, + "step": 2051 + }, + { + "epoch": 0.2521194249907851, + "grad_norm": 1.4355712605246402, + "learning_rate": 2.7934810351558565e-05, + "loss": 0.6702, + "step": 2052 + }, + { + "epoch": 0.2522422902076422, + "grad_norm": 1.3680268738511183, + "learning_rate": 2.7931551582117868e-05, + "loss": 0.6984, + "step": 2053 + }, + { + "epoch": 0.25236515542449933, + "grad_norm": 1.5158394993907192, + "learning_rate": 2.7928290434002122e-05, + "loss": 0.7366, + "step": 2054 + }, + { + "epoch": 0.25248802064135645, + "grad_norm": 1.3276417349823233, + "learning_rate": 2.79250269078112e-05, + "loss": 0.647, + "step": 2055 + }, + { + "epoch": 0.25261088585821356, + "grad_norm": 1.1892433636539788, + "learning_rate": 2.7921761004145397e-05, + "loss": 0.618, + "step": 2056 + }, + { + "epoch": 0.2527337510750706, + "grad_norm": 1.2995502616887598, + "learning_rate": 2.7918492723605453e-05, + "loss": 0.6051, + "step": 2057 + }, + { + "epoch": 0.25285661629192774, + "grad_norm": 1.4393118315165196, + "learning_rate": 2.791522206679256e-05, + "loss": 0.7341, + "step": 2058 + }, + { + "epoch": 0.25297948150878485, + "grad_norm": 1.416351976417614, + "learning_rate": 2.7911949034308318e-05, + "loss": 0.6209, + "step": 2059 + }, + { + "epoch": 0.25310234672564197, + "grad_norm": 1.4101241333228298, + "learning_rate": 2.7908673626754794e-05, + "loss": 0.5653, + "step": 2060 + }, + { + "epoch": 0.2532252119424991, + "grad_norm": 1.2497521245860952, + "learning_rate": 2.7905395844734468e-05, + "loss": 0.5654, + "step": 2061 + }, + { + "epoch": 0.2533480771593562, + "grad_norm": 1.7816117582628301, + "learning_rate": 2.7902115688850272e-05, + "loss": 0.6599, + "step": 2062 + }, + { + "epoch": 0.2534709423762133, + "grad_norm": 1.5339936286451272, + "learning_rate": 2.789883315970557e-05, + "loss": 0.6931, + "step": 2063 + }, + { + "epoch": 0.2535938075930704, + "grad_norm": 1.336525118968716, + "learning_rate": 2.7895548257904157e-05, + "loss": 0.6429, + "step": 2064 + }, + { + "epoch": 0.2537166728099275, + "grad_norm": 1.5365976861474295, + "learning_rate": 2.789226098405028e-05, + "loss": 0.6349, + "step": 2065 + }, + { + "epoch": 0.2538395380267846, + "grad_norm": 1.5516525587852044, + "learning_rate": 2.7888971338748595e-05, + "loss": 0.6247, + "step": 2066 + }, + { + "epoch": 0.2539624032436417, + "grad_norm": 1.4106425222097636, + "learning_rate": 2.7885679322604223e-05, + "loss": 0.7662, + "step": 2067 + }, + { + "epoch": 0.25408526846049884, + "grad_norm": 1.312450836229963, + "learning_rate": 2.7882384936222703e-05, + "loss": 0.7126, + "step": 2068 + }, + { + "epoch": 0.25420813367735595, + "grad_norm": 1.4018332370863573, + "learning_rate": 2.787908818021003e-05, + "loss": 0.6215, + "step": 2069 + }, + { + "epoch": 0.25433099889421307, + "grad_norm": 1.6019722504001375, + "learning_rate": 2.78757890551726e-05, + "loss": 0.7116, + "step": 2070 + }, + { + "epoch": 0.2544538641110702, + "grad_norm": 1.4574848834263765, + "learning_rate": 2.7872487561717277e-05, + "loss": 0.7279, + "step": 2071 + }, + { + "epoch": 0.25457672932792724, + "grad_norm": 1.4984013147583055, + 
"learning_rate": 2.7869183700451352e-05, + "loss": 0.626, + "step": 2072 + }, + { + "epoch": 0.25469959454478436, + "grad_norm": 1.4312591911916634, + "learning_rate": 2.7865877471982544e-05, + "loss": 0.7502, + "step": 2073 + }, + { + "epoch": 0.2548224597616415, + "grad_norm": 1.8394043855394422, + "learning_rate": 2.786256887691901e-05, + "loss": 0.8096, + "step": 2074 + }, + { + "epoch": 0.2549453249784986, + "grad_norm": 1.6297681072444639, + "learning_rate": 2.785925791586935e-05, + "loss": 0.6899, + "step": 2075 + }, + { + "epoch": 0.2550681901953557, + "grad_norm": 1.3894080562402324, + "learning_rate": 2.785594458944259e-05, + "loss": 0.6231, + "step": 2076 + }, + { + "epoch": 0.2551910554122128, + "grad_norm": 1.2103984041485563, + "learning_rate": 2.7852628898248203e-05, + "loss": 0.6276, + "step": 2077 + }, + { + "epoch": 0.25531392062906993, + "grad_norm": 1.881288732183288, + "learning_rate": 2.7849310842896074e-05, + "loss": 0.6726, + "step": 2078 + }, + { + "epoch": 0.255436785845927, + "grad_norm": 1.3062621046781762, + "learning_rate": 2.7845990423996548e-05, + "loss": 0.6258, + "step": 2079 + }, + { + "epoch": 0.2555596510627841, + "grad_norm": 1.4450225965009083, + "learning_rate": 2.7842667642160394e-05, + "loss": 0.6036, + "step": 2080 + }, + { + "epoch": 0.2556825162796412, + "grad_norm": 1.387796452428006, + "learning_rate": 2.7839342497998813e-05, + "loss": 0.6142, + "step": 2081 + }, + { + "epoch": 0.25580538149649834, + "grad_norm": 1.440237236019156, + "learning_rate": 2.783601499212345e-05, + "loss": 0.8645, + "step": 2082 + }, + { + "epoch": 0.25592824671335546, + "grad_norm": 1.3175757504491177, + "learning_rate": 2.783268512514637e-05, + "loss": 0.5592, + "step": 2083 + }, + { + "epoch": 0.25605111193021257, + "grad_norm": 1.3847895025094408, + "learning_rate": 2.7829352897680087e-05, + "loss": 0.6797, + "step": 2084 + }, + { + "epoch": 0.2561739771470697, + "grad_norm": 1.2910575047563868, + "learning_rate": 2.782601831033754e-05, + "loss": 0.667, + "step": 2085 + }, + { + "epoch": 0.25629684236392675, + "grad_norm": 2.77514937157323, + "learning_rate": 2.7822681363732104e-05, + "loss": 0.8667, + "step": 2086 + }, + { + "epoch": 0.25641970758078386, + "grad_norm": 1.5265498536792226, + "learning_rate": 2.7819342058477584e-05, + "loss": 0.5974, + "step": 2087 + }, + { + "epoch": 0.256542572797641, + "grad_norm": 1.4163444645459913, + "learning_rate": 2.7816000395188232e-05, + "loss": 0.6379, + "step": 2088 + }, + { + "epoch": 0.2566654380144981, + "grad_norm": 1.6117640832817466, + "learning_rate": 2.7812656374478723e-05, + "loss": 0.6976, + "step": 2089 + }, + { + "epoch": 0.2567883032313552, + "grad_norm": 1.5137109393877402, + "learning_rate": 2.780930999696417e-05, + "loss": 0.5938, + "step": 2090 + }, + { + "epoch": 0.2569111684482123, + "grad_norm": 1.375929592031223, + "learning_rate": 2.7805961263260108e-05, + "loss": 0.6832, + "step": 2091 + }, + { + "epoch": 0.25703403366506944, + "grad_norm": 1.7277477749128631, + "learning_rate": 2.7802610173982523e-05, + "loss": 0.599, + "step": 2092 + }, + { + "epoch": 0.2571568988819265, + "grad_norm": 1.443486133800164, + "learning_rate": 2.7799256729747825e-05, + "loss": 0.5719, + "step": 2093 + }, + { + "epoch": 0.2572797640987836, + "grad_norm": 1.1737349500311147, + "learning_rate": 2.7795900931172856e-05, + "loss": 0.5578, + "step": 2094 + }, + { + "epoch": 0.25740262931564073, + "grad_norm": 2.219065701095662, + "learning_rate": 2.7792542778874896e-05, + "loss": 0.714, + "step": 2095 + }, + { + "epoch": 
0.25752549453249785, + "grad_norm": 1.546312990814226, + "learning_rate": 2.778918227347166e-05, + "loss": 0.7182, + "step": 2096 + }, + { + "epoch": 0.25764835974935496, + "grad_norm": 1.398153311389245, + "learning_rate": 2.778581941558128e-05, + "loss": 0.5815, + "step": 2097 + }, + { + "epoch": 0.2577712249662121, + "grad_norm": 1.6711550533205222, + "learning_rate": 2.778245420582234e-05, + "loss": 0.651, + "step": 2098 + }, + { + "epoch": 0.2578940901830692, + "grad_norm": 1.4431149125973532, + "learning_rate": 2.777908664481384e-05, + "loss": 0.6646, + "step": 2099 + }, + { + "epoch": 0.2580169553999263, + "grad_norm": 1.3542876554499295, + "learning_rate": 2.7775716733175232e-05, + "loss": 0.6063, + "step": 2100 + }, + { + "epoch": 0.25813982061678337, + "grad_norm": 1.5285540148532393, + "learning_rate": 2.7772344471526385e-05, + "loss": 0.7424, + "step": 2101 + }, + { + "epoch": 0.2582626858336405, + "grad_norm": 1.7274805340268136, + "learning_rate": 2.77689698604876e-05, + "loss": 0.7143, + "step": 2102 + }, + { + "epoch": 0.2583855510504976, + "grad_norm": 1.5490892808878842, + "learning_rate": 2.7765592900679622e-05, + "loss": 0.565, + "step": 2103 + }, + { + "epoch": 0.2585084162673547, + "grad_norm": 1.5019864981957478, + "learning_rate": 2.7762213592723616e-05, + "loss": 0.5823, + "step": 2104 + }, + { + "epoch": 0.25863128148421183, + "grad_norm": 1.4459465352960252, + "learning_rate": 2.7758831937241188e-05, + "loss": 0.6863, + "step": 2105 + }, + { + "epoch": 0.25875414670106894, + "grad_norm": 1.4372127389772587, + "learning_rate": 2.775544793485437e-05, + "loss": 0.6835, + "step": 2106 + }, + { + "epoch": 0.25887701191792606, + "grad_norm": 1.454992340907955, + "learning_rate": 2.775206158618562e-05, + "loss": 0.7744, + "step": 2107 + }, + { + "epoch": 0.2589998771347831, + "grad_norm": 1.5095289508936582, + "learning_rate": 2.7748672891857847e-05, + "loss": 0.6772, + "step": 2108 + }, + { + "epoch": 0.25912274235164023, + "grad_norm": 1.3852078540421113, + "learning_rate": 2.7745281852494373e-05, + "loss": 0.6214, + "step": 2109 + }, + { + "epoch": 0.25924560756849735, + "grad_norm": 1.444143182715382, + "learning_rate": 2.7741888468718956e-05, + "loss": 0.5929, + "step": 2110 + }, + { + "epoch": 0.25936847278535446, + "grad_norm": 1.4715099296645295, + "learning_rate": 2.773849274115579e-05, + "loss": 0.6854, + "step": 2111 + }, + { + "epoch": 0.2594913380022116, + "grad_norm": 1.4451012188790329, + "learning_rate": 2.77350946704295e-05, + "loss": 0.7399, + "step": 2112 + }, + { + "epoch": 0.2596142032190687, + "grad_norm": 1.4149925137777644, + "learning_rate": 2.7731694257165126e-05, + "loss": 0.6749, + "step": 2113 + }, + { + "epoch": 0.2597370684359258, + "grad_norm": 1.5725347801238525, + "learning_rate": 2.7728291501988173e-05, + "loss": 0.7158, + "step": 2114 + }, + { + "epoch": 0.25985993365278287, + "grad_norm": 1.7063697810859408, + "learning_rate": 2.7724886405524536e-05, + "loss": 0.7486, + "step": 2115 + }, + { + "epoch": 0.25998279886964, + "grad_norm": 1.399879803900202, + "learning_rate": 2.7721478968400573e-05, + "loss": 0.6344, + "step": 2116 + }, + { + "epoch": 0.2601056640864971, + "grad_norm": 1.5124289611093618, + "learning_rate": 2.771806919124305e-05, + "loss": 0.739, + "step": 2117 + }, + { + "epoch": 0.2602285293033542, + "grad_norm": 1.7801915748098678, + "learning_rate": 2.771465707467918e-05, + "loss": 0.6882, + "step": 2118 + }, + { + "epoch": 0.26035139452021133, + "grad_norm": 1.382162167378409, + "learning_rate": 
2.7711242619336605e-05, + "loss": 0.5842, + "step": 2119 + }, + { + "epoch": 0.26047425973706845, + "grad_norm": 1.8503044524625418, + "learning_rate": 2.7707825825843382e-05, + "loss": 0.7363, + "step": 2120 + }, + { + "epoch": 0.26059712495392556, + "grad_norm": 1.4958925788035258, + "learning_rate": 2.770440669482801e-05, + "loss": 0.7736, + "step": 2121 + }, + { + "epoch": 0.2607199901707827, + "grad_norm": 1.3263634608897843, + "learning_rate": 2.7700985226919415e-05, + "loss": 0.7854, + "step": 2122 + }, + { + "epoch": 0.26084285538763974, + "grad_norm": 1.5571784643491906, + "learning_rate": 2.769756142274696e-05, + "loss": 0.7648, + "step": 2123 + }, + { + "epoch": 0.26096572060449685, + "grad_norm": 1.4602358756805132, + "learning_rate": 2.769413528294043e-05, + "loss": 0.6341, + "step": 2124 + }, + { + "epoch": 0.26108858582135397, + "grad_norm": 1.419661500007916, + "learning_rate": 2.7690706808130037e-05, + "loss": 0.6322, + "step": 2125 + }, + { + "epoch": 0.2612114510382111, + "grad_norm": 1.5114218457355253, + "learning_rate": 2.7687275998946426e-05, + "loss": 0.53, + "step": 2126 + }, + { + "epoch": 0.2613343162550682, + "grad_norm": 1.6070198548888042, + "learning_rate": 2.768384285602068e-05, + "loss": 0.7168, + "step": 2127 + }, + { + "epoch": 0.2614571814719253, + "grad_norm": 1.3979567047885573, + "learning_rate": 2.7680407379984292e-05, + "loss": 0.5876, + "step": 2128 + }, + { + "epoch": 0.26158004668878243, + "grad_norm": 1.617948812139079, + "learning_rate": 2.7676969571469207e-05, + "loss": 0.7524, + "step": 2129 + }, + { + "epoch": 0.2617029119056395, + "grad_norm": 1.296002999164522, + "learning_rate": 2.7673529431107777e-05, + "loss": 0.5363, + "step": 2130 + }, + { + "epoch": 0.2618257771224966, + "grad_norm": 1.2524432238247738, + "learning_rate": 2.7670086959532807e-05, + "loss": 0.655, + "step": 2131 + }, + { + "epoch": 0.2619486423393537, + "grad_norm": 1.562395168839403, + "learning_rate": 2.7666642157377504e-05, + "loss": 0.6504, + "step": 2132 + }, + { + "epoch": 0.26207150755621084, + "grad_norm": 1.4467627738174942, + "learning_rate": 2.766319502527552e-05, + "loss": 0.8018, + "step": 2133 + }, + { + "epoch": 0.26219437277306795, + "grad_norm": 1.433704280435961, + "learning_rate": 2.765974556386094e-05, + "loss": 0.68, + "step": 2134 + }, + { + "epoch": 0.26231723798992507, + "grad_norm": 1.2775876661458148, + "learning_rate": 2.7656293773768262e-05, + "loss": 0.5918, + "step": 2135 + }, + { + "epoch": 0.2624401032067822, + "grad_norm": 1.3281122365921707, + "learning_rate": 2.7652839655632423e-05, + "loss": 0.6307, + "step": 2136 + }, + { + "epoch": 0.26256296842363924, + "grad_norm": 1.246159354741134, + "learning_rate": 2.764938321008879e-05, + "loss": 0.5968, + "step": 2137 + }, + { + "epoch": 0.26268583364049636, + "grad_norm": 1.2280704652836798, + "learning_rate": 2.7645924437773144e-05, + "loss": 0.671, + "step": 2138 + }, + { + "epoch": 0.2628086988573535, + "grad_norm": 1.3961619959640614, + "learning_rate": 2.764246333932171e-05, + "loss": 0.752, + "step": 2139 + }, + { + "epoch": 0.2629315640742106, + "grad_norm": 1.7375098735610799, + "learning_rate": 2.7638999915371137e-05, + "loss": 0.7658, + "step": 2140 + }, + { + "epoch": 0.2630544292910677, + "grad_norm": 1.5240563096135558, + "learning_rate": 2.7635534166558495e-05, + "loss": 0.6849, + "step": 2141 + }, + { + "epoch": 0.2631772945079248, + "grad_norm": 1.1669348625132332, + "learning_rate": 2.7632066093521283e-05, + "loss": 0.5777, + "step": 2142 + }, + { + "epoch": 
0.26330015972478193, + "grad_norm": 1.3719422974200397, + "learning_rate": 2.7628595696897443e-05, + "loss": 0.6143, + "step": 2143 + }, + { + "epoch": 0.263423024941639, + "grad_norm": 1.6675898314527708, + "learning_rate": 2.7625122977325318e-05, + "loss": 0.5698, + "step": 2144 + }, + { + "epoch": 0.2635458901584961, + "grad_norm": 1.4751562852537439, + "learning_rate": 2.76216479354437e-05, + "loss": 0.748, + "step": 2145 + }, + { + "epoch": 0.2636687553753532, + "grad_norm": 1.328346255953098, + "learning_rate": 2.76181705718918e-05, + "loss": 0.6594, + "step": 2146 + }, + { + "epoch": 0.26379162059221034, + "grad_norm": 1.3707341599022775, + "learning_rate": 2.7614690887309253e-05, + "loss": 0.6703, + "step": 2147 + }, + { + "epoch": 0.26391448580906746, + "grad_norm": 1.3637620678180664, + "learning_rate": 2.7611208882336128e-05, + "loss": 0.6051, + "step": 2148 + }, + { + "epoch": 0.26403735102592457, + "grad_norm": 1.5101264853049854, + "learning_rate": 2.760772455761291e-05, + "loss": 0.6319, + "step": 2149 + }, + { + "epoch": 0.2641602162427817, + "grad_norm": 1.532030865088229, + "learning_rate": 2.7604237913780533e-05, + "loss": 0.7999, + "step": 2150 + }, + { + "epoch": 0.2642830814596388, + "grad_norm": 1.4965143879817313, + "learning_rate": 2.7600748951480325e-05, + "loss": 0.5871, + "step": 2151 + }, + { + "epoch": 0.26440594667649586, + "grad_norm": 1.3505222969859052, + "learning_rate": 2.7597257671354072e-05, + "loss": 0.7339, + "step": 2152 + }, + { + "epoch": 0.264528811893353, + "grad_norm": 1.32156674300675, + "learning_rate": 2.7593764074043966e-05, + "loss": 0.744, + "step": 2153 + }, + { + "epoch": 0.2646516771102101, + "grad_norm": 1.3953137371066846, + "learning_rate": 2.759026816019263e-05, + "loss": 0.7325, + "step": 2154 + }, + { + "epoch": 0.2647745423270672, + "grad_norm": 1.3244712960340375, + "learning_rate": 2.7586769930443114e-05, + "loss": 0.6372, + "step": 2155 + }, + { + "epoch": 0.2648974075439243, + "grad_norm": 1.4241022912608328, + "learning_rate": 2.7583269385438903e-05, + "loss": 0.7049, + "step": 2156 + }, + { + "epoch": 0.26502027276078144, + "grad_norm": 1.4476134837306067, + "learning_rate": 2.7579766525823888e-05, + "loss": 0.6748, + "step": 2157 + }, + { + "epoch": 0.26514313797763855, + "grad_norm": 1.5122512829304895, + "learning_rate": 2.7576261352242407e-05, + "loss": 0.6322, + "step": 2158 + }, + { + "epoch": 0.2652660031944956, + "grad_norm": 1.3709312143319423, + "learning_rate": 2.757275386533921e-05, + "loss": 0.5941, + "step": 2159 + }, + { + "epoch": 0.26538886841135273, + "grad_norm": 1.3685914057260686, + "learning_rate": 2.7569244065759478e-05, + "loss": 0.5991, + "step": 2160 + }, + { + "epoch": 0.26551173362820985, + "grad_norm": 1.4371741879832187, + "learning_rate": 2.756573195414881e-05, + "loss": 0.6453, + "step": 2161 + }, + { + "epoch": 0.26563459884506696, + "grad_norm": 1.3399875365943785, + "learning_rate": 2.7562217531153248e-05, + "loss": 0.6373, + "step": 2162 + }, + { + "epoch": 0.2657574640619241, + "grad_norm": 1.6362428532937372, + "learning_rate": 2.7558700797419233e-05, + "loss": 0.5986, + "step": 2163 + }, + { + "epoch": 0.2658803292787812, + "grad_norm": 1.624435839372398, + "learning_rate": 2.755518175359365e-05, + "loss": 0.6351, + "step": 2164 + }, + { + "epoch": 0.2660031944956383, + "grad_norm": 1.5894189883737795, + "learning_rate": 2.7551660400323817e-05, + "loss": 0.6283, + "step": 2165 + }, + { + "epoch": 0.26612605971249537, + "grad_norm": 1.9640240637886188, + "learning_rate": 
2.7548136738257447e-05, + "loss": 0.6784, + "step": 2166 + }, + { + "epoch": 0.2662489249293525, + "grad_norm": 1.2648723256027277, + "learning_rate": 2.7544610768042698e-05, + "loss": 0.7275, + "step": 2167 + }, + { + "epoch": 0.2663717901462096, + "grad_norm": 1.5243444890798117, + "learning_rate": 2.754108249032816e-05, + "loss": 0.5992, + "step": 2168 + }, + { + "epoch": 0.2664946553630667, + "grad_norm": 1.369203628490957, + "learning_rate": 2.753755190576283e-05, + "loss": 0.6897, + "step": 2169 + }, + { + "epoch": 0.26661752057992383, + "grad_norm": 1.4255366041217563, + "learning_rate": 2.7534019014996132e-05, + "loss": 0.7312, + "step": 2170 + }, + { + "epoch": 0.26674038579678094, + "grad_norm": 1.3542019455930876, + "learning_rate": 2.753048381867792e-05, + "loss": 0.6616, + "step": 2171 + }, + { + "epoch": 0.26686325101363806, + "grad_norm": 1.40536225955373, + "learning_rate": 2.7526946317458474e-05, + "loss": 0.6047, + "step": 2172 + }, + { + "epoch": 0.2669861162304952, + "grad_norm": 1.5548983164370185, + "learning_rate": 2.7523406511988497e-05, + "loss": 0.6993, + "step": 2173 + }, + { + "epoch": 0.26710898144735223, + "grad_norm": 1.3069629167553187, + "learning_rate": 2.7519864402919108e-05, + "loss": 0.6328, + "step": 2174 + }, + { + "epoch": 0.26723184666420935, + "grad_norm": 1.249218018727622, + "learning_rate": 2.7516319990901857e-05, + "loss": 0.6451, + "step": 2175 + }, + { + "epoch": 0.26735471188106646, + "grad_norm": 1.6327318951105683, + "learning_rate": 2.751277327658871e-05, + "loss": 0.6234, + "step": 2176 + }, + { + "epoch": 0.2674775770979236, + "grad_norm": 1.280400376901837, + "learning_rate": 2.750922426063207e-05, + "loss": 0.6307, + "step": 2177 + }, + { + "epoch": 0.2676004423147807, + "grad_norm": 1.18334924153182, + "learning_rate": 2.7505672943684753e-05, + "loss": 0.5466, + "step": 2178 + }, + { + "epoch": 0.2677233075316378, + "grad_norm": 1.318705527983527, + "learning_rate": 2.7502119326399997e-05, + "loss": 0.6632, + "step": 2179 + }, + { + "epoch": 0.2678461727484949, + "grad_norm": 1.5530553764696429, + "learning_rate": 2.7498563409431475e-05, + "loss": 0.6759, + "step": 2180 + }, + { + "epoch": 0.267969037965352, + "grad_norm": 1.5468681472047394, + "learning_rate": 2.7495005193433266e-05, + "loss": 0.5698, + "step": 2181 + }, + { + "epoch": 0.2680919031822091, + "grad_norm": 1.3373917501063668, + "learning_rate": 2.749144467905989e-05, + "loss": 0.6481, + "step": 2182 + }, + { + "epoch": 0.2682147683990662, + "grad_norm": 1.347379878442477, + "learning_rate": 2.7487881866966268e-05, + "loss": 0.6385, + "step": 2183 + }, + { + "epoch": 0.26833763361592333, + "grad_norm": 1.4894484085228086, + "learning_rate": 2.7484316757807768e-05, + "loss": 0.6736, + "step": 2184 + }, + { + "epoch": 0.26846049883278045, + "grad_norm": 1.3940801785878614, + "learning_rate": 2.7480749352240166e-05, + "loss": 0.5683, + "step": 2185 + }, + { + "epoch": 0.26858336404963756, + "grad_norm": 1.253628126026695, + "learning_rate": 2.7477179650919654e-05, + "loss": 0.6374, + "step": 2186 + }, + { + "epoch": 0.2687062292664947, + "grad_norm": 1.2026531826786953, + "learning_rate": 2.7473607654502866e-05, + "loss": 0.6063, + "step": 2187 + }, + { + "epoch": 0.26882909448335174, + "grad_norm": 1.5534227354083239, + "learning_rate": 2.7470033363646846e-05, + "loss": 0.613, + "step": 2188 + }, + { + "epoch": 0.26895195970020885, + "grad_norm": 1.5049063646444027, + "learning_rate": 2.7466456779009058e-05, + "loss": 0.6411, + "step": 2189 + }, + { + "epoch": 
0.26907482491706597, + "grad_norm": 1.3820747766338286, + "learning_rate": 2.746287790124739e-05, + "loss": 0.7061, + "step": 2190 + }, + { + "epoch": 0.2691976901339231, + "grad_norm": 1.6070271862529732, + "learning_rate": 2.7459296731020163e-05, + "loss": 0.6266, + "step": 2191 + }, + { + "epoch": 0.2693205553507802, + "grad_norm": 1.3875573260692273, + "learning_rate": 2.7455713268986098e-05, + "loss": 0.6488, + "step": 2192 + }, + { + "epoch": 0.2694434205676373, + "grad_norm": 1.3224129011759234, + "learning_rate": 2.7452127515804357e-05, + "loss": 0.6151, + "step": 2193 + }, + { + "epoch": 0.26956628578449443, + "grad_norm": 1.4983280773667487, + "learning_rate": 2.7448539472134518e-05, + "loss": 0.6634, + "step": 2194 + }, + { + "epoch": 0.2696891510013515, + "grad_norm": 1.4755137131494507, + "learning_rate": 2.7444949138636576e-05, + "loss": 0.598, + "step": 2195 + }, + { + "epoch": 0.2698120162182086, + "grad_norm": 1.4045906381843318, + "learning_rate": 2.744135651597094e-05, + "loss": 0.6347, + "step": 2196 + }, + { + "epoch": 0.2699348814350657, + "grad_norm": 1.5764288357975977, + "learning_rate": 2.7437761604798465e-05, + "loss": 0.5907, + "step": 2197 + }, + { + "epoch": 0.27005774665192284, + "grad_norm": 1.5089574618361155, + "learning_rate": 2.74341644057804e-05, + "loss": 0.7146, + "step": 2198 + }, + { + "epoch": 0.27018061186877995, + "grad_norm": 1.3970143681680132, + "learning_rate": 2.7430564919578432e-05, + "loss": 0.6077, + "step": 2199 + }, + { + "epoch": 0.27030347708563707, + "grad_norm": 1.3561153237130887, + "learning_rate": 2.742696314685466e-05, + "loss": 0.7329, + "step": 2200 + }, + { + "epoch": 0.2704263423024942, + "grad_norm": 1.4785305669803246, + "learning_rate": 2.7423359088271614e-05, + "loss": 0.5988, + "step": 2201 + }, + { + "epoch": 0.2705492075193513, + "grad_norm": 1.3890442073773912, + "learning_rate": 2.741975274449223e-05, + "loss": 0.6184, + "step": 2202 + }, + { + "epoch": 0.27067207273620836, + "grad_norm": 1.6893708125236024, + "learning_rate": 2.7416144116179876e-05, + "loss": 0.7841, + "step": 2203 + }, + { + "epoch": 0.2707949379530655, + "grad_norm": 1.5143214431253462, + "learning_rate": 2.741253320399833e-05, + "loss": 0.791, + "step": 2204 + }, + { + "epoch": 0.2709178031699226, + "grad_norm": 1.464583745917509, + "learning_rate": 2.74089200086118e-05, + "loss": 0.6127, + "step": 2205 + }, + { + "epoch": 0.2710406683867797, + "grad_norm": 1.6665512317682036, + "learning_rate": 2.740530453068491e-05, + "loss": 0.5996, + "step": 2206 + }, + { + "epoch": 0.2711635336036368, + "grad_norm": 1.5104050447436361, + "learning_rate": 2.74016867708827e-05, + "loss": 0.8274, + "step": 2207 + }, + { + "epoch": 0.27128639882049393, + "grad_norm": 1.3081485920267748, + "learning_rate": 2.7398066729870637e-05, + "loss": 0.5848, + "step": 2208 + }, + { + "epoch": 0.27140926403735105, + "grad_norm": 1.418735511221626, + "learning_rate": 2.739444440831461e-05, + "loss": 0.6981, + "step": 2209 + }, + { + "epoch": 0.2715321292542081, + "grad_norm": 1.443231258812306, + "learning_rate": 2.7390819806880906e-05, + "loss": 0.5822, + "step": 2210 + }, + { + "epoch": 0.2716549944710652, + "grad_norm": 1.4798875722048717, + "learning_rate": 2.738719292623626e-05, + "loss": 0.6427, + "step": 2211 + }, + { + "epoch": 0.27177785968792234, + "grad_norm": 1.5740853877890568, + "learning_rate": 2.7383563767047808e-05, + "loss": 0.7005, + "step": 2212 + }, + { + "epoch": 0.27190072490477946, + "grad_norm": 1.7681786651129643, + "learning_rate": 
2.7379932329983114e-05, + "loss": 0.5989, + "step": 2213 + }, + { + "epoch": 0.27202359012163657, + "grad_norm": 1.566579669806081, + "learning_rate": 2.737629861571015e-05, + "loss": 0.7634, + "step": 2214 + }, + { + "epoch": 0.2721464553384937, + "grad_norm": 1.3531828214631005, + "learning_rate": 2.737266262489732e-05, + "loss": 0.5892, + "step": 2215 + }, + { + "epoch": 0.2722693205553508, + "grad_norm": 1.2606401907410367, + "learning_rate": 2.7369024358213436e-05, + "loss": 0.8284, + "step": 2216 + }, + { + "epoch": 0.27239218577220786, + "grad_norm": 1.362709990863083, + "learning_rate": 2.7365383816327746e-05, + "loss": 0.629, + "step": 2217 + }, + { + "epoch": 0.272515050989065, + "grad_norm": 1.364286690363534, + "learning_rate": 2.736174099990989e-05, + "loss": 0.6686, + "step": 2218 + }, + { + "epoch": 0.2726379162059221, + "grad_norm": 1.4577900650783806, + "learning_rate": 2.7358095909629947e-05, + "loss": 0.7285, + "step": 2219 + }, + { + "epoch": 0.2727607814227792, + "grad_norm": 1.5091858859348273, + "learning_rate": 2.735444854615841e-05, + "loss": 0.6542, + "step": 2220 + }, + { + "epoch": 0.2728836466396363, + "grad_norm": 1.288406891606747, + "learning_rate": 2.7350798910166176e-05, + "loss": 0.7034, + "step": 2221 + }, + { + "epoch": 0.27300651185649344, + "grad_norm": 1.3973130422488302, + "learning_rate": 2.7347147002324587e-05, + "loss": 0.6631, + "step": 2222 + }, + { + "epoch": 0.27312937707335055, + "grad_norm": 1.3391316263694792, + "learning_rate": 2.7343492823305377e-05, + "loss": 0.6642, + "step": 2223 + }, + { + "epoch": 0.27325224229020767, + "grad_norm": 1.3960303226075372, + "learning_rate": 2.7339836373780712e-05, + "loss": 0.5774, + "step": 2224 + }, + { + "epoch": 0.27337510750706473, + "grad_norm": 1.3938939888604018, + "learning_rate": 2.733617765442318e-05, + "loss": 0.5726, + "step": 2225 + }, + { + "epoch": 0.27349797272392184, + "grad_norm": 1.5178227396989306, + "learning_rate": 2.7332516665905763e-05, + "loss": 0.7074, + "step": 2226 + }, + { + "epoch": 0.27362083794077896, + "grad_norm": 1.4399313715143345, + "learning_rate": 2.732885340890189e-05, + "loss": 0.7273, + "step": 2227 + }, + { + "epoch": 0.2737437031576361, + "grad_norm": 1.3075490667355374, + "learning_rate": 2.7325187884085385e-05, + "loss": 0.7258, + "step": 2228 + }, + { + "epoch": 0.2738665683744932, + "grad_norm": 1.3994372185063837, + "learning_rate": 2.73215200921305e-05, + "loss": 0.5769, + "step": 2229 + }, + { + "epoch": 0.2739894335913503, + "grad_norm": 1.4953157567032949, + "learning_rate": 2.7317850033711903e-05, + "loss": 0.6125, + "step": 2230 + }, + { + "epoch": 0.2741122988082074, + "grad_norm": 1.398277999171455, + "learning_rate": 2.7314177709504674e-05, + "loss": 0.7705, + "step": 2231 + }, + { + "epoch": 0.2742351640250645, + "grad_norm": 1.4963035765442916, + "learning_rate": 2.7310503120184326e-05, + "loss": 0.4962, + "step": 2232 + }, + { + "epoch": 0.2743580292419216, + "grad_norm": 1.7125132494292332, + "learning_rate": 2.730682626642675e-05, + "loss": 0.756, + "step": 2233 + }, + { + "epoch": 0.2744808944587787, + "grad_norm": 1.4396561801068752, + "learning_rate": 2.7303147148908305e-05, + "loss": 0.5997, + "step": 2234 + }, + { + "epoch": 0.27460375967563583, + "grad_norm": 1.7023444207189797, + "learning_rate": 2.729946576830573e-05, + "loss": 0.6045, + "step": 2235 + }, + { + "epoch": 0.27472662489249294, + "grad_norm": 1.369104175507042, + "learning_rate": 2.7295782125296188e-05, + "loss": 0.8375, + "step": 2236 + }, + { + "epoch": 
0.27484949010935006, + "grad_norm": 1.2847284146441715, + "learning_rate": 2.7292096220557267e-05, + "loss": 0.5916, + "step": 2237 + }, + { + "epoch": 0.2749723553262072, + "grad_norm": 1.597488636836791, + "learning_rate": 2.728840805476696e-05, + "loss": 0.614, + "step": 2238 + }, + { + "epoch": 0.27509522054306423, + "grad_norm": 1.5160558212908166, + "learning_rate": 2.728471762860369e-05, + "loss": 0.5933, + "step": 2239 + }, + { + "epoch": 0.27521808575992135, + "grad_norm": 1.527198215470518, + "learning_rate": 2.728102494274628e-05, + "loss": 0.597, + "step": 2240 + }, + { + "epoch": 0.27534095097677846, + "grad_norm": 1.5974415265766118, + "learning_rate": 2.7277329997873974e-05, + "loss": 0.7841, + "step": 2241 + }, + { + "epoch": 0.2754638161936356, + "grad_norm": 1.3752038263794273, + "learning_rate": 2.727363279466644e-05, + "loss": 0.6482, + "step": 2242 + }, + { + "epoch": 0.2755866814104927, + "grad_norm": 1.4591634546320258, + "learning_rate": 2.726993333380375e-05, + "loss": 0.5563, + "step": 2243 + }, + { + "epoch": 0.2757095466273498, + "grad_norm": 1.3152996225917803, + "learning_rate": 2.7266231615966396e-05, + "loss": 0.6999, + "step": 2244 + }, + { + "epoch": 0.2758324118442069, + "grad_norm": 1.4015944989670872, + "learning_rate": 2.726252764183528e-05, + "loss": 0.6771, + "step": 2245 + }, + { + "epoch": 0.275955277061064, + "grad_norm": 1.4822846704893073, + "learning_rate": 2.7258821412091735e-05, + "loss": 0.5638, + "step": 2246 + }, + { + "epoch": 0.2760781422779211, + "grad_norm": 1.3037484199679858, + "learning_rate": 2.7255112927417494e-05, + "loss": 0.5913, + "step": 2247 + }, + { + "epoch": 0.2762010074947782, + "grad_norm": 1.4003464570600392, + "learning_rate": 2.7251402188494704e-05, + "loss": 0.7682, + "step": 2248 + }, + { + "epoch": 0.27632387271163533, + "grad_norm": 1.96633577811444, + "learning_rate": 2.7247689196005935e-05, + "loss": 0.6516, + "step": 2249 + }, + { + "epoch": 0.27644673792849245, + "grad_norm": 1.567342379400192, + "learning_rate": 2.7243973950634165e-05, + "loss": 0.6375, + "step": 2250 + }, + { + "epoch": 0.27656960314534956, + "grad_norm": 1.3111707652239746, + "learning_rate": 2.7240256453062796e-05, + "loss": 0.6489, + "step": 2251 + }, + { + "epoch": 0.2766924683622067, + "grad_norm": 1.2817226515151958, + "learning_rate": 2.7236536703975633e-05, + "loss": 0.6415, + "step": 2252 + }, + { + "epoch": 0.2768153335790638, + "grad_norm": 1.2876156554962757, + "learning_rate": 2.7232814704056902e-05, + "loss": 0.6395, + "step": 2253 + }, + { + "epoch": 0.27693819879592085, + "grad_norm": 1.1683153305133076, + "learning_rate": 2.7229090453991238e-05, + "loss": 0.7245, + "step": 2254 + }, + { + "epoch": 0.27706106401277797, + "grad_norm": 1.32949883128201, + "learning_rate": 2.722536395446369e-05, + "loss": 0.6153, + "step": 2255 + }, + { + "epoch": 0.2771839292296351, + "grad_norm": 1.4362028838830463, + "learning_rate": 2.7221635206159725e-05, + "loss": 0.6623, + "step": 2256 + }, + { + "epoch": 0.2773067944464922, + "grad_norm": 1.4403691070296074, + "learning_rate": 2.721790420976523e-05, + "loss": 0.6875, + "step": 2257 + }, + { + "epoch": 0.2774296596633493, + "grad_norm": 1.5391808806845022, + "learning_rate": 2.721417096596649e-05, + "loss": 0.636, + "step": 2258 + }, + { + "epoch": 0.27755252488020643, + "grad_norm": 1.2000198964466813, + "learning_rate": 2.7210435475450207e-05, + "loss": 0.6695, + "step": 2259 + }, + { + "epoch": 0.27767539009706355, + "grad_norm": 1.5848865359706197, + "learning_rate": 
2.7206697738903513e-05, + "loss": 0.7984, + "step": 2260 + }, + { + "epoch": 0.2777982553139206, + "grad_norm": 1.3191703828149695, + "learning_rate": 2.720295775701393e-05, + "loss": 0.5858, + "step": 2261 + }, + { + "epoch": 0.2779211205307777, + "grad_norm": 1.259645792035103, + "learning_rate": 2.719921553046941e-05, + "loss": 0.7574, + "step": 2262 + }, + { + "epoch": 0.27804398574763484, + "grad_norm": 1.604692692509029, + "learning_rate": 2.71954710599583e-05, + "loss": 0.7284, + "step": 2263 + }, + { + "epoch": 0.27816685096449195, + "grad_norm": 1.3184213667385616, + "learning_rate": 2.719172434616938e-05, + "loss": 0.563, + "step": 2264 + }, + { + "epoch": 0.27828971618134907, + "grad_norm": 1.3557467558465117, + "learning_rate": 2.718797538979184e-05, + "loss": 0.7652, + "step": 2265 + }, + { + "epoch": 0.2784125813982062, + "grad_norm": 1.49860151814819, + "learning_rate": 2.7184224191515263e-05, + "loss": 0.6647, + "step": 2266 + }, + { + "epoch": 0.2785354466150633, + "grad_norm": 1.468681278118489, + "learning_rate": 2.718047075202967e-05, + "loss": 0.7328, + "step": 2267 + }, + { + "epoch": 0.27865831183192036, + "grad_norm": 1.3934761350995364, + "learning_rate": 2.717671507202547e-05, + "loss": 0.6451, + "step": 2268 + }, + { + "epoch": 0.2787811770487775, + "grad_norm": 1.7103092459301472, + "learning_rate": 2.71729571521935e-05, + "loss": 0.6337, + "step": 2269 + }, + { + "epoch": 0.2789040422656346, + "grad_norm": 1.396357480214934, + "learning_rate": 2.716919699322501e-05, + "loss": 0.7118, + "step": 2270 + }, + { + "epoch": 0.2790269074824917, + "grad_norm": 1.352466531314236, + "learning_rate": 2.716543459581165e-05, + "loss": 0.6501, + "step": 2271 + }, + { + "epoch": 0.2791497726993488, + "grad_norm": 1.2403023601255228, + "learning_rate": 2.7161669960645493e-05, + "loss": 0.6713, + "step": 2272 + }, + { + "epoch": 0.27927263791620593, + "grad_norm": 1.4364222547289218, + "learning_rate": 2.7157903088419016e-05, + "loss": 0.7437, + "step": 2273 + }, + { + "epoch": 0.27939550313306305, + "grad_norm": 1.524130593989765, + "learning_rate": 2.7154133979825116e-05, + "loss": 0.6898, + "step": 2274 + }, + { + "epoch": 0.2795183683499201, + "grad_norm": 1.3880654711349896, + "learning_rate": 2.715036263555709e-05, + "loss": 0.7524, + "step": 2275 + }, + { + "epoch": 0.2796412335667772, + "grad_norm": 1.2973963655164118, + "learning_rate": 2.714658905630866e-05, + "loss": 0.6608, + "step": 2276 + }, + { + "epoch": 0.27976409878363434, + "grad_norm": 1.5686317266154568, + "learning_rate": 2.714281324277394e-05, + "loss": 0.7424, + "step": 2277 + }, + { + "epoch": 0.27988696400049146, + "grad_norm": 1.1566924872636943, + "learning_rate": 2.7139035195647475e-05, + "loss": 0.6246, + "step": 2278 + }, + { + "epoch": 0.28000982921734857, + "grad_norm": 1.5532158791320374, + "learning_rate": 2.7135254915624213e-05, + "loss": 0.6139, + "step": 2279 + }, + { + "epoch": 0.2801326944342057, + "grad_norm": 1.559824768815689, + "learning_rate": 2.7131472403399505e-05, + "loss": 0.6547, + "step": 2280 + }, + { + "epoch": 0.2802555596510628, + "grad_norm": 1.2841037789864103, + "learning_rate": 2.7127687659669126e-05, + "loss": 0.671, + "step": 2281 + }, + { + "epoch": 0.2803784248679199, + "grad_norm": 1.436638426174409, + "learning_rate": 2.7123900685129253e-05, + "loss": 0.721, + "step": 2282 + }, + { + "epoch": 0.280501290084777, + "grad_norm": 1.5534864197694331, + "learning_rate": 2.7120111480476476e-05, + "loss": 0.5315, + "step": 2283 + }, + { + "epoch": 0.2806241553016341, + 
"grad_norm": 1.7154422017030828, + "learning_rate": 2.7116320046407795e-05, + "loss": 0.616, + "step": 2284 + }, + { + "epoch": 0.2807470205184912, + "grad_norm": 1.4375786925806058, + "learning_rate": 2.7112526383620615e-05, + "loss": 0.7154, + "step": 2285 + }, + { + "epoch": 0.2808698857353483, + "grad_norm": 1.1746711465513362, + "learning_rate": 2.710873049281276e-05, + "loss": 0.5665, + "step": 2286 + }, + { + "epoch": 0.28099275095220544, + "grad_norm": 1.570189029348816, + "learning_rate": 2.7104932374682462e-05, + "loss": 0.69, + "step": 2287 + }, + { + "epoch": 0.28111561616906255, + "grad_norm": 1.4202025089957389, + "learning_rate": 2.7101132029928352e-05, + "loss": 0.6624, + "step": 2288 + }, + { + "epoch": 0.28123848138591967, + "grad_norm": 1.427444178913261, + "learning_rate": 2.7097329459249485e-05, + "loss": 0.7273, + "step": 2289 + }, + { + "epoch": 0.28136134660277673, + "grad_norm": 1.4241784904978274, + "learning_rate": 2.7093524663345318e-05, + "loss": 0.8499, + "step": 2290 + }, + { + "epoch": 0.28148421181963384, + "grad_norm": 1.4713752795542048, + "learning_rate": 2.7089717642915723e-05, + "loss": 0.6187, + "step": 2291 + }, + { + "epoch": 0.28160707703649096, + "grad_norm": 1.3058882868808603, + "learning_rate": 2.7085908398660966e-05, + "loss": 0.7097, + "step": 2292 + }, + { + "epoch": 0.2817299422533481, + "grad_norm": 1.3758965721411256, + "learning_rate": 2.7082096931281743e-05, + "loss": 0.6385, + "step": 2293 + }, + { + "epoch": 0.2818528074702052, + "grad_norm": 1.3319120618021292, + "learning_rate": 2.707828324147914e-05, + "loss": 0.6174, + "step": 2294 + }, + { + "epoch": 0.2819756726870623, + "grad_norm": 1.4380469209157543, + "learning_rate": 2.707446732995467e-05, + "loss": 0.6966, + "step": 2295 + }, + { + "epoch": 0.2820985379039194, + "grad_norm": 1.3507092288988436, + "learning_rate": 2.7070649197410236e-05, + "loss": 0.655, + "step": 2296 + }, + { + "epoch": 0.2822214031207765, + "grad_norm": 1.3893322993188522, + "learning_rate": 2.7066828844548166e-05, + "loss": 0.6203, + "step": 2297 + }, + { + "epoch": 0.2823442683376336, + "grad_norm": 1.2329902844485314, + "learning_rate": 2.7063006272071185e-05, + "loss": 0.5897, + "step": 2298 + }, + { + "epoch": 0.2824671335544907, + "grad_norm": 1.4148633382754057, + "learning_rate": 2.7059181480682434e-05, + "loss": 0.6296, + "step": 2299 + }, + { + "epoch": 0.28258999877134783, + "grad_norm": 1.3761880116644563, + "learning_rate": 2.7055354471085454e-05, + "loss": 0.6794, + "step": 2300 + }, + { + "epoch": 0.28271286398820494, + "grad_norm": 1.3477540043149876, + "learning_rate": 2.70515252439842e-05, + "loss": 0.6631, + "step": 2301 + }, + { + "epoch": 0.28283572920506206, + "grad_norm": 1.5942566446978674, + "learning_rate": 2.704769380008304e-05, + "loss": 0.6216, + "step": 2302 + }, + { + "epoch": 0.2829585944219192, + "grad_norm": 1.6037249340766404, + "learning_rate": 2.7043860140086728e-05, + "loss": 0.6283, + "step": 2303 + }, + { + "epoch": 0.2830814596387763, + "grad_norm": 1.6440260710232344, + "learning_rate": 2.7040024264700457e-05, + "loss": 0.6595, + "step": 2304 + }, + { + "epoch": 0.28320432485563335, + "grad_norm": 1.529381308190572, + "learning_rate": 2.70361861746298e-05, + "loss": 0.5792, + "step": 2305 + }, + { + "epoch": 0.28332719007249046, + "grad_norm": 1.4708460913099222, + "learning_rate": 2.7032345870580756e-05, + "loss": 0.6915, + "step": 2306 + }, + { + "epoch": 0.2834500552893476, + "grad_norm": 1.300770079723082, + "learning_rate": 2.7028503353259728e-05, + "loss": 
0.6247, + "step": 2307 + }, + { + "epoch": 0.2835729205062047, + "grad_norm": 1.2630262496093956, + "learning_rate": 2.702465862337351e-05, + "loss": 0.6231, + "step": 2308 + }, + { + "epoch": 0.2836957857230618, + "grad_norm": 1.2091580601209428, + "learning_rate": 2.7020811681629318e-05, + "loss": 0.6469, + "step": 2309 + }, + { + "epoch": 0.2838186509399189, + "grad_norm": 1.3101138221182682, + "learning_rate": 2.701696252873478e-05, + "loss": 0.6886, + "step": 2310 + }, + { + "epoch": 0.28394151615677604, + "grad_norm": 1.3722088659224434, + "learning_rate": 2.7013111165397912e-05, + "loss": 0.606, + "step": 2311 + }, + { + "epoch": 0.2840643813736331, + "grad_norm": 1.2536367859192785, + "learning_rate": 2.700925759232716e-05, + "loss": 0.5796, + "step": 2312 + }, + { + "epoch": 0.2841872465904902, + "grad_norm": 1.3949159627734986, + "learning_rate": 2.700540181023135e-05, + "loss": 0.6238, + "step": 2313 + }, + { + "epoch": 0.28431011180734733, + "grad_norm": 1.2500613826393803, + "learning_rate": 2.700154381981974e-05, + "loss": 0.6886, + "step": 2314 + }, + { + "epoch": 0.28443297702420445, + "grad_norm": 1.5564201644403335, + "learning_rate": 2.699768362180197e-05, + "loss": 0.6674, + "step": 2315 + }, + { + "epoch": 0.28455584224106156, + "grad_norm": 1.5824835322967588, + "learning_rate": 2.6993821216888115e-05, + "loss": 0.6397, + "step": 2316 + }, + { + "epoch": 0.2846787074579187, + "grad_norm": 1.6145329328589666, + "learning_rate": 2.6989956605788623e-05, + "loss": 0.768, + "step": 2317 + }, + { + "epoch": 0.2848015726747758, + "grad_norm": 1.5243613844587494, + "learning_rate": 2.6986089789214376e-05, + "loss": 0.7313, + "step": 2318 + }, + { + "epoch": 0.28492443789163285, + "grad_norm": 1.3811676419795467, + "learning_rate": 2.698222076787664e-05, + "loss": 0.5877, + "step": 2319 + }, + { + "epoch": 0.28504730310848997, + "grad_norm": 1.3538866291762695, + "learning_rate": 2.6978349542487102e-05, + "loss": 0.619, + "step": 2320 + }, + { + "epoch": 0.2851701683253471, + "grad_norm": 1.7751044641068865, + "learning_rate": 2.6974476113757855e-05, + "loss": 0.6507, + "step": 2321 + }, + { + "epoch": 0.2852930335422042, + "grad_norm": 1.3117933636216823, + "learning_rate": 2.6970600482401373e-05, + "loss": 0.6744, + "step": 2322 + }, + { + "epoch": 0.2854158987590613, + "grad_norm": 1.2555423562302266, + "learning_rate": 2.696672264913057e-05, + "loss": 0.5506, + "step": 2323 + }, + { + "epoch": 0.28553876397591843, + "grad_norm": 1.6563637425242683, + "learning_rate": 2.6962842614658742e-05, + "loss": 0.6712, + "step": 2324 + }, + { + "epoch": 0.28566162919277555, + "grad_norm": 1.4863520581666054, + "learning_rate": 2.6958960379699596e-05, + "loss": 0.6903, + "step": 2325 + }, + { + "epoch": 0.2857844944096326, + "grad_norm": 1.6260634847878985, + "learning_rate": 2.695507594496725e-05, + "loss": 0.6825, + "step": 2326 + }, + { + "epoch": 0.2859073596264897, + "grad_norm": 1.3358686391256134, + "learning_rate": 2.695118931117621e-05, + "loss": 0.6961, + "step": 2327 + }, + { + "epoch": 0.28603022484334684, + "grad_norm": 1.1053726509961348, + "learning_rate": 2.69473004790414e-05, + "loss": 0.592, + "step": 2328 + }, + { + "epoch": 0.28615309006020395, + "grad_norm": 1.5003164557436357, + "learning_rate": 2.6943409449278152e-05, + "loss": 0.6872, + "step": 2329 + }, + { + "epoch": 0.28627595527706107, + "grad_norm": 1.2545663428466534, + "learning_rate": 2.693951622260219e-05, + "loss": 0.6549, + "step": 2330 + }, + { + "epoch": 0.2863988204939182, + "grad_norm": 
1.4357904040583336, + "learning_rate": 2.6935620799729652e-05, + "loss": 0.54, + "step": 2331 + }, + { + "epoch": 0.2865216857107753, + "grad_norm": 1.2317550692259565, + "learning_rate": 2.6931723181377067e-05, + "loss": 0.7372, + "step": 2332 + }, + { + "epoch": 0.2866445509276324, + "grad_norm": 1.0348863509544772, + "learning_rate": 2.692782336826139e-05, + "loss": 0.6508, + "step": 2333 + }, + { + "epoch": 0.2867674161444895, + "grad_norm": 1.5867995097210663, + "learning_rate": 2.6923921361099953e-05, + "loss": 0.7803, + "step": 2334 + }, + { + "epoch": 0.2868902813613466, + "grad_norm": 1.2435371487391187, + "learning_rate": 2.6920017160610514e-05, + "loss": 0.5816, + "step": 2335 + }, + { + "epoch": 0.2870131465782037, + "grad_norm": 1.3215745007091262, + "learning_rate": 2.6916110767511223e-05, + "loss": 0.7075, + "step": 2336 + }, + { + "epoch": 0.2871360117950608, + "grad_norm": 1.2813949096540969, + "learning_rate": 2.6912202182520637e-05, + "loss": 0.6734, + "step": 2337 + }, + { + "epoch": 0.28725887701191793, + "grad_norm": 1.3874586440917138, + "learning_rate": 2.6908291406357714e-05, + "loss": 0.593, + "step": 2338 + }, + { + "epoch": 0.28738174222877505, + "grad_norm": 1.3741124690116528, + "learning_rate": 2.6904378439741806e-05, + "loss": 0.5827, + "step": 2339 + }, + { + "epoch": 0.28750460744563217, + "grad_norm": 1.4502570753607562, + "learning_rate": 2.690046328339269e-05, + "loss": 0.6545, + "step": 2340 + }, + { + "epoch": 0.2876274726624892, + "grad_norm": 1.357652974097361, + "learning_rate": 2.6896545938030532e-05, + "loss": 0.6288, + "step": 2341 + }, + { + "epoch": 0.28775033787934634, + "grad_norm": 1.4561742134742317, + "learning_rate": 2.68926264043759e-05, + "loss": 0.6229, + "step": 2342 + }, + { + "epoch": 0.28787320309620346, + "grad_norm": 1.795775069072911, + "learning_rate": 2.6888704683149768e-05, + "loss": 0.7792, + "step": 2343 + }, + { + "epoch": 0.28799606831306057, + "grad_norm": 1.2881431961787628, + "learning_rate": 2.6884780775073513e-05, + "loss": 0.6513, + "step": 2344 + }, + { + "epoch": 0.2881189335299177, + "grad_norm": 1.2304769714001682, + "learning_rate": 2.6880854680868905e-05, + "loss": 0.7874, + "step": 2345 + }, + { + "epoch": 0.2882417987467748, + "grad_norm": 1.3574911510772705, + "learning_rate": 2.687692640125813e-05, + "loss": 0.7058, + "step": 2346 + }, + { + "epoch": 0.2883646639636319, + "grad_norm": 1.3466192862412458, + "learning_rate": 2.687299593696377e-05, + "loss": 0.6884, + "step": 2347 + }, + { + "epoch": 0.288487529180489, + "grad_norm": 1.5279621880903231, + "learning_rate": 2.6869063288708807e-05, + "loss": 0.5868, + "step": 2348 + }, + { + "epoch": 0.2886103943973461, + "grad_norm": 1.3339210736952853, + "learning_rate": 2.6865128457216623e-05, + "loss": 0.7545, + "step": 2349 + }, + { + "epoch": 0.2887332596142032, + "grad_norm": 1.3747327522362114, + "learning_rate": 2.6861191443211008e-05, + "loss": 0.6101, + "step": 2350 + }, + { + "epoch": 0.2888561248310603, + "grad_norm": 1.5240198600673898, + "learning_rate": 2.685725224741615e-05, + "loss": 0.7174, + "step": 2351 + }, + { + "epoch": 0.28897899004791744, + "grad_norm": 1.4081328904299477, + "learning_rate": 2.6853310870556638e-05, + "loss": 0.6793, + "step": 2352 + }, + { + "epoch": 0.28910185526477455, + "grad_norm": 1.5610754167273866, + "learning_rate": 2.6849367313357458e-05, + "loss": 0.6229, + "step": 2353 + }, + { + "epoch": 0.28922472048163167, + "grad_norm": 1.3674193895516589, + "learning_rate": 2.6845421576544017e-05, + "loss": 0.7214, + 
"step": 2354 + }, + { + "epoch": 0.2893475856984888, + "grad_norm": 1.2589763368723152, + "learning_rate": 2.6841473660842088e-05, + "loss": 0.6452, + "step": 2355 + }, + { + "epoch": 0.28947045091534584, + "grad_norm": 1.3766157754638133, + "learning_rate": 2.6837523566977876e-05, + "loss": 0.6389, + "step": 2356 + }, + { + "epoch": 0.28959331613220296, + "grad_norm": 1.3756202967187943, + "learning_rate": 2.6833571295677976e-05, + "loss": 0.728, + "step": 2357 + }, + { + "epoch": 0.2897161813490601, + "grad_norm": 1.2443862900439489, + "learning_rate": 2.6829616847669372e-05, + "loss": 0.6129, + "step": 2358 + }, + { + "epoch": 0.2898390465659172, + "grad_norm": 1.403597006359707, + "learning_rate": 2.6825660223679477e-05, + "loss": 0.7489, + "step": 2359 + }, + { + "epoch": 0.2899619117827743, + "grad_norm": 1.3417765638414851, + "learning_rate": 2.682170142443607e-05, + "loss": 0.809, + "step": 2360 + }, + { + "epoch": 0.2900847769996314, + "grad_norm": 1.4880015784755476, + "learning_rate": 2.681774045066735e-05, + "loss": 0.8553, + "step": 2361 + }, + { + "epoch": 0.29020764221648854, + "grad_norm": 1.5292691139737404, + "learning_rate": 2.6813777303101922e-05, + "loss": 0.7199, + "step": 2362 + }, + { + "epoch": 0.2903305074333456, + "grad_norm": 1.385479021290994, + "learning_rate": 2.6809811982468768e-05, + "loss": 0.675, + "step": 2363 + }, + { + "epoch": 0.2904533726502027, + "grad_norm": 1.5000692823886397, + "learning_rate": 2.680584448949729e-05, + "loss": 0.7758, + "step": 2364 + }, + { + "epoch": 0.29057623786705983, + "grad_norm": 1.3228278715643271, + "learning_rate": 2.6801874824917287e-05, + "loss": 0.6411, + "step": 2365 + }, + { + "epoch": 0.29069910308391694, + "grad_norm": 1.6867419143389066, + "learning_rate": 2.6797902989458944e-05, + "loss": 0.6082, + "step": 2366 + }, + { + "epoch": 0.29082196830077406, + "grad_norm": 1.2637297237230825, + "learning_rate": 2.679392898385286e-05, + "loss": 0.6473, + "step": 2367 + }, + { + "epoch": 0.2909448335176312, + "grad_norm": 1.5595809647393528, + "learning_rate": 2.678995280883002e-05, + "loss": 0.5838, + "step": 2368 + }, + { + "epoch": 0.2910676987344883, + "grad_norm": 1.241479917879033, + "learning_rate": 2.6785974465121827e-05, + "loss": 0.5741, + "step": 2369 + }, + { + "epoch": 0.29119056395134535, + "grad_norm": 1.5972350426520476, + "learning_rate": 2.678199395346006e-05, + "loss": 0.7174, + "step": 2370 + }, + { + "epoch": 0.29131342916820246, + "grad_norm": 1.485075674534945, + "learning_rate": 2.677801127457692e-05, + "loss": 0.5888, + "step": 2371 + }, + { + "epoch": 0.2914362943850596, + "grad_norm": 1.561351697234515, + "learning_rate": 2.6774026429204987e-05, + "loss": 0.8026, + "step": 2372 + }, + { + "epoch": 0.2915591596019167, + "grad_norm": 1.5370320142722032, + "learning_rate": 2.677003941807725e-05, + "loss": 0.689, + "step": 2373 + }, + { + "epoch": 0.2916820248187738, + "grad_norm": 1.4748724868235021, + "learning_rate": 2.6766050241927095e-05, + "loss": 0.6487, + "step": 2374 + }, + { + "epoch": 0.2918048900356309, + "grad_norm": 1.6119807046391001, + "learning_rate": 2.6762058901488303e-05, + "loss": 0.7718, + "step": 2375 + }, + { + "epoch": 0.29192775525248804, + "grad_norm": 1.2744559328086116, + "learning_rate": 2.6758065397495057e-05, + "loss": 0.5263, + "step": 2376 + }, + { + "epoch": 0.2920506204693451, + "grad_norm": 1.625645551930686, + "learning_rate": 2.675406973068193e-05, + "loss": 0.7076, + "step": 2377 + }, + { + "epoch": 0.2921734856862022, + "grad_norm": 1.3214840240744323, + 
"learning_rate": 2.6750071901783907e-05, + "loss": 0.6375, + "step": 2378 + }, + { + "epoch": 0.29229635090305933, + "grad_norm": 1.464322449343132, + "learning_rate": 2.6746071911536358e-05, + "loss": 0.6111, + "step": 2379 + }, + { + "epoch": 0.29241921611991645, + "grad_norm": 1.1902615020497778, + "learning_rate": 2.674206976067506e-05, + "loss": 0.6591, + "step": 2380 + }, + { + "epoch": 0.29254208133677356, + "grad_norm": 1.253419272415584, + "learning_rate": 2.6738065449936178e-05, + "loss": 0.6359, + "step": 2381 + }, + { + "epoch": 0.2926649465536307, + "grad_norm": 1.2976732889014784, + "learning_rate": 2.673405898005628e-05, + "loss": 0.5671, + "step": 2382 + }, + { + "epoch": 0.2927878117704878, + "grad_norm": 1.4177258308303493, + "learning_rate": 2.673005035177233e-05, + "loss": 0.7137, + "step": 2383 + }, + { + "epoch": 0.2929106769873449, + "grad_norm": 1.3292276051178207, + "learning_rate": 2.6726039565821686e-05, + "loss": 0.6389, + "step": 2384 + }, + { + "epoch": 0.29303354220420197, + "grad_norm": 1.3195035776409756, + "learning_rate": 2.6722026622942118e-05, + "loss": 0.7, + "step": 2385 + }, + { + "epoch": 0.2931564074210591, + "grad_norm": 1.461860616137111, + "learning_rate": 2.6718011523871766e-05, + "loss": 0.6187, + "step": 2386 + }, + { + "epoch": 0.2932792726379162, + "grad_norm": 1.2929775540531439, + "learning_rate": 2.6713994269349195e-05, + "loss": 0.6341, + "step": 2387 + }, + { + "epoch": 0.2934021378547733, + "grad_norm": 1.3184296079387035, + "learning_rate": 2.670997486011334e-05, + "loss": 0.6717, + "step": 2388 + }, + { + "epoch": 0.29352500307163043, + "grad_norm": 1.6264705432552435, + "learning_rate": 2.6705953296903554e-05, + "loss": 0.7501, + "step": 2389 + }, + { + "epoch": 0.29364786828848755, + "grad_norm": 1.3509801250243938, + "learning_rate": 2.670192958045957e-05, + "loss": 0.664, + "step": 2390 + }, + { + "epoch": 0.29377073350534466, + "grad_norm": 1.4386967352497981, + "learning_rate": 2.669790371152154e-05, + "loss": 0.7087, + "step": 2391 + }, + { + "epoch": 0.2938935987222017, + "grad_norm": 1.512357376426424, + "learning_rate": 2.6693875690829982e-05, + "loss": 0.613, + "step": 2392 + }, + { + "epoch": 0.29401646393905884, + "grad_norm": 1.737242725271504, + "learning_rate": 2.668984551912582e-05, + "loss": 0.6891, + "step": 2393 + }, + { + "epoch": 0.29413932915591595, + "grad_norm": 1.4281470080451333, + "learning_rate": 2.6685813197150395e-05, + "loss": 0.7265, + "step": 2394 + }, + { + "epoch": 0.29426219437277307, + "grad_norm": 1.344618804445938, + "learning_rate": 2.6681778725645414e-05, + "loss": 0.6866, + "step": 2395 + }, + { + "epoch": 0.2943850595896302, + "grad_norm": 1.4537982723269145, + "learning_rate": 2.6677742105352994e-05, + "loss": 0.607, + "step": 2396 + }, + { + "epoch": 0.2945079248064873, + "grad_norm": 1.3053647109723443, + "learning_rate": 2.667370333701565e-05, + "loss": 0.7297, + "step": 2397 + }, + { + "epoch": 0.2946307900233444, + "grad_norm": 1.4911442249586875, + "learning_rate": 2.6669662421376278e-05, + "loss": 0.6712, + "step": 2398 + }, + { + "epoch": 0.2947536552402015, + "grad_norm": 1.2238201567477958, + "learning_rate": 2.6665619359178192e-05, + "loss": 0.5783, + "step": 2399 + }, + { + "epoch": 0.2948765204570586, + "grad_norm": 1.3570722872120904, + "learning_rate": 2.6661574151165072e-05, + "loss": 0.6034, + "step": 2400 + }, + { + "epoch": 0.2949993856739157, + "grad_norm": 1.3887905827339477, + "learning_rate": 2.665752679808102e-05, + "loss": 0.6079, + "step": 2401 + }, + { + 
"epoch": 0.2951222508907728, + "grad_norm": 1.461338067757449, + "learning_rate": 2.6653477300670515e-05, + "loss": 0.6326, + "step": 2402 + }, + { + "epoch": 0.29524511610762993, + "grad_norm": 1.272220018267634, + "learning_rate": 2.6649425659678435e-05, + "loss": 0.577, + "step": 2403 + }, + { + "epoch": 0.29536798132448705, + "grad_norm": 1.3518148896772197, + "learning_rate": 2.664537187585005e-05, + "loss": 0.6125, + "step": 2404 + }, + { + "epoch": 0.29549084654134417, + "grad_norm": 1.688446219130224, + "learning_rate": 2.6641315949931034e-05, + "loss": 0.7056, + "step": 2405 + }, + { + "epoch": 0.2956137117582013, + "grad_norm": 1.5608721191429917, + "learning_rate": 2.6637257882667446e-05, + "loss": 0.591, + "step": 2406 + }, + { + "epoch": 0.29573657697505834, + "grad_norm": 1.3234372134418741, + "learning_rate": 2.663319767480574e-05, + "loss": 0.641, + "step": 2407 + }, + { + "epoch": 0.29585944219191546, + "grad_norm": 1.4594953046896513, + "learning_rate": 2.6629135327092772e-05, + "loss": 0.5938, + "step": 2408 + }, + { + "epoch": 0.29598230740877257, + "grad_norm": 1.300664983788724, + "learning_rate": 2.6625070840275767e-05, + "loss": 0.5663, + "step": 2409 + }, + { + "epoch": 0.2961051726256297, + "grad_norm": 1.3867753852868154, + "learning_rate": 2.662100421510238e-05, + "loss": 0.6084, + "step": 2410 + }, + { + "epoch": 0.2962280378424868, + "grad_norm": 1.2430135298240106, + "learning_rate": 2.6616935452320634e-05, + "loss": 0.7468, + "step": 2411 + }, + { + "epoch": 0.2963509030593439, + "grad_norm": 1.4681634300405093, + "learning_rate": 2.661286455267894e-05, + "loss": 0.6462, + "step": 2412 + }, + { + "epoch": 0.29647376827620103, + "grad_norm": 1.330231847879845, + "learning_rate": 2.6608791516926133e-05, + "loss": 0.6292, + "step": 2413 + }, + { + "epoch": 0.2965966334930581, + "grad_norm": 1.6441750432730258, + "learning_rate": 2.660471634581141e-05, + "loss": 0.7538, + "step": 2414 + }, + { + "epoch": 0.2967194987099152, + "grad_norm": 1.4338646771457852, + "learning_rate": 2.660063904008437e-05, + "loss": 0.6066, + "step": 2415 + }, + { + "epoch": 0.2968423639267723, + "grad_norm": 1.8635030690541587, + "learning_rate": 2.659655960049502e-05, + "loss": 0.7196, + "step": 2416 + }, + { + "epoch": 0.29696522914362944, + "grad_norm": 1.2193824830009194, + "learning_rate": 2.6592478027793732e-05, + "loss": 0.7361, + "step": 2417 + }, + { + "epoch": 0.29708809436048655, + "grad_norm": 1.3868314240387876, + "learning_rate": 2.658839432273129e-05, + "loss": 0.65, + "step": 2418 + }, + { + "epoch": 0.29721095957734367, + "grad_norm": 1.205332926048183, + "learning_rate": 2.6584308486058866e-05, + "loss": 0.699, + "step": 2419 + }, + { + "epoch": 0.2973338247942008, + "grad_norm": 1.420851197812113, + "learning_rate": 2.6580220518528025e-05, + "loss": 0.7105, + "step": 2420 + }, + { + "epoch": 0.29745669001105784, + "grad_norm": 1.5848834819284587, + "learning_rate": 2.657613042089072e-05, + "loss": 0.6333, + "step": 2421 + }, + { + "epoch": 0.29757955522791496, + "grad_norm": 1.3621699404319922, + "learning_rate": 2.6572038193899296e-05, + "loss": 0.634, + "step": 2422 + }, + { + "epoch": 0.2977024204447721, + "grad_norm": 1.3110149634591528, + "learning_rate": 2.6567943838306497e-05, + "loss": 0.616, + "step": 2423 + }, + { + "epoch": 0.2978252856616292, + "grad_norm": 1.3808115857040475, + "learning_rate": 2.6563847354865443e-05, + "loss": 0.6119, + "step": 2424 + }, + { + "epoch": 0.2979481508784863, + "grad_norm": 1.7679539200123076, + "learning_rate": 
2.655974874432967e-05, + "loss": 0.6511, + "step": 2425 + }, + { + "epoch": 0.2980710160953434, + "grad_norm": 1.4533395249779197, + "learning_rate": 2.655564800745308e-05, + "loss": 0.5851, + "step": 2426 + }, + { + "epoch": 0.29819388131220054, + "grad_norm": 1.5164199903265803, + "learning_rate": 2.655154514498998e-05, + "loss": 0.6407, + "step": 2427 + }, + { + "epoch": 0.2983167465290576, + "grad_norm": 1.6278295404583447, + "learning_rate": 2.654744015769506e-05, + "loss": 0.6136, + "step": 2428 + }, + { + "epoch": 0.2984396117459147, + "grad_norm": 1.3661832432023582, + "learning_rate": 2.6543333046323416e-05, + "loss": 0.6561, + "step": 2429 + }, + { + "epoch": 0.29856247696277183, + "grad_norm": 1.225977593213098, + "learning_rate": 2.653922381163052e-05, + "loss": 0.6093, + "step": 2430 + }, + { + "epoch": 0.29868534217962894, + "grad_norm": 1.184076210924204, + "learning_rate": 2.6535112454372236e-05, + "loss": 0.6146, + "step": 2431 + }, + { + "epoch": 0.29880820739648606, + "grad_norm": 1.362569804727129, + "learning_rate": 2.6530998975304823e-05, + "loss": 0.6637, + "step": 2432 + }, + { + "epoch": 0.2989310726133432, + "grad_norm": 1.485539180353542, + "learning_rate": 2.652688337518493e-05, + "loss": 0.5147, + "step": 2433 + }, + { + "epoch": 0.2990539378302003, + "grad_norm": 1.2460179031611254, + "learning_rate": 2.65227656547696e-05, + "loss": 0.6562, + "step": 2434 + }, + { + "epoch": 0.2991768030470574, + "grad_norm": 1.173444454588339, + "learning_rate": 2.651864581481625e-05, + "loss": 0.6501, + "step": 2435 + }, + { + "epoch": 0.29929966826391446, + "grad_norm": 1.5580583304626232, + "learning_rate": 2.6514523856082703e-05, + "loss": 0.7505, + "step": 2436 + }, + { + "epoch": 0.2994225334807716, + "grad_norm": 1.4138993995088807, + "learning_rate": 2.651039977932717e-05, + "loss": 0.6289, + "step": 2437 + }, + { + "epoch": 0.2995453986976287, + "grad_norm": 1.6046125637358981, + "learning_rate": 2.6506273585308247e-05, + "loss": 0.5689, + "step": 2438 + }, + { + "epoch": 0.2996682639144858, + "grad_norm": 1.4095377681281238, + "learning_rate": 2.6502145274784916e-05, + "loss": 0.6976, + "step": 2439 + }, + { + "epoch": 0.2997911291313429, + "grad_norm": 1.2225672789322066, + "learning_rate": 2.6498014848516557e-05, + "loss": 0.5801, + "step": 2440 + }, + { + "epoch": 0.29991399434820004, + "grad_norm": 1.30447627608537, + "learning_rate": 2.649388230726293e-05, + "loss": 0.572, + "step": 2441 + }, + { + "epoch": 0.30003685956505716, + "grad_norm": 1.376186799665795, + "learning_rate": 2.6489747651784196e-05, + "loss": 0.8825, + "step": 2442 + }, + { + "epoch": 0.3001597247819142, + "grad_norm": 1.4761414176356382, + "learning_rate": 2.6485610882840892e-05, + "loss": 0.7107, + "step": 2443 + }, + { + "epoch": 0.30028258999877133, + "grad_norm": 1.5918204978840815, + "learning_rate": 2.6481472001193958e-05, + "loss": 0.7503, + "step": 2444 + }, + { + "epoch": 0.30040545521562845, + "grad_norm": 1.445191374769686, + "learning_rate": 2.647733100760471e-05, + "loss": 0.5469, + "step": 2445 + }, + { + "epoch": 0.30052832043248556, + "grad_norm": 1.3377955487205306, + "learning_rate": 2.6473187902834848e-05, + "loss": 0.7394, + "step": 2446 + }, + { + "epoch": 0.3006511856493427, + "grad_norm": 1.3142133141215817, + "learning_rate": 2.646904268764648e-05, + "loss": 0.6485, + "step": 2447 + }, + { + "epoch": 0.3007740508661998, + "grad_norm": 1.337261276043544, + "learning_rate": 2.6464895362802095e-05, + "loss": 0.6148, + "step": 2448 + }, + { + "epoch": 
0.3008969160830569, + "grad_norm": 1.377489523323608, + "learning_rate": 2.6460745929064553e-05, + "loss": 0.6353, + "step": 2449 + }, + { + "epoch": 0.30101978129991397, + "grad_norm": 1.4132881543320492, + "learning_rate": 2.645659438719713e-05, + "loss": 0.6028, + "step": 2450 + }, + { + "epoch": 0.3011426465167711, + "grad_norm": 1.7655989302338462, + "learning_rate": 2.6452440737963463e-05, + "loss": 0.8197, + "step": 2451 + }, + { + "epoch": 0.3012655117336282, + "grad_norm": 1.4442894580494268, + "learning_rate": 2.6448284982127596e-05, + "loss": 0.6502, + "step": 2452 + }, + { + "epoch": 0.3013883769504853, + "grad_norm": 1.536197832157501, + "learning_rate": 2.6444127120453957e-05, + "loss": 0.6919, + "step": 2453 + }, + { + "epoch": 0.30151124216734243, + "grad_norm": 1.2562097096007652, + "learning_rate": 2.643996715370734e-05, + "loss": 0.6998, + "step": 2454 + }, + { + "epoch": 0.30163410738419955, + "grad_norm": 1.2801352479232897, + "learning_rate": 2.6435805082652966e-05, + "loss": 0.5539, + "step": 2455 + }, + { + "epoch": 0.30175697260105666, + "grad_norm": 1.3079420825339567, + "learning_rate": 2.6431640908056408e-05, + "loss": 0.6051, + "step": 2456 + }, + { + "epoch": 0.3018798378179137, + "grad_norm": 1.4513054598372057, + "learning_rate": 2.6427474630683636e-05, + "loss": 0.74, + "step": 2457 + }, + { + "epoch": 0.30200270303477084, + "grad_norm": 1.6408437203301698, + "learning_rate": 2.642330625130102e-05, + "loss": 0.6344, + "step": 2458 + }, + { + "epoch": 0.30212556825162795, + "grad_norm": 1.2732096846259093, + "learning_rate": 2.6419135770675304e-05, + "loss": 0.604, + "step": 2459 + }, + { + "epoch": 0.30224843346848507, + "grad_norm": 1.377548799427008, + "learning_rate": 2.6414963189573616e-05, + "loss": 0.6089, + "step": 2460 + }, + { + "epoch": 0.3023712986853422, + "grad_norm": 1.4603319581710439, + "learning_rate": 2.641078850876348e-05, + "loss": 0.6389, + "step": 2461 + }, + { + "epoch": 0.3024941639021993, + "grad_norm": 1.55341564151219, + "learning_rate": 2.6406611729012796e-05, + "loss": 0.6829, + "step": 2462 + }, + { + "epoch": 0.3026170291190564, + "grad_norm": 1.2542987931849043, + "learning_rate": 2.6402432851089863e-05, + "loss": 0.746, + "step": 2463 + }, + { + "epoch": 0.30273989433591353, + "grad_norm": 1.1668802770077082, + "learning_rate": 2.639825187576335e-05, + "loss": 0.5791, + "step": 2464 + }, + { + "epoch": 0.3028627595527706, + "grad_norm": 1.2941309788019855, + "learning_rate": 2.6394068803802328e-05, + "loss": 0.6517, + "step": 2465 + }, + { + "epoch": 0.3029856247696277, + "grad_norm": 1.3537898591316258, + "learning_rate": 2.6389883635976243e-05, + "loss": 0.697, + "step": 2466 + }, + { + "epoch": 0.3031084899864848, + "grad_norm": 1.1669624561257468, + "learning_rate": 2.6385696373054926e-05, + "loss": 0.5354, + "step": 2467 + }, + { + "epoch": 0.30323135520334193, + "grad_norm": 1.412063534463707, + "learning_rate": 2.6381507015808603e-05, + "loss": 0.7671, + "step": 2468 + }, + { + "epoch": 0.30335422042019905, + "grad_norm": 1.12853057979421, + "learning_rate": 2.6377315565007876e-05, + "loss": 0.6919, + "step": 2469 + }, + { + "epoch": 0.30347708563705617, + "grad_norm": 1.3192679166399208, + "learning_rate": 2.6373122021423733e-05, + "loss": 0.6567, + "step": 2470 + }, + { + "epoch": 0.3035999508539133, + "grad_norm": 1.149287516751743, + "learning_rate": 2.6368926385827548e-05, + "loss": 0.7025, + "step": 2471 + }, + { + "epoch": 0.30372281607077034, + "grad_norm": 1.2564238302020516, + "learning_rate": 
2.6364728658991093e-05, + "loss": 0.6621, + "step": 2472 + }, + { + "epoch": 0.30384568128762746, + "grad_norm": 1.2889429679730293, + "learning_rate": 2.63605288416865e-05, + "loss": 0.6959, + "step": 2473 + }, + { + "epoch": 0.30396854650448457, + "grad_norm": 1.4898757690755198, + "learning_rate": 2.6356326934686303e-05, + "loss": 0.7409, + "step": 2474 + }, + { + "epoch": 0.3040914117213417, + "grad_norm": 1.3899086667916727, + "learning_rate": 2.6352122938763412e-05, + "loss": 0.6986, + "step": 2475 + }, + { + "epoch": 0.3042142769381988, + "grad_norm": 1.25979048352415, + "learning_rate": 2.634791685469113e-05, + "loss": 0.7144, + "step": 2476 + }, + { + "epoch": 0.3043371421550559, + "grad_norm": 1.3610340649771746, + "learning_rate": 2.6343708683243137e-05, + "loss": 0.6037, + "step": 2477 + }, + { + "epoch": 0.30446000737191303, + "grad_norm": 1.4802857672976426, + "learning_rate": 2.6339498425193496e-05, + "loss": 0.5383, + "step": 2478 + }, + { + "epoch": 0.3045828725887701, + "grad_norm": 1.3058682083126159, + "learning_rate": 2.633528608131666e-05, + "loss": 0.6201, + "step": 2479 + }, + { + "epoch": 0.3047057378056272, + "grad_norm": 1.3162182356411625, + "learning_rate": 2.6331071652387463e-05, + "loss": 0.6432, + "step": 2480 + }, + { + "epoch": 0.3048286030224843, + "grad_norm": 1.2629438363239822, + "learning_rate": 2.6326855139181117e-05, + "loss": 0.653, + "step": 2481 + }, + { + "epoch": 0.30495146823934144, + "grad_norm": 1.3815055890360515, + "learning_rate": 2.6322636542473228e-05, + "loss": 0.7235, + "step": 2482 + }, + { + "epoch": 0.30507433345619855, + "grad_norm": 1.3009432438526725, + "learning_rate": 2.631841586303978e-05, + "loss": 0.5671, + "step": 2483 + }, + { + "epoch": 0.30519719867305567, + "grad_norm": 1.327052106998988, + "learning_rate": 2.6314193101657124e-05, + "loss": 0.6564, + "step": 2484 + }, + { + "epoch": 0.3053200638899128, + "grad_norm": 1.4685376349489467, + "learning_rate": 2.6309968259102032e-05, + "loss": 0.7218, + "step": 2485 + }, + { + "epoch": 0.3054429291067699, + "grad_norm": 2.1368373874279567, + "learning_rate": 2.630574133615163e-05, + "loss": 0.689, + "step": 2486 + }, + { + "epoch": 0.30556579432362696, + "grad_norm": 1.1806373531561964, + "learning_rate": 2.630151233358342e-05, + "loss": 0.6337, + "step": 2487 + }, + { + "epoch": 0.3056886595404841, + "grad_norm": 1.5547882466225953, + "learning_rate": 2.6297281252175316e-05, + "loss": 0.5564, + "step": 2488 + }, + { + "epoch": 0.3058115247573412, + "grad_norm": 1.2232814756210562, + "learning_rate": 2.6293048092705586e-05, + "loss": 0.5094, + "step": 2489 + }, + { + "epoch": 0.3059343899741983, + "grad_norm": 1.3585393518992124, + "learning_rate": 2.62888128559529e-05, + "loss": 0.7912, + "step": 2490 + }, + { + "epoch": 0.3060572551910554, + "grad_norm": 1.6308232832267326, + "learning_rate": 2.6284575542696297e-05, + "loss": 0.6719, + "step": 2491 + }, + { + "epoch": 0.30618012040791254, + "grad_norm": 1.3366492794879048, + "learning_rate": 2.628033615371521e-05, + "loss": 0.6072, + "step": 2492 + }, + { + "epoch": 0.30630298562476965, + "grad_norm": 1.4902375710911069, + "learning_rate": 2.627609468978944e-05, + "loss": 0.6522, + "step": 2493 + }, + { + "epoch": 0.3064258508416267, + "grad_norm": 2.0756321666560433, + "learning_rate": 2.6271851151699184e-05, + "loss": 0.7065, + "step": 2494 + }, + { + "epoch": 0.30654871605848383, + "grad_norm": 1.3766022234270892, + "learning_rate": 2.626760554022501e-05, + "loss": 0.6257, + "step": 2495 + }, + { + "epoch": 
0.30667158127534094, + "grad_norm": 1.4815480133941579, + "learning_rate": 2.626335785614786e-05, + "loss": 0.6207, + "step": 2496 + }, + { + "epoch": 0.30679444649219806, + "grad_norm": 1.5921998916548383, + "learning_rate": 2.6259108100249086e-05, + "loss": 0.7173, + "step": 2497 + }, + { + "epoch": 0.3069173117090552, + "grad_norm": 1.378425045426828, + "learning_rate": 2.6254856273310394e-05, + "loss": 0.5211, + "step": 2498 + }, + { + "epoch": 0.3070401769259123, + "grad_norm": 1.329175482073475, + "learning_rate": 2.6250602376113882e-05, + "loss": 0.7245, + "step": 2499 + }, + { + "epoch": 0.3071630421427694, + "grad_norm": 1.7221210583365882, + "learning_rate": 2.6246346409442024e-05, + "loss": 0.8727, + "step": 2500 + }, + { + "epoch": 0.30728590735962646, + "grad_norm": 1.3072633374908982, + "learning_rate": 2.6242088374077676e-05, + "loss": 0.5817, + "step": 2501 + }, + { + "epoch": 0.3074087725764836, + "grad_norm": 1.4852634287298978, + "learning_rate": 2.623782827080408e-05, + "loss": 0.7688, + "step": 2502 + }, + { + "epoch": 0.3075316377933407, + "grad_norm": 1.4482031481844009, + "learning_rate": 2.6233566100404856e-05, + "loss": 0.6449, + "step": 2503 + }, + { + "epoch": 0.3076545030101978, + "grad_norm": 1.2675970740240525, + "learning_rate": 2.6229301863664e-05, + "loss": 0.591, + "step": 2504 + }, + { + "epoch": 0.3077773682270549, + "grad_norm": 1.441719576125879, + "learning_rate": 2.6225035561365888e-05, + "loss": 0.6407, + "step": 2505 + }, + { + "epoch": 0.30790023344391204, + "grad_norm": 1.2956222180735257, + "learning_rate": 2.6220767194295285e-05, + "loss": 0.6527, + "step": 2506 + }, + { + "epoch": 0.30802309866076916, + "grad_norm": 1.3829537062957955, + "learning_rate": 2.6216496763237324e-05, + "loss": 0.7639, + "step": 2507 + }, + { + "epoch": 0.3081459638776262, + "grad_norm": 1.3018371785417477, + "learning_rate": 2.6212224268977527e-05, + "loss": 0.633, + "step": 2508 + }, + { + "epoch": 0.30826882909448333, + "grad_norm": 1.4061000732830922, + "learning_rate": 2.6207949712301787e-05, + "loss": 0.731, + "step": 2509 + }, + { + "epoch": 0.30839169431134045, + "grad_norm": 1.2386783246880577, + "learning_rate": 2.6203673093996385e-05, + "loss": 0.6638, + "step": 2510 + }, + { + "epoch": 0.30851455952819756, + "grad_norm": 1.374410039184066, + "learning_rate": 2.6199394414847975e-05, + "loss": 0.7109, + "step": 2511 + }, + { + "epoch": 0.3086374247450547, + "grad_norm": 1.1227663809651465, + "learning_rate": 2.619511367564359e-05, + "loss": 0.6737, + "step": 2512 + }, + { + "epoch": 0.3087602899619118, + "grad_norm": 1.1396891319287876, + "learning_rate": 2.6190830877170653e-05, + "loss": 0.5849, + "step": 2513 + }, + { + "epoch": 0.3088831551787689, + "grad_norm": 1.3446049045321882, + "learning_rate": 2.618654602021695e-05, + "loss": 0.6503, + "step": 2514 + }, + { + "epoch": 0.309006020395626, + "grad_norm": 1.1422331814351225, + "learning_rate": 2.6182259105570652e-05, + "loss": 0.5188, + "step": 2515 + }, + { + "epoch": 0.3091288856124831, + "grad_norm": 1.6336220708089082, + "learning_rate": 2.6177970134020308e-05, + "loss": 0.6219, + "step": 2516 + }, + { + "epoch": 0.3092517508293402, + "grad_norm": 1.264204170686446, + "learning_rate": 2.6173679106354852e-05, + "loss": 0.6696, + "step": 2517 + }, + { + "epoch": 0.3093746160461973, + "grad_norm": 1.349567393846118, + "learning_rate": 2.616938602336359e-05, + "loss": 0.679, + "step": 2518 + }, + { + "epoch": 0.30949748126305443, + "grad_norm": 1.2517880866341422, + "learning_rate": 
2.6165090885836208e-05, + "loss": 0.6055, + "step": 2519 + }, + { + "epoch": 0.30962034647991155, + "grad_norm": 1.2607803784804532, + "learning_rate": 2.616079369456276e-05, + "loss": 0.7428, + "step": 2520 + }, + { + "epoch": 0.30974321169676866, + "grad_norm": 1.238614498838452, + "learning_rate": 2.6156494450333696e-05, + "loss": 0.6576, + "step": 2521 + }, + { + "epoch": 0.3098660769136258, + "grad_norm": 1.2864564924525819, + "learning_rate": 2.6152193153939826e-05, + "loss": 0.628, + "step": 2522 + }, + { + "epoch": 0.30998894213048284, + "grad_norm": 1.1728487729998607, + "learning_rate": 2.614788980617235e-05, + "loss": 0.6175, + "step": 2523 + }, + { + "epoch": 0.31011180734733995, + "grad_norm": 1.3132363058693837, + "learning_rate": 2.6143584407822848e-05, + "loss": 0.657, + "step": 2524 + }, + { + "epoch": 0.31023467256419707, + "grad_norm": 1.7142676593143533, + "learning_rate": 2.6139276959683254e-05, + "loss": 0.7109, + "step": 2525 + }, + { + "epoch": 0.3103575377810542, + "grad_norm": 1.4543096618175497, + "learning_rate": 2.6134967462545908e-05, + "loss": 0.7207, + "step": 2526 + }, + { + "epoch": 0.3104804029979113, + "grad_norm": 1.4348026856164229, + "learning_rate": 2.6130655917203512e-05, + "loss": 0.7111, + "step": 2527 + }, + { + "epoch": 0.3106032682147684, + "grad_norm": 1.6185329391280219, + "learning_rate": 2.6126342324449142e-05, + "loss": 0.6284, + "step": 2528 + }, + { + "epoch": 0.31072613343162553, + "grad_norm": 1.2772076956547773, + "learning_rate": 2.6122026685076256e-05, + "loss": 0.6835, + "step": 2529 + }, + { + "epoch": 0.3108489986484826, + "grad_norm": 1.5443823181694118, + "learning_rate": 2.6117708999878695e-05, + "loss": 0.718, + "step": 2530 + }, + { + "epoch": 0.3109718638653397, + "grad_norm": 1.6278603059521664, + "learning_rate": 2.611338926965066e-05, + "loss": 0.7446, + "step": 2531 + }, + { + "epoch": 0.3110947290821968, + "grad_norm": 1.4029653716314316, + "learning_rate": 2.6109067495186747e-05, + "loss": 0.7183, + "step": 2532 + }, + { + "epoch": 0.31121759429905393, + "grad_norm": 1.1942365271575093, + "learning_rate": 2.6104743677281912e-05, + "loss": 0.5555, + "step": 2533 + }, + { + "epoch": 0.31134045951591105, + "grad_norm": 1.3040379994176885, + "learning_rate": 2.610041781673149e-05, + "loss": 0.6809, + "step": 2534 + }, + { + "epoch": 0.31146332473276817, + "grad_norm": 1.226468097490837, + "learning_rate": 2.60960899143312e-05, + "loss": 0.6494, + "step": 2535 + }, + { + "epoch": 0.3115861899496253, + "grad_norm": 1.3356276346099825, + "learning_rate": 2.6091759970877134e-05, + "loss": 0.6388, + "step": 2536 + }, + { + "epoch": 0.3117090551664824, + "grad_norm": 1.2277314594635969, + "learning_rate": 2.6087427987165754e-05, + "loss": 0.6733, + "step": 2537 + }, + { + "epoch": 0.31183192038333946, + "grad_norm": 1.7859210985388885, + "learning_rate": 2.6083093963993898e-05, + "loss": 0.743, + "step": 2538 + }, + { + "epoch": 0.31195478560019657, + "grad_norm": 1.2798944498099116, + "learning_rate": 2.6078757902158784e-05, + "loss": 0.5982, + "step": 2539 + }, + { + "epoch": 0.3120776508170537, + "grad_norm": 1.6188265327407387, + "learning_rate": 2.6074419802458002e-05, + "loss": 0.6475, + "step": 2540 + }, + { + "epoch": 0.3122005160339108, + "grad_norm": 1.2444869239351086, + "learning_rate": 2.6070079665689518e-05, + "loss": 0.6599, + "step": 2541 + }, + { + "epoch": 0.3123233812507679, + "grad_norm": 1.4173631620479035, + "learning_rate": 2.6065737492651677e-05, + "loss": 0.556, + "step": 2542 + }, + { + "epoch": 
0.31244624646762503, + "grad_norm": 1.3523053100461713, + "learning_rate": 2.606139328414318e-05, + "loss": 0.667, + "step": 2543 + }, + { + "epoch": 0.31256911168448215, + "grad_norm": 1.3674646951596976, + "learning_rate": 2.6057047040963127e-05, + "loss": 0.7741, + "step": 2544 + }, + { + "epoch": 0.3126919769013392, + "grad_norm": 1.1879789892668655, + "learning_rate": 2.605269876391098e-05, + "loss": 0.5451, + "step": 2545 + }, + { + "epoch": 0.3128148421181963, + "grad_norm": 1.3205740270632151, + "learning_rate": 2.6048348453786576e-05, + "loss": 0.7046, + "step": 2546 + }, + { + "epoch": 0.31293770733505344, + "grad_norm": 1.3827041541426788, + "learning_rate": 2.604399611139012e-05, + "loss": 0.6505, + "step": 2547 + }, + { + "epoch": 0.31306057255191055, + "grad_norm": 1.187935203296127, + "learning_rate": 2.60396417375222e-05, + "loss": 0.5001, + "step": 2548 + }, + { + "epoch": 0.31318343776876767, + "grad_norm": 1.2683024100457911, + "learning_rate": 2.6035285332983783e-05, + "loss": 0.6129, + "step": 2549 + }, + { + "epoch": 0.3133063029856248, + "grad_norm": 1.2801065416233088, + "learning_rate": 2.6030926898576196e-05, + "loss": 0.5747, + "step": 2550 + }, + { + "epoch": 0.3134291682024819, + "grad_norm": 1.5012068352588868, + "learning_rate": 2.6026566435101143e-05, + "loss": 0.6733, + "step": 2551 + }, + { + "epoch": 0.31355203341933896, + "grad_norm": 1.1059099916301751, + "learning_rate": 2.60222039433607e-05, + "loss": 0.6212, + "step": 2552 + }, + { + "epoch": 0.3136748986361961, + "grad_norm": 1.4283691349842955, + "learning_rate": 2.6017839424157322e-05, + "loss": 0.6212, + "step": 2553 + }, + { + "epoch": 0.3137977638530532, + "grad_norm": 1.490641846209876, + "learning_rate": 2.601347287829384e-05, + "loss": 0.7474, + "step": 2554 + }, + { + "epoch": 0.3139206290699103, + "grad_norm": 1.6498064978450815, + "learning_rate": 2.6009104306573447e-05, + "loss": 0.6375, + "step": 2555 + }, + { + "epoch": 0.3140434942867674, + "grad_norm": 1.5501671439934752, + "learning_rate": 2.600473370979971e-05, + "loss": 0.6318, + "step": 2556 + }, + { + "epoch": 0.31416635950362454, + "grad_norm": 1.3357930680220989, + "learning_rate": 2.600036108877658e-05, + "loss": 0.5443, + "step": 2557 + }, + { + "epoch": 0.31428922472048165, + "grad_norm": 1.1034052001652728, + "learning_rate": 2.5995986444308366e-05, + "loss": 0.5908, + "step": 2558 + }, + { + "epoch": 0.3144120899373387, + "grad_norm": 1.4559047460943224, + "learning_rate": 2.5991609777199755e-05, + "loss": 0.6918, + "step": 2559 + }, + { + "epoch": 0.3145349551541958, + "grad_norm": 1.3882494862365227, + "learning_rate": 2.5987231088255807e-05, + "loss": 0.5873, + "step": 2560 + }, + { + "epoch": 0.31465782037105294, + "grad_norm": 1.6013406223009476, + "learning_rate": 2.598285037828196e-05, + "loss": 0.5888, + "step": 2561 + }, + { + "epoch": 0.31478068558791006, + "grad_norm": 1.365953475128749, + "learning_rate": 2.5978467648084012e-05, + "loss": 0.6063, + "step": 2562 + }, + { + "epoch": 0.3149035508047672, + "grad_norm": 1.244561415772206, + "learning_rate": 2.5974082898468135e-05, + "loss": 0.6508, + "step": 2563 + }, + { + "epoch": 0.3150264160216243, + "grad_norm": 1.5446476507843223, + "learning_rate": 2.5969696130240876e-05, + "loss": 0.7349, + "step": 2564 + }, + { + "epoch": 0.3151492812384814, + "grad_norm": 1.4555865289214887, + "learning_rate": 2.596530734420916e-05, + "loss": 0.6362, + "step": 2565 + }, + { + "epoch": 0.3152721464553385, + "grad_norm": 1.4698464494128383, + "learning_rate": 
2.596091654118027e-05, + "loss": 0.6475, + "step": 2566 + }, + { + "epoch": 0.3153950116721956, + "grad_norm": 1.2105702552277666, + "learning_rate": 2.5956523721961866e-05, + "loss": 0.5492, + "step": 2567 + }, + { + "epoch": 0.3155178768890527, + "grad_norm": 1.5139621919074298, + "learning_rate": 2.5952128887361977e-05, + "loss": 0.5953, + "step": 2568 + }, + { + "epoch": 0.3156407421059098, + "grad_norm": 1.376337368112041, + "learning_rate": 2.5947732038189005e-05, + "loss": 0.665, + "step": 2569 + }, + { + "epoch": 0.3157636073227669, + "grad_norm": 1.250419532897594, + "learning_rate": 2.5943333175251723e-05, + "loss": 0.5404, + "step": 2570 + }, + { + "epoch": 0.31588647253962404, + "grad_norm": 1.370229977969879, + "learning_rate": 2.5938932299359276e-05, + "loss": 0.5484, + "step": 2571 + }, + { + "epoch": 0.31600933775648116, + "grad_norm": 1.29867804103967, + "learning_rate": 2.5934529411321174e-05, + "loss": 0.7878, + "step": 2572 + }, + { + "epoch": 0.31613220297333827, + "grad_norm": 1.3677243117750901, + "learning_rate": 2.59301245119473e-05, + "loss": 0.7092, + "step": 2573 + }, + { + "epoch": 0.31625506819019533, + "grad_norm": 1.1637274135340092, + "learning_rate": 2.5925717602047903e-05, + "loss": 0.6135, + "step": 2574 + }, + { + "epoch": 0.31637793340705245, + "grad_norm": 1.298217335672922, + "learning_rate": 2.5921308682433613e-05, + "loss": 0.618, + "step": 2575 + }, + { + "epoch": 0.31650079862390956, + "grad_norm": 1.3597695491733306, + "learning_rate": 2.5916897753915415e-05, + "loss": 0.651, + "step": 2576 + }, + { + "epoch": 0.3166236638407667, + "grad_norm": 1.214403160311376, + "learning_rate": 2.5912484817304675e-05, + "loss": 0.6438, + "step": 2577 + }, + { + "epoch": 0.3167465290576238, + "grad_norm": 1.3332037720094896, + "learning_rate": 2.5908069873413123e-05, + "loss": 0.7253, + "step": 2578 + }, + { + "epoch": 0.3168693942744809, + "grad_norm": 1.1781021038172923, + "learning_rate": 2.590365292305286e-05, + "loss": 0.6703, + "step": 2579 + }, + { + "epoch": 0.316992259491338, + "grad_norm": 1.2373537921675537, + "learning_rate": 2.589923396703635e-05, + "loss": 0.6871, + "step": 2580 + }, + { + "epoch": 0.3171151247081951, + "grad_norm": 1.458256984028032, + "learning_rate": 2.5894813006176443e-05, + "loss": 0.6507, + "step": 2581 + }, + { + "epoch": 0.3172379899250522, + "grad_norm": 1.1502066692457085, + "learning_rate": 2.5890390041286335e-05, + "loss": 0.6551, + "step": 2582 + }, + { + "epoch": 0.3173608551419093, + "grad_norm": 1.4041446859293654, + "learning_rate": 2.5885965073179605e-05, + "loss": 0.5989, + "step": 2583 + }, + { + "epoch": 0.31748372035876643, + "grad_norm": 1.2869795160531898, + "learning_rate": 2.58815381026702e-05, + "loss": 0.6641, + "step": 2584 + }, + { + "epoch": 0.31760658557562355, + "grad_norm": 1.3907326018079387, + "learning_rate": 2.5877109130572427e-05, + "loss": 0.5665, + "step": 2585 + }, + { + "epoch": 0.31772945079248066, + "grad_norm": 1.2350372873818287, + "learning_rate": 2.587267815770097e-05, + "loss": 0.672, + "step": 2586 + }, + { + "epoch": 0.3178523160093378, + "grad_norm": 1.248320314920715, + "learning_rate": 2.586824518487088e-05, + "loss": 0.7338, + "step": 2587 + }, + { + "epoch": 0.3179751812261949, + "grad_norm": 1.4369785814691036, + "learning_rate": 2.586381021289757e-05, + "loss": 0.6268, + "step": 2588 + }, + { + "epoch": 0.31809804644305195, + "grad_norm": 1.302923672839092, + "learning_rate": 2.5859373242596827e-05, + "loss": 0.5552, + "step": 2589 + }, + { + "epoch": 
0.31822091165990907, + "grad_norm": 1.2109691119657817, + "learning_rate": 2.58549342747848e-05, + "loss": 0.6076, + "step": 2590 + }, + { + "epoch": 0.3183437768767662, + "grad_norm": 1.418565842140509, + "learning_rate": 2.585049331027801e-05, + "loss": 0.646, + "step": 2591 + }, + { + "epoch": 0.3184666420936233, + "grad_norm": 1.520246966610575, + "learning_rate": 2.5846050349893345e-05, + "loss": 0.623, + "step": 2592 + }, + { + "epoch": 0.3185895073104804, + "grad_norm": 1.512292729014408, + "learning_rate": 2.584160539444806e-05, + "loss": 0.6445, + "step": 2593 + }, + { + "epoch": 0.31871237252733753, + "grad_norm": 1.146667164858264, + "learning_rate": 2.5837158444759764e-05, + "loss": 0.6671, + "step": 2594 + }, + { + "epoch": 0.31883523774419464, + "grad_norm": 1.251373086606052, + "learning_rate": 2.583270950164646e-05, + "loss": 0.6246, + "step": 2595 + }, + { + "epoch": 0.3189581029610517, + "grad_norm": 1.2822615137428304, + "learning_rate": 2.5828258565926497e-05, + "loss": 0.6483, + "step": 2596 + }, + { + "epoch": 0.3190809681779088, + "grad_norm": 1.3793251900759342, + "learning_rate": 2.582380563841859e-05, + "loss": 0.6412, + "step": 2597 + }, + { + "epoch": 0.31920383339476593, + "grad_norm": 1.401578369021944, + "learning_rate": 2.5819350719941836e-05, + "loss": 0.7121, + "step": 2598 + }, + { + "epoch": 0.31932669861162305, + "grad_norm": 1.4071848815983876, + "learning_rate": 2.5814893811315675e-05, + "loss": 0.6147, + "step": 2599 + }, + { + "epoch": 0.31944956382848017, + "grad_norm": 1.400233448306146, + "learning_rate": 2.5810434913359943e-05, + "loss": 0.6179, + "step": 2600 + }, + { + "epoch": 0.3195724290453373, + "grad_norm": 1.1959140906101628, + "learning_rate": 2.580597402689481e-05, + "loss": 0.6302, + "step": 2601 + }, + { + "epoch": 0.3196952942621944, + "grad_norm": 1.4347986175401308, + "learning_rate": 2.5801511152740837e-05, + "loss": 0.6266, + "step": 2602 + }, + { + "epoch": 0.31981815947905146, + "grad_norm": 1.550561011660293, + "learning_rate": 2.5797046291718943e-05, + "loss": 0.6967, + "step": 2603 + }, + { + "epoch": 0.31994102469590857, + "grad_norm": 1.6490815814270972, + "learning_rate": 2.57925794446504e-05, + "loss": 0.7436, + "step": 2604 + }, + { + "epoch": 0.3200638899127657, + "grad_norm": 1.166096420517154, + "learning_rate": 2.578811061235686e-05, + "loss": 0.5972, + "step": 2605 + }, + { + "epoch": 0.3201867551296228, + "grad_norm": 1.6985292587802272, + "learning_rate": 2.5783639795660333e-05, + "loss": 0.6917, + "step": 2606 + }, + { + "epoch": 0.3203096203464799, + "grad_norm": 1.5096066459399982, + "learning_rate": 2.57791669953832e-05, + "loss": 0.7676, + "step": 2607 + }, + { + "epoch": 0.32043248556333703, + "grad_norm": 1.6707534920991851, + "learning_rate": 2.577469221234821e-05, + "loss": 0.6244, + "step": 2608 + }, + { + "epoch": 0.32055535078019415, + "grad_norm": 1.6566903487379576, + "learning_rate": 2.5770215447378463e-05, + "loss": 0.801, + "step": 2609 + }, + { + "epoch": 0.3206782159970512, + "grad_norm": 1.4237559103569994, + "learning_rate": 2.5765736701297427e-05, + "loss": 0.6654, + "step": 2610 + }, + { + "epoch": 0.3208010812139083, + "grad_norm": 1.3674943012861098, + "learning_rate": 2.576125597492895e-05, + "loss": 0.6859, + "step": 2611 + }, + { + "epoch": 0.32092394643076544, + "grad_norm": 1.269773735327447, + "learning_rate": 2.5756773269097217e-05, + "loss": 0.5755, + "step": 2612 + }, + { + "epoch": 0.32104681164762255, + "grad_norm": 1.5633598131928748, + "learning_rate": 
2.5752288584626807e-05, + "loss": 0.6617, + "step": 2613 + }, + { + "epoch": 0.32116967686447967, + "grad_norm": 1.4245475468319873, + "learning_rate": 2.574780192234264e-05, + "loss": 0.7261, + "step": 2614 + }, + { + "epoch": 0.3212925420813368, + "grad_norm": 1.6144095494920554, + "learning_rate": 2.5743313283070015e-05, + "loss": 0.5976, + "step": 2615 + }, + { + "epoch": 0.3214154072981939, + "grad_norm": 1.3085805707846523, + "learning_rate": 2.573882266763458e-05, + "loss": 0.6626, + "step": 2616 + }, + { + "epoch": 0.321538272515051, + "grad_norm": 1.5154457376959152, + "learning_rate": 2.573433007686236e-05, + "loss": 0.6931, + "step": 2617 + }, + { + "epoch": 0.3216611377319081, + "grad_norm": 1.4225351102347, + "learning_rate": 2.572983551157974e-05, + "loss": 0.5263, + "step": 2618 + }, + { + "epoch": 0.3217840029487652, + "grad_norm": 1.2047673939135632, + "learning_rate": 2.572533897261346e-05, + "loss": 0.5607, + "step": 2619 + }, + { + "epoch": 0.3219068681656223, + "grad_norm": 1.1246564674742145, + "learning_rate": 2.5720840460790635e-05, + "loss": 0.6805, + "step": 2620 + }, + { + "epoch": 0.3220297333824794, + "grad_norm": 1.6353367158565286, + "learning_rate": 2.571633997693873e-05, + "loss": 0.612, + "step": 2621 + }, + { + "epoch": 0.32215259859933654, + "grad_norm": 1.5130014569591679, + "learning_rate": 2.571183752188559e-05, + "loss": 0.6231, + "step": 2622 + }, + { + "epoch": 0.32227546381619365, + "grad_norm": 1.3667676938426772, + "learning_rate": 2.57073330964594e-05, + "loss": 0.6656, + "step": 2623 + }, + { + "epoch": 0.32239832903305077, + "grad_norm": 1.3229940266448188, + "learning_rate": 2.5702826701488735e-05, + "loss": 0.7438, + "step": 2624 + }, + { + "epoch": 0.3225211942499078, + "grad_norm": 1.4347702153060102, + "learning_rate": 2.56983183378025e-05, + "loss": 0.7259, + "step": 2625 + }, + { + "epoch": 0.32264405946676494, + "grad_norm": 2.2042327438534524, + "learning_rate": 2.5693808006229988e-05, + "loss": 0.7014, + "step": 2626 + }, + { + "epoch": 0.32276692468362206, + "grad_norm": 1.3379183537386428, + "learning_rate": 2.5689295707600853e-05, + "loss": 0.5754, + "step": 2627 + }, + { + "epoch": 0.3228897899004792, + "grad_norm": 1.104038076891047, + "learning_rate": 2.568478144274509e-05, + "loss": 0.6649, + "step": 2628 + }, + { + "epoch": 0.3230126551173363, + "grad_norm": 1.2326051771159159, + "learning_rate": 2.568026521249307e-05, + "loss": 0.7723, + "step": 2629 + }, + { + "epoch": 0.3231355203341934, + "grad_norm": 1.4401760104379724, + "learning_rate": 2.5675747017675535e-05, + "loss": 0.6467, + "step": 2630 + }, + { + "epoch": 0.3232583855510505, + "grad_norm": 1.4325110418303435, + "learning_rate": 2.5671226859123567e-05, + "loss": 0.7172, + "step": 2631 + }, + { + "epoch": 0.3233812507679076, + "grad_norm": 1.2460257175717617, + "learning_rate": 2.5666704737668627e-05, + "loss": 0.6924, + "step": 2632 + }, + { + "epoch": 0.3235041159847647, + "grad_norm": 1.1567462888893196, + "learning_rate": 2.5662180654142523e-05, + "loss": 0.5933, + "step": 2633 + }, + { + "epoch": 0.3236269812016218, + "grad_norm": 1.2360458694802836, + "learning_rate": 2.5657654609377438e-05, + "loss": 0.6713, + "step": 2634 + }, + { + "epoch": 0.3237498464184789, + "grad_norm": 1.456236450735909, + "learning_rate": 2.56531266042059e-05, + "loss": 0.6664, + "step": 2635 + }, + { + "epoch": 0.32387271163533604, + "grad_norm": 1.3438996397427874, + "learning_rate": 2.564859663946081e-05, + "loss": 0.6311, + "step": 2636 + }, + { + "epoch": 0.32399557685219316, 
+ "grad_norm": 1.640426117356657, + "learning_rate": 2.564406471597543e-05, + "loss": 0.6873, + "step": 2637 + }, + { + "epoch": 0.32411844206905027, + "grad_norm": 1.3631561818408349, + "learning_rate": 2.563953083458338e-05, + "loss": 0.5908, + "step": 2638 + }, + { + "epoch": 0.32424130728590733, + "grad_norm": 1.7492913635085148, + "learning_rate": 2.5634994996118625e-05, + "loss": 0.5763, + "step": 2639 + }, + { + "epoch": 0.32436417250276445, + "grad_norm": 1.3650085867199575, + "learning_rate": 2.563045720141551e-05, + "loss": 0.8255, + "step": 2640 + }, + { + "epoch": 0.32448703771962156, + "grad_norm": 1.3667143011918912, + "learning_rate": 2.562591745130874e-05, + "loss": 0.6195, + "step": 2641 + }, + { + "epoch": 0.3246099029364787, + "grad_norm": 1.3464031111411778, + "learning_rate": 2.5621375746633363e-05, + "loss": 0.6702, + "step": 2642 + }, + { + "epoch": 0.3247327681533358, + "grad_norm": 1.3431064944452729, + "learning_rate": 2.56168320882248e-05, + "loss": 0.6261, + "step": 2643 + }, + { + "epoch": 0.3248556333701929, + "grad_norm": 1.4975466565509967, + "learning_rate": 2.561228647691883e-05, + "loss": 0.6689, + "step": 2644 + }, + { + "epoch": 0.32497849858705, + "grad_norm": 1.4209372711633905, + "learning_rate": 2.560773891355158e-05, + "loss": 0.7505, + "step": 2645 + }, + { + "epoch": 0.32510136380390714, + "grad_norm": 1.330257862460566, + "learning_rate": 2.5603189398959554e-05, + "loss": 0.7212, + "step": 2646 + }, + { + "epoch": 0.3252242290207642, + "grad_norm": 1.251823445855228, + "learning_rate": 2.55986379339796e-05, + "loss": 0.5255, + "step": 2647 + }, + { + "epoch": 0.3253470942376213, + "grad_norm": 1.23395845228261, + "learning_rate": 2.5594084519448934e-05, + "loss": 0.5615, + "step": 2648 + }, + { + "epoch": 0.32546995945447843, + "grad_norm": 1.2430184554090695, + "learning_rate": 2.5589529156205126e-05, + "loss": 0.6128, + "step": 2649 + }, + { + "epoch": 0.32559282467133555, + "grad_norm": 1.3269565551861917, + "learning_rate": 2.5584971845086107e-05, + "loss": 0.6181, + "step": 2650 + }, + { + "epoch": 0.32571568988819266, + "grad_norm": 1.197794710707406, + "learning_rate": 2.558041258693016e-05, + "loss": 0.5411, + "step": 2651 + }, + { + "epoch": 0.3258385551050498, + "grad_norm": 1.7487812007064714, + "learning_rate": 2.5575851382575935e-05, + "loss": 0.6968, + "step": 2652 + }, + { + "epoch": 0.3259614203219069, + "grad_norm": 1.2618777226109428, + "learning_rate": 2.5571288232862433e-05, + "loss": 0.5781, + "step": 2653 + }, + { + "epoch": 0.32608428553876395, + "grad_norm": 1.3879980676624495, + "learning_rate": 2.556672313862902e-05, + "loss": 0.6464, + "step": 2654 + }, + { + "epoch": 0.32620715075562107, + "grad_norm": 1.6431412314944789, + "learning_rate": 2.556215610071541e-05, + "loss": 0.7289, + "step": 2655 + }, + { + "epoch": 0.3263300159724782, + "grad_norm": 1.3774703472867555, + "learning_rate": 2.555758711996169e-05, + "loss": 0.7313, + "step": 2656 + }, + { + "epoch": 0.3264528811893353, + "grad_norm": 1.3691435047088343, + "learning_rate": 2.5553016197208282e-05, + "loss": 0.7864, + "step": 2657 + }, + { + "epoch": 0.3265757464061924, + "grad_norm": 1.5203732616078796, + "learning_rate": 2.5548443333295984e-05, + "loss": 0.6866, + "step": 2658 + }, + { + "epoch": 0.32669861162304953, + "grad_norm": 1.3141009154740049, + "learning_rate": 2.5543868529065944e-05, + "loss": 0.5628, + "step": 2659 + }, + { + "epoch": 0.32682147683990664, + "grad_norm": 1.5009999433405659, + "learning_rate": 2.5539291785359672e-05, + "loss": 
0.676, + "step": 2660 + }, + { + "epoch": 0.3269443420567637, + "grad_norm": 1.5396464659175015, + "learning_rate": 2.553471310301902e-05, + "loss": 0.6434, + "step": 2661 + }, + { + "epoch": 0.3270672072736208, + "grad_norm": 1.375997399182245, + "learning_rate": 2.5530132482886215e-05, + "loss": 0.6876, + "step": 2662 + }, + { + "epoch": 0.32719007249047793, + "grad_norm": 1.3073619411919035, + "learning_rate": 2.552554992580383e-05, + "loss": 0.6325, + "step": 2663 + }, + { + "epoch": 0.32731293770733505, + "grad_norm": 1.491340639576816, + "learning_rate": 2.55209654326148e-05, + "loss": 0.7321, + "step": 2664 + }, + { + "epoch": 0.32743580292419217, + "grad_norm": 1.2801826898829267, + "learning_rate": 2.5516379004162402e-05, + "loss": 0.5597, + "step": 2665 + }, + { + "epoch": 0.3275586681410493, + "grad_norm": 1.4700304633000156, + "learning_rate": 2.5511790641290292e-05, + "loss": 0.639, + "step": 2666 + }, + { + "epoch": 0.3276815333579064, + "grad_norm": 1.2378453323075547, + "learning_rate": 2.5507200344842466e-05, + "loss": 0.5537, + "step": 2667 + }, + { + "epoch": 0.3278043985747635, + "grad_norm": 1.3084044018497565, + "learning_rate": 2.5502608115663275e-05, + "loss": 0.5727, + "step": 2668 + }, + { + "epoch": 0.32792726379162057, + "grad_norm": 1.3122542979756189, + "learning_rate": 2.5498013954597435e-05, + "loss": 0.5632, + "step": 2669 + }, + { + "epoch": 0.3280501290084777, + "grad_norm": 1.3338882694787164, + "learning_rate": 2.5493417862490013e-05, + "loss": 0.5219, + "step": 2670 + }, + { + "epoch": 0.3281729942253348, + "grad_norm": 1.26135798450435, + "learning_rate": 2.548881984018642e-05, + "loss": 0.6892, + "step": 2671 + }, + { + "epoch": 0.3282958594421919, + "grad_norm": 1.3744655841250497, + "learning_rate": 2.5484219888532443e-05, + "loss": 0.5929, + "step": 2672 + }, + { + "epoch": 0.32841872465904903, + "grad_norm": 1.3487172955431832, + "learning_rate": 2.547961800837421e-05, + "loss": 0.6916, + "step": 2673 + }, + { + "epoch": 0.32854158987590615, + "grad_norm": 1.3231541444458397, + "learning_rate": 2.547501420055821e-05, + "loss": 0.7199, + "step": 2674 + }, + { + "epoch": 0.32866445509276326, + "grad_norm": 1.2799948731013884, + "learning_rate": 2.5470408465931277e-05, + "loss": 0.6268, + "step": 2675 + }, + { + "epoch": 0.3287873203096203, + "grad_norm": 1.5946570981689958, + "learning_rate": 2.5465800805340613e-05, + "loss": 0.6189, + "step": 2676 + }, + { + "epoch": 0.32891018552647744, + "grad_norm": 1.3423578702699086, + "learning_rate": 2.546119121963376e-05, + "loss": 0.5986, + "step": 2677 + }, + { + "epoch": 0.32903305074333455, + "grad_norm": 1.2223057524868068, + "learning_rate": 2.5456579709658632e-05, + "loss": 0.5736, + "step": 2678 + }, + { + "epoch": 0.32915591596019167, + "grad_norm": 1.4838679174060914, + "learning_rate": 2.5451966276263472e-05, + "loss": 0.6286, + "step": 2679 + }, + { + "epoch": 0.3292787811770488, + "grad_norm": 1.3179843553670556, + "learning_rate": 2.5447350920296902e-05, + "loss": 0.6856, + "step": 2680 + }, + { + "epoch": 0.3294016463939059, + "grad_norm": 1.5657305872870284, + "learning_rate": 2.5442733642607888e-05, + "loss": 0.6464, + "step": 2681 + }, + { + "epoch": 0.329524511610763, + "grad_norm": 1.2481930659503457, + "learning_rate": 2.5438114444045738e-05, + "loss": 0.6587, + "step": 2682 + }, + { + "epoch": 0.3296473768276201, + "grad_norm": 1.2656631835055256, + "learning_rate": 2.543349332546013e-05, + "loss": 0.5842, + "step": 2683 + }, + { + "epoch": 0.3297702420444772, + "grad_norm": 
1.3813778455726833, + "learning_rate": 2.5428870287701088e-05, + "loss": 0.7175, + "step": 2684 + }, + { + "epoch": 0.3298931072613343, + "grad_norm": 1.4680904203293106, + "learning_rate": 2.5424245331618992e-05, + "loss": 0.5517, + "step": 2685 + }, + { + "epoch": 0.3300159724781914, + "grad_norm": 1.1455593623385454, + "learning_rate": 2.541961845806457e-05, + "loss": 0.5961, + "step": 2686 + }, + { + "epoch": 0.33013883769504854, + "grad_norm": 1.5395167741825477, + "learning_rate": 2.541498966788891e-05, + "loss": 0.7804, + "step": 2687 + }, + { + "epoch": 0.33026170291190565, + "grad_norm": 1.2925538029626353, + "learning_rate": 2.541035896194344e-05, + "loss": 0.6675, + "step": 2688 + }, + { + "epoch": 0.33038456812876277, + "grad_norm": 1.2209973428780587, + "learning_rate": 2.5405726341079955e-05, + "loss": 0.5646, + "step": 2689 + }, + { + "epoch": 0.3305074333456198, + "grad_norm": 1.575950000734113, + "learning_rate": 2.540109180615059e-05, + "loss": 0.5994, + "step": 2690 + }, + { + "epoch": 0.33063029856247694, + "grad_norm": 1.5933673430400759, + "learning_rate": 2.5396455358007843e-05, + "loss": 0.6644, + "step": 2691 + }, + { + "epoch": 0.33075316377933406, + "grad_norm": 1.311588283744037, + "learning_rate": 2.5391816997504552e-05, + "loss": 0.6193, + "step": 2692 + }, + { + "epoch": 0.3308760289961912, + "grad_norm": 1.3308232294119642, + "learning_rate": 2.5387176725493922e-05, + "loss": 0.6086, + "step": 2693 + }, + { + "epoch": 0.3309988942130483, + "grad_norm": 1.5444151986439603, + "learning_rate": 2.5382534542829497e-05, + "loss": 0.6461, + "step": 2694 + }, + { + "epoch": 0.3311217594299054, + "grad_norm": 1.2453342570047887, + "learning_rate": 2.5377890450365174e-05, + "loss": 0.7758, + "step": 2695 + }, + { + "epoch": 0.3312446246467625, + "grad_norm": 1.4100841748747668, + "learning_rate": 2.5373244448955207e-05, + "loss": 0.6867, + "step": 2696 + }, + { + "epoch": 0.33136748986361964, + "grad_norm": 1.333463795517214, + "learning_rate": 2.5368596539454195e-05, + "loss": 0.6467, + "step": 2697 + }, + { + "epoch": 0.3314903550804767, + "grad_norm": 1.4209955828809815, + "learning_rate": 2.536394672271709e-05, + "loss": 0.6587, + "step": 2698 + }, + { + "epoch": 0.3316132202973338, + "grad_norm": 1.6442228098945157, + "learning_rate": 2.5359294999599204e-05, + "loss": 0.6933, + "step": 2699 + }, + { + "epoch": 0.3317360855141909, + "grad_norm": 1.4409559993116852, + "learning_rate": 2.5354641370956184e-05, + "loss": 0.6646, + "step": 2700 + }, + { + "epoch": 0.33185895073104804, + "grad_norm": 1.4543784476723773, + "learning_rate": 2.5349985837644033e-05, + "loss": 0.6004, + "step": 2701 + }, + { + "epoch": 0.33198181594790516, + "grad_norm": 1.2588493257173354, + "learning_rate": 2.5345328400519112e-05, + "loss": 0.6592, + "step": 2702 + }, + { + "epoch": 0.33210468116476227, + "grad_norm": 1.1902975262865394, + "learning_rate": 2.534066906043812e-05, + "loss": 0.6165, + "step": 2703 + }, + { + "epoch": 0.3322275463816194, + "grad_norm": 1.1191003799348873, + "learning_rate": 2.533600781825812e-05, + "loss": 0.7186, + "step": 2704 + }, + { + "epoch": 0.33235041159847645, + "grad_norm": 1.608572413844205, + "learning_rate": 2.533134467483651e-05, + "loss": 0.7429, + "step": 2705 + }, + { + "epoch": 0.33247327681533356, + "grad_norm": 1.5401357608432849, + "learning_rate": 2.532667963103105e-05, + "loss": 0.6945, + "step": 2706 + }, + { + "epoch": 0.3325961420321907, + "grad_norm": 1.5053801691974358, + "learning_rate": 2.532201268769984e-05, + "loss": 0.5584, + 
"step": 2707 + }, + { + "epoch": 0.3327190072490478, + "grad_norm": 1.5834434583335115, + "learning_rate": 2.531734384570134e-05, + "loss": 0.7287, + "step": 2708 + }, + { + "epoch": 0.3328418724659049, + "grad_norm": 1.5481727048817362, + "learning_rate": 2.5312673105894347e-05, + "loss": 0.5778, + "step": 2709 + }, + { + "epoch": 0.332964737682762, + "grad_norm": 1.2860392580619513, + "learning_rate": 2.530800046913802e-05, + "loss": 0.5164, + "step": 2710 + }, + { + "epoch": 0.33308760289961914, + "grad_norm": 1.3512751264741225, + "learning_rate": 2.5303325936291853e-05, + "loss": 0.6541, + "step": 2711 + }, + { + "epoch": 0.3332104681164762, + "grad_norm": 1.272965541221011, + "learning_rate": 2.5298649508215702e-05, + "loss": 0.5421, + "step": 2712 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.388388404583162, + "learning_rate": 2.529397118576976e-05, + "loss": 0.6003, + "step": 2713 + }, + { + "epoch": 0.33345619855019043, + "grad_norm": 1.2818228689222484, + "learning_rate": 2.5289290969814582e-05, + "loss": 0.6775, + "step": 2714 + }, + { + "epoch": 0.33357906376704755, + "grad_norm": 1.4209528049678357, + "learning_rate": 2.5284608861211053e-05, + "loss": 0.7344, + "step": 2715 + }, + { + "epoch": 0.33370192898390466, + "grad_norm": 1.3136830686068734, + "learning_rate": 2.527992486082042e-05, + "loss": 0.6331, + "step": 2716 + }, + { + "epoch": 0.3338247942007618, + "grad_norm": 1.3335549633808954, + "learning_rate": 2.5275238969504288e-05, + "loss": 0.5645, + "step": 2717 + }, + { + "epoch": 0.3339476594176189, + "grad_norm": 1.4891249762505194, + "learning_rate": 2.5270551188124572e-05, + "loss": 0.6508, + "step": 2718 + }, + { + "epoch": 0.334070524634476, + "grad_norm": 1.3969083060238623, + "learning_rate": 2.526586151754358e-05, + "loss": 0.726, + "step": 2719 + }, + { + "epoch": 0.33419338985133307, + "grad_norm": 1.297573137597818, + "learning_rate": 2.5261169958623937e-05, + "loss": 0.6166, + "step": 2720 + }, + { + "epoch": 0.3343162550681902, + "grad_norm": 1.5822075367930928, + "learning_rate": 2.5256476512228625e-05, + "loss": 0.6672, + "step": 2721 + }, + { + "epoch": 0.3344391202850473, + "grad_norm": 1.279749283220469, + "learning_rate": 2.5251781179220973e-05, + "loss": 0.6018, + "step": 2722 + }, + { + "epoch": 0.3345619855019044, + "grad_norm": 1.5204670722317952, + "learning_rate": 2.524708396046466e-05, + "loss": 0.7262, + "step": 2723 + }, + { + "epoch": 0.33468485071876153, + "grad_norm": 1.1414499242267164, + "learning_rate": 2.5242384856823703e-05, + "loss": 0.7091, + "step": 2724 + }, + { + "epoch": 0.33480771593561864, + "grad_norm": 1.3445900356264866, + "learning_rate": 2.523768386916248e-05, + "loss": 0.5157, + "step": 2725 + }, + { + "epoch": 0.33493058115247576, + "grad_norm": 1.1917871966739255, + "learning_rate": 2.5232980998345702e-05, + "loss": 0.6323, + "step": 2726 + }, + { + "epoch": 0.3350534463693328, + "grad_norm": 1.3056607806536895, + "learning_rate": 2.522827624523844e-05, + "loss": 0.6727, + "step": 2727 + }, + { + "epoch": 0.33517631158618993, + "grad_norm": 1.4234768935791653, + "learning_rate": 2.522356961070608e-05, + "loss": 0.662, + "step": 2728 + }, + { + "epoch": 0.33529917680304705, + "grad_norm": 1.2937528259628377, + "learning_rate": 2.5218861095614404e-05, + "loss": 0.6049, + "step": 2729 + }, + { + "epoch": 0.33542204201990417, + "grad_norm": 1.2546859420856482, + "learning_rate": 2.5214150700829497e-05, + "loss": 0.5489, + "step": 2730 + }, + { + "epoch": 0.3355449072367613, + "grad_norm": 1.3218681532014407, 
+ "learning_rate": 2.520943842721781e-05, + "loss": 0.6688, + "step": 2731 + }, + { + "epoch": 0.3356677724536184, + "grad_norm": 1.4158680349182347, + "learning_rate": 2.5204724275646132e-05, + "loss": 0.689, + "step": 2732 + }, + { + "epoch": 0.3357906376704755, + "grad_norm": 1.5396225500608862, + "learning_rate": 2.5200008246981612e-05, + "loss": 0.551, + "step": 2733 + }, + { + "epoch": 0.33591350288733257, + "grad_norm": 1.8526086257114844, + "learning_rate": 2.5195290342091717e-05, + "loss": 0.6192, + "step": 2734 + }, + { + "epoch": 0.3360363681041897, + "grad_norm": 1.286306052313637, + "learning_rate": 2.5190570561844283e-05, + "loss": 0.6217, + "step": 2735 + }, + { + "epoch": 0.3361592333210468, + "grad_norm": 1.3958622013242172, + "learning_rate": 2.5185848907107485e-05, + "loss": 0.6642, + "step": 2736 + }, + { + "epoch": 0.3362820985379039, + "grad_norm": 1.543902295533319, + "learning_rate": 2.5181125378749834e-05, + "loss": 0.6359, + "step": 2737 + }, + { + "epoch": 0.33640496375476103, + "grad_norm": 1.2221201211218737, + "learning_rate": 2.5176399977640202e-05, + "loss": 0.6478, + "step": 2738 + }, + { + "epoch": 0.33652782897161815, + "grad_norm": 1.3325713769700507, + "learning_rate": 2.5171672704647785e-05, + "loss": 0.6282, + "step": 2739 + }, + { + "epoch": 0.33665069418847526, + "grad_norm": 1.4765669065240266, + "learning_rate": 2.5166943560642145e-05, + "loss": 0.5777, + "step": 2740 + }, + { + "epoch": 0.3367735594053323, + "grad_norm": 1.3987573950506902, + "learning_rate": 2.5162212546493166e-05, + "loss": 0.5671, + "step": 2741 + }, + { + "epoch": 0.33689642462218944, + "grad_norm": 1.4734856645399652, + "learning_rate": 2.5157479663071096e-05, + "loss": 0.6656, + "step": 2742 + }, + { + "epoch": 0.33701928983904655, + "grad_norm": 1.4584227434908958, + "learning_rate": 2.5152744911246516e-05, + "loss": 0.6472, + "step": 2743 + }, + { + "epoch": 0.33714215505590367, + "grad_norm": 1.3409594007849681, + "learning_rate": 2.5148008291890358e-05, + "loss": 0.6361, + "step": 2744 + }, + { + "epoch": 0.3372650202727608, + "grad_norm": 1.413283015605535, + "learning_rate": 2.5143269805873877e-05, + "loss": 0.6315, + "step": 2745 + }, + { + "epoch": 0.3373878854896179, + "grad_norm": 1.3803259262908205, + "learning_rate": 2.5138529454068704e-05, + "loss": 0.5988, + "step": 2746 + }, + { + "epoch": 0.337510750706475, + "grad_norm": 1.6594303520450842, + "learning_rate": 2.513378723734678e-05, + "loss": 0.6848, + "step": 2747 + }, + { + "epoch": 0.33763361592333213, + "grad_norm": 1.2449683179012523, + "learning_rate": 2.512904315658042e-05, + "loss": 0.5339, + "step": 2748 + }, + { + "epoch": 0.3377564811401892, + "grad_norm": 1.4042392133664494, + "learning_rate": 2.5124297212642263e-05, + "loss": 0.7336, + "step": 2749 + }, + { + "epoch": 0.3378793463570463, + "grad_norm": 1.4638731582483036, + "learning_rate": 2.511954940640529e-05, + "loss": 0.6547, + "step": 2750 + }, + { + "epoch": 0.3380022115739034, + "grad_norm": 1.229619716744784, + "learning_rate": 2.5114799738742827e-05, + "loss": 0.5844, + "step": 2751 + }, + { + "epoch": 0.33812507679076054, + "grad_norm": 1.2747095944222078, + "learning_rate": 2.511004821052855e-05, + "loss": 0.6961, + "step": 2752 + }, + { + "epoch": 0.33824794200761765, + "grad_norm": 1.3692451004362243, + "learning_rate": 2.5105294822636476e-05, + "loss": 0.6996, + "step": 2753 + }, + { + "epoch": 0.33837080722447477, + "grad_norm": 1.2941705514138502, + "learning_rate": 2.510053957594095e-05, + "loss": 0.7397, + "step": 2754 + }, + 
{ + "epoch": 0.3384936724413319, + "grad_norm": 1.4212208518475902, + "learning_rate": 2.5095782471316676e-05, + "loss": 0.674, + "step": 2755 + }, + { + "epoch": 0.33861653765818894, + "grad_norm": 1.4556614335168183, + "learning_rate": 2.5091023509638688e-05, + "loss": 0.5659, + "step": 2756 + }, + { + "epoch": 0.33873940287504606, + "grad_norm": 1.2751085512651674, + "learning_rate": 2.5086262691782366e-05, + "loss": 0.6139, + "step": 2757 + }, + { + "epoch": 0.3388622680919032, + "grad_norm": 1.329054882311604, + "learning_rate": 2.5081500018623436e-05, + "loss": 0.738, + "step": 2758 + }, + { + "epoch": 0.3389851333087603, + "grad_norm": 1.2996808598919685, + "learning_rate": 2.5076735491037958e-05, + "loss": 0.7835, + "step": 2759 + }, + { + "epoch": 0.3391079985256174, + "grad_norm": 1.349836699553481, + "learning_rate": 2.5071969109902334e-05, + "loss": 0.5799, + "step": 2760 + }, + { + "epoch": 0.3392308637424745, + "grad_norm": 1.2155478307122192, + "learning_rate": 2.5067200876093316e-05, + "loss": 0.6015, + "step": 2761 + }, + { + "epoch": 0.33935372895933164, + "grad_norm": 1.4036067587199046, + "learning_rate": 2.506243079048798e-05, + "loss": 0.6481, + "step": 2762 + }, + { + "epoch": 0.3394765941761887, + "grad_norm": 1.766343039304458, + "learning_rate": 2.505765885396376e-05, + "loss": 0.6644, + "step": 2763 + }, + { + "epoch": 0.3395994593930458, + "grad_norm": 1.3903174524425181, + "learning_rate": 2.5052885067398423e-05, + "loss": 0.6531, + "step": 2764 + }, + { + "epoch": 0.3397223246099029, + "grad_norm": 1.1540918235132587, + "learning_rate": 2.504810943167007e-05, + "loss": 0.7156, + "step": 2765 + }, + { + "epoch": 0.33984518982676004, + "grad_norm": 1.4025129185362368, + "learning_rate": 2.5043331947657147e-05, + "loss": 0.6622, + "step": 2766 + }, + { + "epoch": 0.33996805504361716, + "grad_norm": 1.2726252285132713, + "learning_rate": 2.503855261623845e-05, + "loss": 0.7039, + "step": 2767 + }, + { + "epoch": 0.34009092026047427, + "grad_norm": 1.4769809037173156, + "learning_rate": 2.5033771438293104e-05, + "loss": 0.6342, + "step": 2768 + }, + { + "epoch": 0.3402137854773314, + "grad_norm": 1.528388962909006, + "learning_rate": 2.5028988414700573e-05, + "loss": 0.6819, + "step": 2769 + }, + { + "epoch": 0.3403366506941885, + "grad_norm": 1.4600954675953655, + "learning_rate": 2.5024203546340657e-05, + "loss": 0.7113, + "step": 2770 + }, + { + "epoch": 0.34045951591104556, + "grad_norm": 1.299534406526137, + "learning_rate": 2.5019416834093513e-05, + "loss": 0.7765, + "step": 2771 + }, + { + "epoch": 0.3405823811279027, + "grad_norm": 1.472797811816017, + "learning_rate": 2.5014628278839617e-05, + "loss": 0.6378, + "step": 2772 + }, + { + "epoch": 0.3407052463447598, + "grad_norm": 1.3446742762382806, + "learning_rate": 2.5009837881459805e-05, + "loss": 0.614, + "step": 2773 + }, + { + "epoch": 0.3408281115616169, + "grad_norm": 1.5341673680966372, + "learning_rate": 2.5005045642835223e-05, + "loss": 0.6235, + "step": 2774 + }, + { + "epoch": 0.340950976778474, + "grad_norm": 1.1800236080442934, + "learning_rate": 2.5000251563847378e-05, + "loss": 0.5598, + "step": 2775 + }, + { + "epoch": 0.34107384199533114, + "grad_norm": 1.2057060641162696, + "learning_rate": 2.4995455645378114e-05, + "loss": 0.6298, + "step": 2776 + }, + { + "epoch": 0.34119670721218825, + "grad_norm": 1.3380564012798153, + "learning_rate": 2.499065788830961e-05, + "loss": 0.6323, + "step": 2777 + }, + { + "epoch": 0.3413195724290453, + "grad_norm": 1.220620409791524, + "learning_rate": 
2.498585829352438e-05, + "loss": 0.6799, + "step": 2778 + }, + { + "epoch": 0.34144243764590243, + "grad_norm": 1.2332615406153689, + "learning_rate": 2.498105686190527e-05, + "loss": 0.6106, + "step": 2779 + }, + { + "epoch": 0.34156530286275955, + "grad_norm": 1.186125929055212, + "learning_rate": 2.4976253594335485e-05, + "loss": 0.5799, + "step": 2780 + }, + { + "epoch": 0.34168816807961666, + "grad_norm": 1.4226232607385074, + "learning_rate": 2.497144849169855e-05, + "loss": 0.6987, + "step": 2781 + }, + { + "epoch": 0.3418110332964738, + "grad_norm": 1.3083254225314636, + "learning_rate": 2.4966641554878332e-05, + "loss": 0.6338, + "step": 2782 + }, + { + "epoch": 0.3419338985133309, + "grad_norm": 1.6876847834021844, + "learning_rate": 2.4961832784759037e-05, + "loss": 0.5879, + "step": 2783 + }, + { + "epoch": 0.342056763730188, + "grad_norm": 1.48442532949755, + "learning_rate": 2.49570221822252e-05, + "loss": 0.7045, + "step": 2784 + }, + { + "epoch": 0.34217962894704507, + "grad_norm": 1.4916725424356772, + "learning_rate": 2.4952209748161708e-05, + "loss": 0.7509, + "step": 2785 + }, + { + "epoch": 0.3423024941639022, + "grad_norm": 1.2823394690344974, + "learning_rate": 2.494739548345378e-05, + "loss": 0.5919, + "step": 2786 + }, + { + "epoch": 0.3424253593807593, + "grad_norm": 1.41506627426524, + "learning_rate": 2.494257938898696e-05, + "loss": 0.6549, + "step": 2787 + }, + { + "epoch": 0.3425482245976164, + "grad_norm": 1.1970963774935819, + "learning_rate": 2.4937761465647144e-05, + "loss": 0.6614, + "step": 2788 + }, + { + "epoch": 0.34267108981447353, + "grad_norm": 1.3109448165825661, + "learning_rate": 2.4932941714320552e-05, + "loss": 0.6936, + "step": 2789 + }, + { + "epoch": 0.34279395503133064, + "grad_norm": 1.443510754212926, + "learning_rate": 2.4928120135893752e-05, + "loss": 0.6727, + "step": 2790 + }, + { + "epoch": 0.34291682024818776, + "grad_norm": 1.402077777923276, + "learning_rate": 2.4923296731253635e-05, + "loss": 0.5513, + "step": 2791 + }, + { + "epoch": 0.3430396854650448, + "grad_norm": 1.3347071693240653, + "learning_rate": 2.4918471501287447e-05, + "loss": 0.6354, + "step": 2792 + }, + { + "epoch": 0.34316255068190193, + "grad_norm": 1.1506939440778963, + "learning_rate": 2.491364444688274e-05, + "loss": 0.5777, + "step": 2793 + }, + { + "epoch": 0.34328541589875905, + "grad_norm": 1.1678002005096746, + "learning_rate": 2.4908815568927435e-05, + "loss": 0.6605, + "step": 2794 + }, + { + "epoch": 0.34340828111561617, + "grad_norm": 1.5583626133180346, + "learning_rate": 2.4903984868309768e-05, + "loss": 0.6601, + "step": 2795 + }, + { + "epoch": 0.3435311463324733, + "grad_norm": 1.3655131868215489, + "learning_rate": 2.489915234591831e-05, + "loss": 0.5476, + "step": 2796 + }, + { + "epoch": 0.3436540115493304, + "grad_norm": 1.1183084303794542, + "learning_rate": 2.489431800264198e-05, + "loss": 0.566, + "step": 2797 + }, + { + "epoch": 0.3437768767661875, + "grad_norm": 1.319563121580085, + "learning_rate": 2.488948183937002e-05, + "loss": 0.606, + "step": 2798 + }, + { + "epoch": 0.3438997419830446, + "grad_norm": 1.2691685394506749, + "learning_rate": 2.4884643856992008e-05, + "loss": 0.7269, + "step": 2799 + }, + { + "epoch": 0.3440226071999017, + "grad_norm": 1.3109649351979273, + "learning_rate": 2.4879804056397865e-05, + "loss": 0.6612, + "step": 2800 + }, + { + "epoch": 0.3441454724167588, + "grad_norm": 1.2155566932415305, + "learning_rate": 2.4874962438477838e-05, + "loss": 0.6405, + "step": 2801 + }, + { + "epoch": 
0.3442683376336159, + "grad_norm": 1.1817827721064367, + "learning_rate": 2.487011900412251e-05, + "loss": 0.6778, + "step": 2802 + }, + { + "epoch": 0.34439120285047303, + "grad_norm": 1.5654657202255273, + "learning_rate": 2.4865273754222805e-05, + "loss": 0.5839, + "step": 2803 + }, + { + "epoch": 0.34451406806733015, + "grad_norm": 1.3171313231504846, + "learning_rate": 2.4860426689669965e-05, + "loss": 0.7002, + "step": 2804 + }, + { + "epoch": 0.34463693328418726, + "grad_norm": 1.2647864512785145, + "learning_rate": 2.485557781135559e-05, + "loss": 0.6973, + "step": 2805 + }, + { + "epoch": 0.3447597985010444, + "grad_norm": 1.4402805296448422, + "learning_rate": 2.485072712017159e-05, + "loss": 0.6464, + "step": 2806 + }, + { + "epoch": 0.34488266371790144, + "grad_norm": 1.3949273396598187, + "learning_rate": 2.4845874617010218e-05, + "loss": 0.6053, + "step": 2807 + }, + { + "epoch": 0.34500552893475855, + "grad_norm": 1.2580850902397493, + "learning_rate": 2.4841020302764066e-05, + "loss": 0.566, + "step": 2808 + }, + { + "epoch": 0.34512839415161567, + "grad_norm": 1.3398063403516818, + "learning_rate": 2.483616417832605e-05, + "loss": 0.6812, + "step": 2809 + }, + { + "epoch": 0.3452512593684728, + "grad_norm": 1.3316264936952265, + "learning_rate": 2.483130624458942e-05, + "loss": 0.7361, + "step": 2810 + }, + { + "epoch": 0.3453741245853299, + "grad_norm": 1.8385969532029396, + "learning_rate": 2.4826446502447767e-05, + "loss": 0.6795, + "step": 2811 + }, + { + "epoch": 0.345496989802187, + "grad_norm": 2.019773098001874, + "learning_rate": 2.482158495279501e-05, + "loss": 0.6371, + "step": 2812 + }, + { + "epoch": 0.34561985501904413, + "grad_norm": 1.404725207515585, + "learning_rate": 2.4816721596525392e-05, + "loss": 0.6955, + "step": 2813 + }, + { + "epoch": 0.3457427202359012, + "grad_norm": 1.2622116461730961, + "learning_rate": 2.4811856434533497e-05, + "loss": 0.5712, + "step": 2814 + }, + { + "epoch": 0.3458655854527583, + "grad_norm": 1.3542839252628665, + "learning_rate": 2.480698946771425e-05, + "loss": 0.6087, + "step": 2815 + }, + { + "epoch": 0.3459884506696154, + "grad_norm": 1.589661558271548, + "learning_rate": 2.4802120696962886e-05, + "loss": 0.6571, + "step": 2816 + }, + { + "epoch": 0.34611131588647254, + "grad_norm": 1.6157299451119107, + "learning_rate": 2.4797250123174993e-05, + "loss": 0.6621, + "step": 2817 + }, + { + "epoch": 0.34623418110332965, + "grad_norm": 1.7434374568652977, + "learning_rate": 2.479237774724647e-05, + "loss": 0.7823, + "step": 2818 + }, + { + "epoch": 0.34635704632018677, + "grad_norm": 1.2362406618938764, + "learning_rate": 2.4787503570073574e-05, + "loss": 0.5939, + "step": 2819 + }, + { + "epoch": 0.3464799115370439, + "grad_norm": 1.4466765788750935, + "learning_rate": 2.478262759255287e-05, + "loss": 0.6847, + "step": 2820 + }, + { + "epoch": 0.34660277675390094, + "grad_norm": 1.4761913865046838, + "learning_rate": 2.4777749815581258e-05, + "loss": 0.5929, + "step": 2821 + }, + { + "epoch": 0.34672564197075806, + "grad_norm": 1.7937272243190563, + "learning_rate": 2.477287024005598e-05, + "loss": 0.5303, + "step": 2822 + }, + { + "epoch": 0.3468485071876152, + "grad_norm": 1.3375731507145783, + "learning_rate": 2.4767988866874604e-05, + "loss": 0.733, + "step": 2823 + }, + { + "epoch": 0.3469713724044723, + "grad_norm": 1.3910293593881524, + "learning_rate": 2.4763105696935016e-05, + "loss": 0.6012, + "step": 2824 + }, + { + "epoch": 0.3470942376213294, + "grad_norm": 1.1040308914500607, + "learning_rate": 
2.4758220731135456e-05, + "loss": 0.5913, + "step": 2825 + }, + { + "epoch": 0.3472171028381865, + "grad_norm": 1.2663447998245985, + "learning_rate": 2.475333397037448e-05, + "loss": 0.6959, + "step": 2826 + }, + { + "epoch": 0.34733996805504364, + "grad_norm": 1.3619351390825811, + "learning_rate": 2.4748445415550964e-05, + "loss": 0.66, + "step": 2827 + }, + { + "epoch": 0.34746283327190075, + "grad_norm": 1.293708058290336, + "learning_rate": 2.4743555067564144e-05, + "loss": 0.6855, + "step": 2828 + }, + { + "epoch": 0.3475856984887578, + "grad_norm": 1.425947759738349, + "learning_rate": 2.473866292731355e-05, + "loss": 0.6866, + "step": 2829 + }, + { + "epoch": 0.3477085637056149, + "grad_norm": 1.423324542436127, + "learning_rate": 2.4733768995699077e-05, + "loss": 0.6774, + "step": 2830 + }, + { + "epoch": 0.34783142892247204, + "grad_norm": 1.2326684636687357, + "learning_rate": 2.4728873273620918e-05, + "loss": 0.7282, + "step": 2831 + }, + { + "epoch": 0.34795429413932916, + "grad_norm": 1.2116439917394042, + "learning_rate": 2.4723975761979615e-05, + "loss": 0.7036, + "step": 2832 + }, + { + "epoch": 0.34807715935618627, + "grad_norm": 1.4166775644935086, + "learning_rate": 2.4719076461676033e-05, + "loss": 0.5902, + "step": 2833 + }, + { + "epoch": 0.3482000245730434, + "grad_norm": 1.6258721347411786, + "learning_rate": 2.4714175373611365e-05, + "loss": 0.7292, + "step": 2834 + }, + { + "epoch": 0.3483228897899005, + "grad_norm": 1.501000329257634, + "learning_rate": 2.4709272498687135e-05, + "loss": 0.6987, + "step": 2835 + }, + { + "epoch": 0.34844575500675756, + "grad_norm": 1.5896820779091139, + "learning_rate": 2.47043678378052e-05, + "loss": 0.6031, + "step": 2836 + }, + { + "epoch": 0.3485686202236147, + "grad_norm": 1.5609640525643038, + "learning_rate": 2.469946139186773e-05, + "loss": 0.6056, + "step": 2837 + }, + { + "epoch": 0.3486914854404718, + "grad_norm": 1.2764317496336732, + "learning_rate": 2.4694553161777246e-05, + "loss": 0.6027, + "step": 2838 + }, + { + "epoch": 0.3488143506573289, + "grad_norm": 1.4441067183201504, + "learning_rate": 2.4689643148436577e-05, + "loss": 0.6499, + "step": 2839 + }, + { + "epoch": 0.348937215874186, + "grad_norm": 1.3555668270768086, + "learning_rate": 2.4684731352748893e-05, + "loss": 0.5842, + "step": 2840 + }, + { + "epoch": 0.34906008109104314, + "grad_norm": 1.3296748432849688, + "learning_rate": 2.4679817775617675e-05, + "loss": 0.6204, + "step": 2841 + }, + { + "epoch": 0.34918294630790025, + "grad_norm": 1.5537789695755568, + "learning_rate": 2.4674902417946763e-05, + "loss": 0.7813, + "step": 2842 + }, + { + "epoch": 0.3493058115247573, + "grad_norm": 1.3604099855997405, + "learning_rate": 2.466998528064029e-05, + "loss": 0.6389, + "step": 2843 + }, + { + "epoch": 0.34942867674161443, + "grad_norm": 1.2596162155182409, + "learning_rate": 2.4665066364602743e-05, + "loss": 0.5566, + "step": 2844 + }, + { + "epoch": 0.34955154195847155, + "grad_norm": 1.2221445019611434, + "learning_rate": 2.4660145670738914e-05, + "loss": 0.8065, + "step": 2845 + }, + { + "epoch": 0.34967440717532866, + "grad_norm": 1.2943068305962433, + "learning_rate": 2.4655223199953932e-05, + "loss": 0.6082, + "step": 2846 + }, + { + "epoch": 0.3497972723921858, + "grad_norm": 1.391852438593883, + "learning_rate": 2.4650298953153265e-05, + "loss": 0.8128, + "step": 2847 + }, + { + "epoch": 0.3499201376090429, + "grad_norm": 1.297446988158341, + "learning_rate": 2.4645372931242692e-05, + "loss": 0.5959, + "step": 2848 + }, + { + "epoch": 
0.3500430028259, + "grad_norm": 1.4850173400831785, + "learning_rate": 2.4640445135128317e-05, + "loss": 0.6086, + "step": 2849 + }, + { + "epoch": 0.3501658680427571, + "grad_norm": 1.3867025911455704, + "learning_rate": 2.4635515565716577e-05, + "loss": 0.586, + "step": 2850 + }, + { + "epoch": 0.3502887332596142, + "grad_norm": 1.2045176476940846, + "learning_rate": 2.463058422391424e-05, + "loss": 0.6751, + "step": 2851 + }, + { + "epoch": 0.3504115984764713, + "grad_norm": 1.396377648055341, + "learning_rate": 2.4625651110628395e-05, + "loss": 0.5676, + "step": 2852 + }, + { + "epoch": 0.3505344636933284, + "grad_norm": 1.2166167758489665, + "learning_rate": 2.4620716226766448e-05, + "loss": 0.6311, + "step": 2853 + }, + { + "epoch": 0.35065732891018553, + "grad_norm": 1.6128788597832702, + "learning_rate": 2.4615779573236145e-05, + "loss": 0.6247, + "step": 2854 + }, + { + "epoch": 0.35078019412704264, + "grad_norm": 1.6225956139742743, + "learning_rate": 2.461084115094555e-05, + "loss": 0.6058, + "step": 2855 + }, + { + "epoch": 0.35090305934389976, + "grad_norm": 1.4963772791581587, + "learning_rate": 2.4605900960803056e-05, + "loss": 0.5201, + "step": 2856 + }, + { + "epoch": 0.3510259245607569, + "grad_norm": 1.12768998961081, + "learning_rate": 2.4600959003717375e-05, + "loss": 0.5812, + "step": 2857 + }, + { + "epoch": 0.35114878977761393, + "grad_norm": 1.3804339369983554, + "learning_rate": 2.459601528059755e-05, + "loss": 0.6361, + "step": 2858 + }, + { + "epoch": 0.35127165499447105, + "grad_norm": 1.2246546127094418, + "learning_rate": 2.4591069792352946e-05, + "loss": 0.6152, + "step": 2859 + }, + { + "epoch": 0.35139452021132817, + "grad_norm": 1.4253142619730457, + "learning_rate": 2.4586122539893253e-05, + "loss": 0.6549, + "step": 2860 + }, + { + "epoch": 0.3515173854281853, + "grad_norm": 1.225413066005511, + "learning_rate": 2.458117352412849e-05, + "loss": 0.6153, + "step": 2861 + }, + { + "epoch": 0.3516402506450424, + "grad_norm": 1.3389064405566784, + "learning_rate": 2.4576222745968988e-05, + "loss": 0.6364, + "step": 2862 + }, + { + "epoch": 0.3517631158618995, + "grad_norm": 1.474764775908628, + "learning_rate": 2.457127020632542e-05, + "loss": 0.6861, + "step": 2863 + }, + { + "epoch": 0.3518859810787566, + "grad_norm": 1.6920927507187056, + "learning_rate": 2.4566315906108772e-05, + "loss": 0.7226, + "step": 2864 + }, + { + "epoch": 0.3520088462956137, + "grad_norm": 1.2757191849543568, + "learning_rate": 2.4561359846230346e-05, + "loss": 0.6685, + "step": 2865 + }, + { + "epoch": 0.3521317115124708, + "grad_norm": 1.3804077134245787, + "learning_rate": 2.455640202760179e-05, + "loss": 0.6421, + "step": 2866 + }, + { + "epoch": 0.3522545767293279, + "grad_norm": 1.4520652269838694, + "learning_rate": 2.4551442451135052e-05, + "loss": 0.5816, + "step": 2867 + }, + { + "epoch": 0.35237744194618503, + "grad_norm": 1.3418940934266081, + "learning_rate": 2.4546481117742422e-05, + "loss": 0.6212, + "step": 2868 + }, + { + "epoch": 0.35250030716304215, + "grad_norm": 1.3014164086891709, + "learning_rate": 2.4541518028336496e-05, + "loss": 0.6146, + "step": 2869 + }, + { + "epoch": 0.35262317237989926, + "grad_norm": 1.3869525726418286, + "learning_rate": 2.453655318383021e-05, + "loss": 0.5949, + "step": 2870 + }, + { + "epoch": 0.3527460375967564, + "grad_norm": 1.2684613030258876, + "learning_rate": 2.4531586585136817e-05, + "loss": 0.62, + "step": 2871 + }, + { + "epoch": 0.35286890281361344, + "grad_norm": 1.3347761756812664, + "learning_rate": 
2.452661823316988e-05, + "loss": 0.7649, + "step": 2872 + }, + { + "epoch": 0.35299176803047055, + "grad_norm": 1.3289611675761985, + "learning_rate": 2.4521648128843307e-05, + "loss": 0.6773, + "step": 2873 + }, + { + "epoch": 0.35311463324732767, + "grad_norm": 1.2934680513006693, + "learning_rate": 2.451667627307131e-05, + "loss": 0.678, + "step": 2874 + }, + { + "epoch": 0.3532374984641848, + "grad_norm": 1.3684301965698735, + "learning_rate": 2.4511702666768422e-05, + "loss": 0.6355, + "step": 2875 + }, + { + "epoch": 0.3533603636810419, + "grad_norm": 1.4260512793612574, + "learning_rate": 2.4506727310849525e-05, + "loss": 0.6897, + "step": 2876 + }, + { + "epoch": 0.353483228897899, + "grad_norm": 1.4609799844987639, + "learning_rate": 2.4501750206229785e-05, + "loss": 0.6503, + "step": 2877 + }, + { + "epoch": 0.35360609411475613, + "grad_norm": 1.1024603555101053, + "learning_rate": 2.449677135382472e-05, + "loss": 0.5527, + "step": 2878 + }, + { + "epoch": 0.35372895933161325, + "grad_norm": 1.3733017767330695, + "learning_rate": 2.4491790754550154e-05, + "loss": 0.6711, + "step": 2879 + }, + { + "epoch": 0.3538518245484703, + "grad_norm": 1.2414510695633374, + "learning_rate": 2.4486808409322234e-05, + "loss": 0.6173, + "step": 2880 + }, + { + "epoch": 0.3539746897653274, + "grad_norm": 1.3513278257752368, + "learning_rate": 2.448182431905743e-05, + "loss": 0.6844, + "step": 2881 + }, + { + "epoch": 0.35409755498218454, + "grad_norm": 1.2721215365495033, + "learning_rate": 2.4476838484672533e-05, + "loss": 0.5657, + "step": 2882 + }, + { + "epoch": 0.35422042019904165, + "grad_norm": 1.154448156448332, + "learning_rate": 2.4471850907084658e-05, + "loss": 0.6157, + "step": 2883 + }, + { + "epoch": 0.35434328541589877, + "grad_norm": 1.2364215540928962, + "learning_rate": 2.4466861587211233e-05, + "loss": 0.7401, + "step": 2884 + }, + { + "epoch": 0.3544661506327559, + "grad_norm": 1.2032761472677374, + "learning_rate": 2.4461870525970013e-05, + "loss": 0.5662, + "step": 2885 + }, + { + "epoch": 0.354589015849613, + "grad_norm": 1.192850377770801, + "learning_rate": 2.4456877724279076e-05, + "loss": 0.5529, + "step": 2886 + }, + { + "epoch": 0.35471188106647006, + "grad_norm": 1.329738599637935, + "learning_rate": 2.4451883183056812e-05, + "loss": 0.4972, + "step": 2887 + }, + { + "epoch": 0.3548347462833272, + "grad_norm": 1.5334138047542893, + "learning_rate": 2.4446886903221935e-05, + "loss": 0.6805, + "step": 2888 + }, + { + "epoch": 0.3549576115001843, + "grad_norm": 1.2477255899037958, + "learning_rate": 2.4441888885693473e-05, + "loss": 0.5577, + "step": 2889 + }, + { + "epoch": 0.3550804767170414, + "grad_norm": 1.957636902895662, + "learning_rate": 2.4436889131390788e-05, + "loss": 0.7928, + "step": 2890 + }, + { + "epoch": 0.3552033419338985, + "grad_norm": 1.330358276377526, + "learning_rate": 2.4431887641233543e-05, + "loss": 0.5986, + "step": 2891 + }, + { + "epoch": 0.35532620715075564, + "grad_norm": 1.3958942236873897, + "learning_rate": 2.442688441614174e-05, + "loss": 0.6695, + "step": 2892 + }, + { + "epoch": 0.35544907236761275, + "grad_norm": 1.496385869981552, + "learning_rate": 2.4421879457035678e-05, + "loss": 0.6896, + "step": 2893 + }, + { + "epoch": 0.3555719375844698, + "grad_norm": 1.394113431092259, + "learning_rate": 2.4416872764836e-05, + "loss": 0.6945, + "step": 2894 + }, + { + "epoch": 0.3556948028013269, + "grad_norm": 1.4309264333093454, + "learning_rate": 2.441186434046364e-05, + "loss": 0.7273, + "step": 2895 + }, + { + "epoch": 
0.35581766801818404, + "grad_norm": 1.5092040245508187, + "learning_rate": 2.4406854184839875e-05, + "loss": 0.6822, + "step": 2896 + }, + { + "epoch": 0.35594053323504116, + "grad_norm": 1.3183042497106792, + "learning_rate": 2.440184229888629e-05, + "loss": 0.6489, + "step": 2897 + }, + { + "epoch": 0.35606339845189827, + "grad_norm": 1.321890055763165, + "learning_rate": 2.4396828683524787e-05, + "loss": 0.5979, + "step": 2898 + }, + { + "epoch": 0.3561862636687554, + "grad_norm": 1.3612569963971601, + "learning_rate": 2.4391813339677588e-05, + "loss": 0.7658, + "step": 2899 + }, + { + "epoch": 0.3563091288856125, + "grad_norm": 1.1903335719172712, + "learning_rate": 2.4386796268267227e-05, + "loss": 0.6039, + "step": 2900 + }, + { + "epoch": 0.3564319941024696, + "grad_norm": 1.280922543928591, + "learning_rate": 2.438177747021658e-05, + "loss": 0.5993, + "step": 2901 + }, + { + "epoch": 0.3565548593193267, + "grad_norm": 1.3513351249191208, + "learning_rate": 2.43767569464488e-05, + "loss": 0.6161, + "step": 2902 + }, + { + "epoch": 0.3566777245361838, + "grad_norm": 1.4302856532851838, + "learning_rate": 2.4371734697887395e-05, + "loss": 0.6074, + "step": 2903 + }, + { + "epoch": 0.3568005897530409, + "grad_norm": 1.3046739645405683, + "learning_rate": 2.436671072545617e-05, + "loss": 0.6468, + "step": 2904 + }, + { + "epoch": 0.356923454969898, + "grad_norm": 1.2660216199311898, + "learning_rate": 2.436168503007925e-05, + "loss": 0.6288, + "step": 2905 + }, + { + "epoch": 0.35704632018675514, + "grad_norm": 1.3175804655602612, + "learning_rate": 2.435665761268108e-05, + "loss": 0.6142, + "step": 2906 + }, + { + "epoch": 0.35716918540361225, + "grad_norm": 1.1771289683037809, + "learning_rate": 2.4351628474186427e-05, + "loss": 0.6695, + "step": 2907 + }, + { + "epoch": 0.35729205062046937, + "grad_norm": 1.219462980416533, + "learning_rate": 2.434659761552036e-05, + "loss": 0.7147, + "step": 2908 + }, + { + "epoch": 0.35741491583732643, + "grad_norm": 1.384968856443894, + "learning_rate": 2.4341565037608278e-05, + "loss": 0.6194, + "step": 2909 + }, + { + "epoch": 0.35753778105418355, + "grad_norm": 1.272710186134619, + "learning_rate": 2.4336530741375892e-05, + "loss": 0.6559, + "step": 2910 + }, + { + "epoch": 0.35766064627104066, + "grad_norm": 1.3473198797702814, + "learning_rate": 2.4331494727749223e-05, + "loss": 0.6689, + "step": 2911 + }, + { + "epoch": 0.3577835114878978, + "grad_norm": 1.4032147496694274, + "learning_rate": 2.4326456997654617e-05, + "loss": 0.6509, + "step": 2912 + }, + { + "epoch": 0.3579063767047549, + "grad_norm": 1.3851314228605418, + "learning_rate": 2.4321417552018728e-05, + "loss": 0.6237, + "step": 2913 + }, + { + "epoch": 0.358029241921612, + "grad_norm": 1.6153704737539059, + "learning_rate": 2.4316376391768534e-05, + "loss": 0.7231, + "step": 2914 + }, + { + "epoch": 0.3581521071384691, + "grad_norm": 1.2618219796318941, + "learning_rate": 2.431133351783132e-05, + "loss": 0.6867, + "step": 2915 + }, + { + "epoch": 0.3582749723553262, + "grad_norm": 1.118042457696448, + "learning_rate": 2.430628893113469e-05, + "loss": 0.5658, + "step": 2916 + }, + { + "epoch": 0.3583978375721833, + "grad_norm": 1.0592276223105983, + "learning_rate": 2.430124263260657e-05, + "loss": 0.5983, + "step": 2917 + }, + { + "epoch": 0.3585207027890404, + "grad_norm": 1.251775341770502, + "learning_rate": 2.4296194623175187e-05, + "loss": 0.7416, + "step": 2918 + }, + { + "epoch": 0.35864356800589753, + "grad_norm": 1.3595690513311904, + "learning_rate": 
2.4291144903769087e-05, + "loss": 0.5997, + "step": 2919 + }, + { + "epoch": 0.35876643322275464, + "grad_norm": 1.4134084331513677, + "learning_rate": 2.4286093475317145e-05, + "loss": 0.7977, + "step": 2920 + }, + { + "epoch": 0.35888929843961176, + "grad_norm": 1.3441908648939007, + "learning_rate": 2.428104033874852e-05, + "loss": 0.5951, + "step": 2921 + }, + { + "epoch": 0.3590121636564689, + "grad_norm": 1.3877739937126663, + "learning_rate": 2.4275985494992724e-05, + "loss": 0.6889, + "step": 2922 + }, + { + "epoch": 0.35913502887332593, + "grad_norm": 1.606485277003862, + "learning_rate": 2.4270928944979546e-05, + "loss": 0.7592, + "step": 2923 + }, + { + "epoch": 0.35925789409018305, + "grad_norm": 1.44096581767626, + "learning_rate": 2.4265870689639113e-05, + "loss": 0.665, + "step": 2924 + }, + { + "epoch": 0.35938075930704017, + "grad_norm": 1.573327100045176, + "learning_rate": 2.4260810729901857e-05, + "loss": 0.7232, + "step": 2925 + }, + { + "epoch": 0.3595036245238973, + "grad_norm": 1.5303182042207017, + "learning_rate": 2.4255749066698535e-05, + "loss": 0.674, + "step": 2926 + }, + { + "epoch": 0.3596264897407544, + "grad_norm": 1.5431198218500666, + "learning_rate": 2.4250685700960188e-05, + "loss": 0.739, + "step": 2927 + }, + { + "epoch": 0.3597493549576115, + "grad_norm": 1.2479839826703332, + "learning_rate": 2.4245620633618207e-05, + "loss": 0.5594, + "step": 2928 + }, + { + "epoch": 0.3598722201744686, + "grad_norm": 1.1408334736336936, + "learning_rate": 2.424055386560426e-05, + "loss": 0.6992, + "step": 2929 + }, + { + "epoch": 0.35999508539132574, + "grad_norm": 1.3671484586437301, + "learning_rate": 2.4235485397850363e-05, + "loss": 0.6673, + "step": 2930 + }, + { + "epoch": 0.3601179506081828, + "grad_norm": 1.2700758259448937, + "learning_rate": 2.4230415231288823e-05, + "loss": 0.5556, + "step": 2931 + }, + { + "epoch": 0.3602408158250399, + "grad_norm": 1.3609185252187523, + "learning_rate": 2.422534336685226e-05, + "loss": 0.6092, + "step": 2932 + }, + { + "epoch": 0.36036368104189703, + "grad_norm": 1.408986878211817, + "learning_rate": 2.4220269805473612e-05, + "loss": 0.543, + "step": 2933 + }, + { + "epoch": 0.36048654625875415, + "grad_norm": 1.256372017792412, + "learning_rate": 2.421519454808613e-05, + "loss": 0.6098, + "step": 2934 + }, + { + "epoch": 0.36060941147561126, + "grad_norm": 1.2898994775773198, + "learning_rate": 2.4210117595623377e-05, + "loss": 0.6233, + "step": 2935 + }, + { + "epoch": 0.3607322766924684, + "grad_norm": 1.5459042565270145, + "learning_rate": 2.4205038949019218e-05, + "loss": 0.5792, + "step": 2936 + }, + { + "epoch": 0.3608551419093255, + "grad_norm": 1.432206799448882, + "learning_rate": 2.419995860920784e-05, + "loss": 0.5883, + "step": 2937 + }, + { + "epoch": 0.36097800712618255, + "grad_norm": 1.5181332185322032, + "learning_rate": 2.4194876577123746e-05, + "loss": 0.6576, + "step": 2938 + }, + { + "epoch": 0.36110087234303967, + "grad_norm": 1.4397044452009289, + "learning_rate": 2.4189792853701734e-05, + "loss": 0.6617, + "step": 2939 + }, + { + "epoch": 0.3612237375598968, + "grad_norm": 1.4707158604208062, + "learning_rate": 2.418470743987692e-05, + "loss": 0.5681, + "step": 2940 + }, + { + "epoch": 0.3613466027767539, + "grad_norm": 1.2968890774401758, + "learning_rate": 2.4179620336584743e-05, + "loss": 0.5357, + "step": 2941 + }, + { + "epoch": 0.361469467993611, + "grad_norm": 1.2899064815993366, + "learning_rate": 2.417453154476093e-05, + "loss": 0.7098, + "step": 2942 + }, + { + "epoch": 
0.36159233321046813, + "grad_norm": 1.2828699388813116, + "learning_rate": 2.4169441065341546e-05, + "loss": 0.5733, + "step": 2943 + }, + { + "epoch": 0.36171519842732525, + "grad_norm": 1.5338316128657483, + "learning_rate": 2.4164348899262936e-05, + "loss": 0.6749, + "step": 2944 + }, + { + "epoch": 0.3618380636441823, + "grad_norm": 1.3809076504241335, + "learning_rate": 2.4159255047461785e-05, + "loss": 0.6523, + "step": 2945 + }, + { + "epoch": 0.3619609288610394, + "grad_norm": 1.2148841553979777, + "learning_rate": 2.4154159510875065e-05, + "loss": 0.6656, + "step": 2946 + }, + { + "epoch": 0.36208379407789654, + "grad_norm": 1.3313564477265711, + "learning_rate": 2.414906229044007e-05, + "loss": 0.6964, + "step": 2947 + }, + { + "epoch": 0.36220665929475365, + "grad_norm": 1.3712767120852138, + "learning_rate": 2.4143963387094403e-05, + "loss": 0.5847, + "step": 2948 + }, + { + "epoch": 0.36232952451161077, + "grad_norm": 1.1313410495333536, + "learning_rate": 2.4138862801775973e-05, + "loss": 0.5773, + "step": 2949 + }, + { + "epoch": 0.3624523897284679, + "grad_norm": 1.7086468731394118, + "learning_rate": 2.4133760535422994e-05, + "loss": 0.6471, + "step": 2950 + }, + { + "epoch": 0.362575254945325, + "grad_norm": 1.2752716393839483, + "learning_rate": 2.4128656588974e-05, + "loss": 0.6765, + "step": 2951 + }, + { + "epoch": 0.3626981201621821, + "grad_norm": 1.1763211871191426, + "learning_rate": 2.4123550963367824e-05, + "loss": 0.6525, + "step": 2952 + }, + { + "epoch": 0.3628209853790392, + "grad_norm": 1.195326708457016, + "learning_rate": 2.411844365954362e-05, + "loss": 0.6604, + "step": 2953 + }, + { + "epoch": 0.3629438505958963, + "grad_norm": 1.2739677536622915, + "learning_rate": 2.4113334678440842e-05, + "loss": 0.6038, + "step": 2954 + }, + { + "epoch": 0.3630667158127534, + "grad_norm": 1.3760507222020615, + "learning_rate": 2.410822402099925e-05, + "loss": 0.6635, + "step": 2955 + }, + { + "epoch": 0.3631895810296105, + "grad_norm": 1.3108169371628935, + "learning_rate": 2.4103111688158917e-05, + "loss": 0.6499, + "step": 2956 + }, + { + "epoch": 0.36331244624646764, + "grad_norm": 1.6731838672088424, + "learning_rate": 2.4097997680860232e-05, + "loss": 0.6814, + "step": 2957 + }, + { + "epoch": 0.36343531146332475, + "grad_norm": 1.7087002669456355, + "learning_rate": 2.4092882000043868e-05, + "loss": 0.694, + "step": 2958 + }, + { + "epoch": 0.36355817668018187, + "grad_norm": 1.432029648065579, + "learning_rate": 2.408776464665083e-05, + "loss": 0.7091, + "step": 2959 + }, + { + "epoch": 0.3636810418970389, + "grad_norm": 1.2855320506796832, + "learning_rate": 2.4082645621622425e-05, + "loss": 0.6262, + "step": 2960 + }, + { + "epoch": 0.36380390711389604, + "grad_norm": 1.0028777474356807, + "learning_rate": 2.407752492590026e-05, + "loss": 0.6286, + "step": 2961 + }, + { + "epoch": 0.36392677233075316, + "grad_norm": 1.1039105805897944, + "learning_rate": 2.4072402560426253e-05, + "loss": 0.6662, + "step": 2962 + }, + { + "epoch": 0.36404963754761027, + "grad_norm": 1.3826209509763898, + "learning_rate": 2.4067278526142635e-05, + "loss": 0.7074, + "step": 2963 + }, + { + "epoch": 0.3641725027644674, + "grad_norm": 1.5330846884072493, + "learning_rate": 2.4062152823991933e-05, + "loss": 0.7138, + "step": 2964 + }, + { + "epoch": 0.3642953679813245, + "grad_norm": 1.3602726894624495, + "learning_rate": 2.405702545491699e-05, + "loss": 0.5689, + "step": 2965 + }, + { + "epoch": 0.3644182331981816, + "grad_norm": 1.511557408225221, + "learning_rate": 
2.405189641986095e-05, + "loss": 0.6491, + "step": 2966 + }, + { + "epoch": 0.3645410984150387, + "grad_norm": 1.2553907337805934, + "learning_rate": 2.404676571976727e-05, + "loss": 0.599, + "step": 2967 + }, + { + "epoch": 0.3646639636318958, + "grad_norm": 1.2398584654944411, + "learning_rate": 2.4041633355579705e-05, + "loss": 0.6342, + "step": 2968 + }, + { + "epoch": 0.3647868288487529, + "grad_norm": 1.4534434429812835, + "learning_rate": 2.403649932824232e-05, + "loss": 0.7167, + "step": 2969 + }, + { + "epoch": 0.36490969406561, + "grad_norm": 1.4247760214121539, + "learning_rate": 2.403136363869949e-05, + "loss": 0.7409, + "step": 2970 + }, + { + "epoch": 0.36503255928246714, + "grad_norm": 1.375507382751723, + "learning_rate": 2.4026226287895885e-05, + "loss": 0.6425, + "step": 2971 + }, + { + "epoch": 0.36515542449932425, + "grad_norm": 1.3422094284789758, + "learning_rate": 2.4021087276776493e-05, + "loss": 0.6277, + "step": 2972 + }, + { + "epoch": 0.36527828971618137, + "grad_norm": 1.1783589896325022, + "learning_rate": 2.40159466062866e-05, + "loss": 0.6096, + "step": 2973 + }, + { + "epoch": 0.36540115493303843, + "grad_norm": 1.4451769549505467, + "learning_rate": 2.40108042773718e-05, + "loss": 0.5913, + "step": 2974 + }, + { + "epoch": 0.36552402014989555, + "grad_norm": 1.3626891086730504, + "learning_rate": 2.400566029097799e-05, + "loss": 0.6079, + "step": 2975 + }, + { + "epoch": 0.36564688536675266, + "grad_norm": 1.2035142264046366, + "learning_rate": 2.4000514648051372e-05, + "loss": 0.6635, + "step": 2976 + }, + { + "epoch": 0.3657697505836098, + "grad_norm": 1.248744846346287, + "learning_rate": 2.3995367349538456e-05, + "loss": 0.5929, + "step": 2977 + }, + { + "epoch": 0.3658926158004669, + "grad_norm": 1.280535805746773, + "learning_rate": 2.399021839638605e-05, + "loss": 0.6486, + "step": 2978 + }, + { + "epoch": 0.366015481017324, + "grad_norm": 1.2981241659885376, + "learning_rate": 2.3985067789541285e-05, + "loss": 0.5422, + "step": 2979 + }, + { + "epoch": 0.3661383462341811, + "grad_norm": 1.2030127977946137, + "learning_rate": 2.3979915529951562e-05, + "loss": 0.5277, + "step": 2980 + }, + { + "epoch": 0.36626121145103824, + "grad_norm": 1.4095033341245589, + "learning_rate": 2.3974761618564613e-05, + "loss": 0.6332, + "step": 2981 + }, + { + "epoch": 0.3663840766678953, + "grad_norm": 1.446787439605187, + "learning_rate": 2.396960605632847e-05, + "loss": 0.6148, + "step": 2982 + }, + { + "epoch": 0.3665069418847524, + "grad_norm": 1.2275521134578737, + "learning_rate": 2.396444884419146e-05, + "loss": 0.5814, + "step": 2983 + }, + { + "epoch": 0.36662980710160953, + "grad_norm": 1.1208233633738671, + "learning_rate": 2.3959289983102223e-05, + "loss": 0.5367, + "step": 2984 + }, + { + "epoch": 0.36675267231846664, + "grad_norm": 1.3502268238323631, + "learning_rate": 2.39541294740097e-05, + "loss": 0.6142, + "step": 2985 + }, + { + "epoch": 0.36687553753532376, + "grad_norm": 1.394303209782231, + "learning_rate": 2.3948967317863124e-05, + "loss": 0.6272, + "step": 2986 + }, + { + "epoch": 0.3669984027521809, + "grad_norm": 1.4311177426259918, + "learning_rate": 2.3943803515612053e-05, + "loss": 0.6935, + "step": 2987 + }, + { + "epoch": 0.367121267969038, + "grad_norm": 1.2891582893564986, + "learning_rate": 2.393863806820632e-05, + "loss": 0.6471, + "step": 2988 + }, + { + "epoch": 0.36724413318589505, + "grad_norm": 1.256424536224272, + "learning_rate": 2.3933470976596088e-05, + "loss": 0.6356, + "step": 2989 + }, + { + "epoch": 
0.36736699840275217, + "grad_norm": 1.415806151627246, + "learning_rate": 2.3928302241731807e-05, + "loss": 0.623, + "step": 2990 + }, + { + "epoch": 0.3674898636196093, + "grad_norm": 1.4201794001200552, + "learning_rate": 2.3923131864564228e-05, + "loss": 0.5906, + "step": 2991 + }, + { + "epoch": 0.3676127288364664, + "grad_norm": 1.3865085153213939, + "learning_rate": 2.391795984604441e-05, + "loss": 0.5984, + "step": 2992 + }, + { + "epoch": 0.3677355940533235, + "grad_norm": 1.2942271545687993, + "learning_rate": 2.3912786187123714e-05, + "loss": 0.6193, + "step": 2993 + }, + { + "epoch": 0.3678584592701806, + "grad_norm": 1.439224968515976, + "learning_rate": 2.39076108887538e-05, + "loss": 0.692, + "step": 2994 + }, + { + "epoch": 0.36798132448703774, + "grad_norm": 1.1420610058825036, + "learning_rate": 2.3902433951886634e-05, + "loss": 0.6487, + "step": 2995 + }, + { + "epoch": 0.3681041897038948, + "grad_norm": 1.1783292561186094, + "learning_rate": 2.3897255377474472e-05, + "loss": 0.5752, + "step": 2996 + }, + { + "epoch": 0.3682270549207519, + "grad_norm": 1.5374626570368481, + "learning_rate": 2.389207516646989e-05, + "loss": 0.616, + "step": 2997 + }, + { + "epoch": 0.36834992013760903, + "grad_norm": 1.426290023423719, + "learning_rate": 2.3886893319825747e-05, + "loss": 0.6277, + "step": 2998 + }, + { + "epoch": 0.36847278535446615, + "grad_norm": 1.1938918147971573, + "learning_rate": 2.3881709838495208e-05, + "loss": 0.5355, + "step": 2999 + }, + { + "epoch": 0.36859565057132326, + "grad_norm": 1.7837989514549788, + "learning_rate": 2.3876524723431748e-05, + "loss": 0.6743, + "step": 3000 + }, + { + "epoch": 0.3687185157881804, + "grad_norm": 1.3672478651261157, + "learning_rate": 2.3871337975589124e-05, + "loss": 0.6594, + "step": 3001 + }, + { + "epoch": 0.3688413810050375, + "grad_norm": 1.1484836477217446, + "learning_rate": 2.386614959592142e-05, + "loss": 0.6646, + "step": 3002 + }, + { + "epoch": 0.36896424622189455, + "grad_norm": 1.1578454706796342, + "learning_rate": 2.3860959585382995e-05, + "loss": 0.6339, + "step": 3003 + }, + { + "epoch": 0.36908711143875167, + "grad_norm": 1.7868597088533542, + "learning_rate": 2.385576794492852e-05, + "loss": 0.808, + "step": 3004 + }, + { + "epoch": 0.3692099766556088, + "grad_norm": 1.4573882378870107, + "learning_rate": 2.385057467551296e-05, + "loss": 0.7382, + "step": 3005 + }, + { + "epoch": 0.3693328418724659, + "grad_norm": 1.324977226044546, + "learning_rate": 2.3845379778091587e-05, + "loss": 0.5496, + "step": 3006 + }, + { + "epoch": 0.369455707089323, + "grad_norm": 1.3455956070702377, + "learning_rate": 2.384018325361997e-05, + "loss": 0.5725, + "step": 3007 + }, + { + "epoch": 0.36957857230618013, + "grad_norm": 1.3216042995034352, + "learning_rate": 2.3834985103053976e-05, + "loss": 0.5868, + "step": 3008 + }, + { + "epoch": 0.36970143752303725, + "grad_norm": 1.4314494389024397, + "learning_rate": 2.3829785327349766e-05, + "loss": 0.6431, + "step": 3009 + }, + { + "epoch": 0.36982430273989436, + "grad_norm": 1.6180545698747584, + "learning_rate": 2.382458392746381e-05, + "loss": 0.6689, + "step": 3010 + }, + { + "epoch": 0.3699471679567514, + "grad_norm": 1.2795557983570238, + "learning_rate": 2.381938090435287e-05, + "loss": 0.6213, + "step": 3011 + }, + { + "epoch": 0.37007003317360854, + "grad_norm": 1.297917871196048, + "learning_rate": 2.3814176258974006e-05, + "loss": 0.6315, + "step": 3012 + }, + { + "epoch": 0.37019289839046565, + "grad_norm": 1.1702951026882769, + "learning_rate": 
2.380896999228458e-05, + "loss": 0.6575, + "step": 3013 + }, + { + "epoch": 0.37031576360732277, + "grad_norm": 1.3322508595526688, + "learning_rate": 2.3803762105242255e-05, + "loss": 0.6746, + "step": 3014 + }, + { + "epoch": 0.3704386288241799, + "grad_norm": 1.180468781992892, + "learning_rate": 2.3798552598804987e-05, + "loss": 0.6285, + "step": 3015 + }, + { + "epoch": 0.370561494041037, + "grad_norm": 1.9441644424162583, + "learning_rate": 2.3793341473931024e-05, + "loss": 0.6077, + "step": 3016 + }, + { + "epoch": 0.3706843592578941, + "grad_norm": 1.6465345064959926, + "learning_rate": 2.3788128731578928e-05, + "loss": 0.6587, + "step": 3017 + }, + { + "epoch": 0.3708072244747512, + "grad_norm": 1.2172969011676469, + "learning_rate": 2.378291437270754e-05, + "loss": 0.5475, + "step": 3018 + }, + { + "epoch": 0.3709300896916083, + "grad_norm": 1.3292162659881221, + "learning_rate": 2.377769839827602e-05, + "loss": 0.5892, + "step": 3019 + }, + { + "epoch": 0.3710529549084654, + "grad_norm": 1.3651005055379835, + "learning_rate": 2.3772480809243797e-05, + "loss": 0.5468, + "step": 3020 + }, + { + "epoch": 0.3711758201253225, + "grad_norm": 1.3457942682457644, + "learning_rate": 2.3767261606570626e-05, + "loss": 0.671, + "step": 3021 + }, + { + "epoch": 0.37129868534217964, + "grad_norm": 1.451509516742317, + "learning_rate": 2.376204079121654e-05, + "loss": 0.7284, + "step": 3022 + }, + { + "epoch": 0.37142155055903675, + "grad_norm": 1.5569670431871967, + "learning_rate": 2.375681836414187e-05, + "loss": 0.5742, + "step": 3023 + }, + { + "epoch": 0.37154441577589387, + "grad_norm": 1.0818502149786509, + "learning_rate": 2.3751594326307254e-05, + "loss": 0.6219, + "step": 3024 + }, + { + "epoch": 0.3716672809927509, + "grad_norm": 1.4427557366131711, + "learning_rate": 2.374636867867362e-05, + "loss": 0.6509, + "step": 3025 + }, + { + "epoch": 0.37179014620960804, + "grad_norm": 1.236146525683267, + "learning_rate": 2.3741141422202188e-05, + "loss": 0.7282, + "step": 3026 + }, + { + "epoch": 0.37191301142646516, + "grad_norm": 1.26711501236808, + "learning_rate": 2.373591255785448e-05, + "loss": 0.5392, + "step": 3027 + }, + { + "epoch": 0.37203587664332227, + "grad_norm": 1.3498801682043113, + "learning_rate": 2.373068208659231e-05, + "loss": 0.5354, + "step": 3028 + }, + { + "epoch": 0.3721587418601794, + "grad_norm": 1.2223077728708003, + "learning_rate": 2.3725450009377795e-05, + "loss": 0.5359, + "step": 3029 + }, + { + "epoch": 0.3722816070770365, + "grad_norm": 1.3123842792988207, + "learning_rate": 2.3720216327173327e-05, + "loss": 0.6811, + "step": 3030 + }, + { + "epoch": 0.3724044722938936, + "grad_norm": 2.0093199041201646, + "learning_rate": 2.371498104094163e-05, + "loss": 0.6532, + "step": 3031 + }, + { + "epoch": 0.37252733751075073, + "grad_norm": 1.3192358059017948, + "learning_rate": 2.3709744151645686e-05, + "loss": 0.6539, + "step": 3032 + }, + { + "epoch": 0.3726502027276078, + "grad_norm": 1.2980217591623473, + "learning_rate": 2.3704505660248786e-05, + "loss": 0.6419, + "step": 3033 + }, + { + "epoch": 0.3727730679444649, + "grad_norm": 1.2422002611106016, + "learning_rate": 2.3699265567714522e-05, + "loss": 0.6063, + "step": 3034 + }, + { + "epoch": 0.372895933161322, + "grad_norm": 1.4203449476293402, + "learning_rate": 2.3694023875006773e-05, + "loss": 0.5732, + "step": 3035 + }, + { + "epoch": 0.37301879837817914, + "grad_norm": 1.3294561783584766, + "learning_rate": 2.368878058308972e-05, + "loss": 0.7252, + "step": 3036 + }, + { + "epoch": 
0.37314166359503625, + "grad_norm": 1.4273031127727738, + "learning_rate": 2.368353569292782e-05, + "loss": 0.5938, + "step": 3037 + }, + { + "epoch": 0.37326452881189337, + "grad_norm": 1.5597780367656804, + "learning_rate": 2.367828920548585e-05, + "loss": 0.7185, + "step": 3038 + }, + { + "epoch": 0.3733873940287505, + "grad_norm": 1.167696785599463, + "learning_rate": 2.3673041121728857e-05, + "loss": 0.6144, + "step": 3039 + }, + { + "epoch": 0.37351025924560755, + "grad_norm": 1.7029812578899834, + "learning_rate": 2.36677914426222e-05, + "loss": 0.7357, + "step": 3040 + }, + { + "epoch": 0.37363312446246466, + "grad_norm": 1.2561781970018855, + "learning_rate": 2.3662540169131516e-05, + "loss": 0.5892, + "step": 3041 + }, + { + "epoch": 0.3737559896793218, + "grad_norm": 1.310506619929962, + "learning_rate": 2.365728730222275e-05, + "loss": 0.5368, + "step": 3042 + }, + { + "epoch": 0.3738788548961789, + "grad_norm": 1.3557688020031782, + "learning_rate": 2.3652032842862127e-05, + "loss": 0.6249, + "step": 3043 + }, + { + "epoch": 0.374001720113036, + "grad_norm": 1.233971986097292, + "learning_rate": 2.3646776792016175e-05, + "loss": 0.6428, + "step": 3044 + }, + { + "epoch": 0.3741245853298931, + "grad_norm": 1.4216214114363233, + "learning_rate": 2.3641519150651707e-05, + "loss": 0.4976, + "step": 3045 + }, + { + "epoch": 0.37424745054675024, + "grad_norm": 1.2783146921697044, + "learning_rate": 2.3636259919735835e-05, + "loss": 0.6205, + "step": 3046 + }, + { + "epoch": 0.3743703157636073, + "grad_norm": 1.4002410177787372, + "learning_rate": 2.3630999100235956e-05, + "loss": 0.6286, + "step": 3047 + }, + { + "epoch": 0.3744931809804644, + "grad_norm": 1.2449844319945587, + "learning_rate": 2.362573669311977e-05, + "loss": 0.7029, + "step": 3048 + }, + { + "epoch": 0.37461604619732153, + "grad_norm": 1.3194452945407886, + "learning_rate": 2.3620472699355255e-05, + "loss": 0.5959, + "step": 3049 + }, + { + "epoch": 0.37473891141417864, + "grad_norm": 1.1423055632959906, + "learning_rate": 2.3615207119910693e-05, + "loss": 0.5663, + "step": 3050 + }, + { + "epoch": 0.37486177663103576, + "grad_norm": 1.3299685589697985, + "learning_rate": 2.3609939955754656e-05, + "loss": 0.6749, + "step": 3051 + }, + { + "epoch": 0.3749846418478929, + "grad_norm": 1.615816441540961, + "learning_rate": 2.3604671207856002e-05, + "loss": 0.6793, + "step": 3052 + }, + { + "epoch": 0.37510750706475, + "grad_norm": 1.3993187650668584, + "learning_rate": 2.359940087718388e-05, + "loss": 0.5887, + "step": 3053 + }, + { + "epoch": 0.37523037228160705, + "grad_norm": 1.2375670840040132, + "learning_rate": 2.3594128964707736e-05, + "loss": 0.5319, + "step": 3054 + }, + { + "epoch": 0.37535323749846417, + "grad_norm": 2.0808246177826173, + "learning_rate": 2.3588855471397305e-05, + "loss": 0.6731, + "step": 3055 + }, + { + "epoch": 0.3754761027153213, + "grad_norm": 1.710629895778046, + "learning_rate": 2.358358039822261e-05, + "loss": 0.6559, + "step": 3056 + }, + { + "epoch": 0.3755989679321784, + "grad_norm": 1.5226938040252462, + "learning_rate": 2.357830374615397e-05, + "loss": 0.5699, + "step": 3057 + }, + { + "epoch": 0.3757218331490355, + "grad_norm": 1.122065225903869, + "learning_rate": 2.3573025516161977e-05, + "loss": 0.6112, + "step": 3058 + }, + { + "epoch": 0.3758446983658926, + "grad_norm": 1.2831878520478954, + "learning_rate": 2.356774570921755e-05, + "loss": 0.6499, + "step": 3059 + }, + { + "epoch": 0.37596756358274974, + "grad_norm": 2.132641348444403, + "learning_rate": 
2.3562464326291862e-05, + "loss": 0.7971, + "step": 3060 + }, + { + "epoch": 0.37609042879960686, + "grad_norm": 1.3621468000889878, + "learning_rate": 2.355718136835639e-05, + "loss": 0.6989, + "step": 3061 + }, + { + "epoch": 0.3762132940164639, + "grad_norm": 1.355987035450686, + "learning_rate": 2.35518968363829e-05, + "loss": 0.68, + "step": 3062 + }, + { + "epoch": 0.37633615923332103, + "grad_norm": 1.4127390332582228, + "learning_rate": 2.3546610731343446e-05, + "loss": 0.7668, + "step": 3063 + }, + { + "epoch": 0.37645902445017815, + "grad_norm": 1.3099907780498028, + "learning_rate": 2.3541323054210374e-05, + "loss": 0.6351, + "step": 3064 + }, + { + "epoch": 0.37658188966703526, + "grad_norm": 1.586507466621477, + "learning_rate": 2.353603380595633e-05, + "loss": 0.6144, + "step": 3065 + }, + { + "epoch": 0.3767047548838924, + "grad_norm": 1.3377084762212375, + "learning_rate": 2.353074298755421e-05, + "loss": 0.7024, + "step": 3066 + }, + { + "epoch": 0.3768276201007495, + "grad_norm": 1.7010983677916929, + "learning_rate": 2.352545059997725e-05, + "loss": 0.6986, + "step": 3067 + }, + { + "epoch": 0.3769504853176066, + "grad_norm": 1.4342961407259711, + "learning_rate": 2.352015664419894e-05, + "loss": 0.5826, + "step": 3068 + }, + { + "epoch": 0.37707335053446367, + "grad_norm": 1.3182187492920756, + "learning_rate": 2.3514861121193068e-05, + "loss": 0.6134, + "step": 3069 + }, + { + "epoch": 0.3771962157513208, + "grad_norm": 1.2232995188566826, + "learning_rate": 2.3509564031933716e-05, + "loss": 0.5831, + "step": 3070 + }, + { + "epoch": 0.3773190809681779, + "grad_norm": 1.1279329391816846, + "learning_rate": 2.3504265377395244e-05, + "loss": 0.5241, + "step": 3071 + }, + { + "epoch": 0.377441946185035, + "grad_norm": 1.3892540767416506, + "learning_rate": 2.349896515855231e-05, + "loss": 0.5687, + "step": 3072 + }, + { + "epoch": 0.37756481140189213, + "grad_norm": 1.3409968800249306, + "learning_rate": 2.3493663376379853e-05, + "loss": 0.6251, + "step": 3073 + }, + { + "epoch": 0.37768767661874925, + "grad_norm": 1.3243336003543786, + "learning_rate": 2.3488360031853102e-05, + "loss": 0.5823, + "step": 3074 + }, + { + "epoch": 0.37781054183560636, + "grad_norm": 1.2934130300753455, + "learning_rate": 2.348305512594757e-05, + "loss": 0.6706, + "step": 3075 + }, + { + "epoch": 0.3779334070524634, + "grad_norm": 1.4917946038223329, + "learning_rate": 2.3477748659639063e-05, + "loss": 0.5832, + "step": 3076 + }, + { + "epoch": 0.37805627226932054, + "grad_norm": 1.3152077175788883, + "learning_rate": 2.347244063390367e-05, + "loss": 0.5935, + "step": 3077 + }, + { + "epoch": 0.37817913748617765, + "grad_norm": 1.1792844260031874, + "learning_rate": 2.346713104971777e-05, + "loss": 0.5363, + "step": 3078 + }, + { + "epoch": 0.37830200270303477, + "grad_norm": 1.6073602977028432, + "learning_rate": 2.3461819908058024e-05, + "loss": 0.6854, + "step": 3079 + }, + { + "epoch": 0.3784248679198919, + "grad_norm": 1.2454186792565136, + "learning_rate": 2.3456507209901382e-05, + "loss": 0.6403, + "step": 3080 + }, + { + "epoch": 0.378547733136749, + "grad_norm": 1.4465885678285888, + "learning_rate": 2.345119295622508e-05, + "loss": 0.7558, + "step": 3081 + }, + { + "epoch": 0.3786705983536061, + "grad_norm": 1.3888426014082174, + "learning_rate": 2.3445877148006643e-05, + "loss": 0.6638, + "step": 3082 + }, + { + "epoch": 0.37879346357046323, + "grad_norm": 1.259527974002533, + "learning_rate": 2.3440559786223878e-05, + "loss": 0.6393, + "step": 3083 + }, + { + "epoch": 
0.3789163287873203, + "grad_norm": 1.315487635917726, + "learning_rate": 2.343524087185488e-05, + "loss": 0.6973, + "step": 3084 + }, + { + "epoch": 0.3790391940041774, + "grad_norm": 1.2558828981209915, + "learning_rate": 2.3429920405878024e-05, + "loss": 0.5364, + "step": 3085 + }, + { + "epoch": 0.3791620592210345, + "grad_norm": 1.197453458943176, + "learning_rate": 2.3424598389271986e-05, + "loss": 0.6815, + "step": 3086 + }, + { + "epoch": 0.37928492443789164, + "grad_norm": 1.3414207995986853, + "learning_rate": 2.3419274823015704e-05, + "loss": 0.5766, + "step": 3087 + }, + { + "epoch": 0.37940778965474875, + "grad_norm": 1.228127593264913, + "learning_rate": 2.3413949708088424e-05, + "loss": 0.5907, + "step": 3088 + }, + { + "epoch": 0.37953065487160587, + "grad_norm": 1.4497319073567136, + "learning_rate": 2.3408623045469658e-05, + "loss": 0.6258, + "step": 3089 + }, + { + "epoch": 0.379653520088463, + "grad_norm": 1.2950724658796482, + "learning_rate": 2.3403294836139216e-05, + "loss": 0.6105, + "step": 3090 + }, + { + "epoch": 0.37977638530532004, + "grad_norm": 1.4936394491254836, + "learning_rate": 2.339796508107718e-05, + "loss": 0.6613, + "step": 3091 + }, + { + "epoch": 0.37989925052217716, + "grad_norm": 1.2110650834823302, + "learning_rate": 2.339263378126394e-05, + "loss": 0.6122, + "step": 3092 + }, + { + "epoch": 0.38002211573903427, + "grad_norm": 1.5055783337702415, + "learning_rate": 2.338730093768014e-05, + "loss": 0.6648, + "step": 3093 + }, + { + "epoch": 0.3801449809558914, + "grad_norm": 1.5097882371525442, + "learning_rate": 2.338196655130673e-05, + "loss": 0.5912, + "step": 3094 + }, + { + "epoch": 0.3802678461727485, + "grad_norm": 1.398334913808832, + "learning_rate": 2.3376630623124925e-05, + "loss": 0.6546, + "step": 3095 + }, + { + "epoch": 0.3803907113896056, + "grad_norm": 1.1711902684241875, + "learning_rate": 2.3371293154116244e-05, + "loss": 0.6672, + "step": 3096 + }, + { + "epoch": 0.38051357660646273, + "grad_norm": 1.3888086838523215, + "learning_rate": 2.3365954145262478e-05, + "loss": 0.6172, + "step": 3097 + }, + { + "epoch": 0.3806364418233198, + "grad_norm": 1.4622026132767225, + "learning_rate": 2.3360613597545698e-05, + "loss": 0.5806, + "step": 3098 + }, + { + "epoch": 0.3807593070401769, + "grad_norm": 1.838457863455748, + "learning_rate": 2.3355271511948272e-05, + "loss": 0.6677, + "step": 3099 + }, + { + "epoch": 0.380882172257034, + "grad_norm": 1.407937002148143, + "learning_rate": 2.3349927889452834e-05, + "loss": 0.5639, + "step": 3100 + }, + { + "epoch": 0.38100503747389114, + "grad_norm": 1.5319887365660656, + "learning_rate": 2.3344582731042313e-05, + "loss": 0.5934, + "step": 3101 + }, + { + "epoch": 0.38112790269074825, + "grad_norm": 1.2921245591606374, + "learning_rate": 2.3339236037699915e-05, + "loss": 0.7351, + "step": 3102 + }, + { + "epoch": 0.38125076790760537, + "grad_norm": 1.3009038843686636, + "learning_rate": 2.333388781040913e-05, + "loss": 0.559, + "step": 3103 + }, + { + "epoch": 0.3813736331244625, + "grad_norm": 1.2603536621092253, + "learning_rate": 2.3328538050153735e-05, + "loss": 0.6757, + "step": 3104 + }, + { + "epoch": 0.38149649834131955, + "grad_norm": 1.1044198671026118, + "learning_rate": 2.3323186757917772e-05, + "loss": 0.6132, + "step": 3105 + }, + { + "epoch": 0.38161936355817666, + "grad_norm": 1.213749743677481, + "learning_rate": 2.3317833934685583e-05, + "loss": 0.569, + "step": 3106 + }, + { + "epoch": 0.3817422287750338, + "grad_norm": 1.3001519551364638, + "learning_rate": 
2.3312479581441786e-05, + "loss": 0.6609, + "step": 3107 + }, + { + "epoch": 0.3818650939918909, + "grad_norm": 1.4414410588134763, + "learning_rate": 2.3307123699171277e-05, + "loss": 0.5635, + "step": 3108 + }, + { + "epoch": 0.381987959208748, + "grad_norm": 1.4272943020021183, + "learning_rate": 2.330176628885924e-05, + "loss": 0.7128, + "step": 3109 + }, + { + "epoch": 0.3821108244256051, + "grad_norm": 1.3987002858598885, + "learning_rate": 2.329640735149113e-05, + "loss": 0.5813, + "step": 3110 + }, + { + "epoch": 0.38223368964246224, + "grad_norm": 1.600257613230404, + "learning_rate": 2.329104688805269e-05, + "loss": 0.7582, + "step": 3111 + }, + { + "epoch": 0.38235655485931935, + "grad_norm": 1.2598699201113055, + "learning_rate": 2.3285684899529948e-05, + "loss": 0.5529, + "step": 3112 + }, + { + "epoch": 0.3824794200761764, + "grad_norm": 1.306911075244212, + "learning_rate": 2.3280321386909203e-05, + "loss": 0.7384, + "step": 3113 + }, + { + "epoch": 0.38260228529303353, + "grad_norm": 1.1317120661591564, + "learning_rate": 2.3274956351177037e-05, + "loss": 0.6573, + "step": 3114 + }, + { + "epoch": 0.38272515050989064, + "grad_norm": 1.4172414426706152, + "learning_rate": 2.326958979332032e-05, + "loss": 0.676, + "step": 3115 + }, + { + "epoch": 0.38284801572674776, + "grad_norm": 1.173142625828647, + "learning_rate": 2.3264221714326182e-05, + "loss": 0.6793, + "step": 3116 + }, + { + "epoch": 0.3829708809436049, + "grad_norm": 1.6564340585049546, + "learning_rate": 2.325885211518206e-05, + "loss": 0.744, + "step": 3117 + }, + { + "epoch": 0.383093746160462, + "grad_norm": 1.2031537084123252, + "learning_rate": 2.3253480996875653e-05, + "loss": 0.6579, + "step": 3118 + }, + { + "epoch": 0.3832166113773191, + "grad_norm": 1.1616810714614563, + "learning_rate": 2.3248108360394942e-05, + "loss": 0.5813, + "step": 3119 + }, + { + "epoch": 0.38333947659417617, + "grad_norm": 1.4373256623716808, + "learning_rate": 2.3242734206728186e-05, + "loss": 0.7659, + "step": 3120 + }, + { + "epoch": 0.3834623418110333, + "grad_norm": 1.4399961118109796, + "learning_rate": 2.323735853686393e-05, + "loss": 0.6127, + "step": 3121 + }, + { + "epoch": 0.3835852070278904, + "grad_norm": 1.4570822759732374, + "learning_rate": 2.3231981351790993e-05, + "loss": 0.6319, + "step": 3122 + }, + { + "epoch": 0.3837080722447475, + "grad_norm": 1.215428865304125, + "learning_rate": 2.3226602652498473e-05, + "loss": 0.569, + "step": 3123 + }, + { + "epoch": 0.3838309374616046, + "grad_norm": 1.1488719202695636, + "learning_rate": 2.3221222439975748e-05, + "loss": 0.6472, + "step": 3124 + }, + { + "epoch": 0.38395380267846174, + "grad_norm": 1.145034954914683, + "learning_rate": 2.3215840715212467e-05, + "loss": 0.6023, + "step": 3125 + }, + { + "epoch": 0.38407666789531886, + "grad_norm": 1.1060677953722022, + "learning_rate": 2.3210457479198573e-05, + "loss": 0.556, + "step": 3126 + }, + { + "epoch": 0.3841995331121759, + "grad_norm": 1.465473984401948, + "learning_rate": 2.3205072732924266e-05, + "loss": 0.7223, + "step": 3127 + }, + { + "epoch": 0.38432239832903303, + "grad_norm": 1.3505774434187645, + "learning_rate": 2.3199686477380047e-05, + "loss": 0.6449, + "step": 3128 + }, + { + "epoch": 0.38444526354589015, + "grad_norm": 1.4057788598548848, + "learning_rate": 2.3194298713556676e-05, + "loss": 0.6154, + "step": 3129 + }, + { + "epoch": 0.38456812876274726, + "grad_norm": 1.3706875460672943, + "learning_rate": 2.3188909442445202e-05, + "loss": 0.573, + "step": 3130 + }, + { + "epoch": 
0.3846909939796044, + "grad_norm": 1.5368055841421846, + "learning_rate": 2.318351866503694e-05, + "loss": 0.766, + "step": 3131 + }, + { + "epoch": 0.3848138591964615, + "grad_norm": 1.2427877208963602, + "learning_rate": 2.3178126382323488e-05, + "loss": 0.6327, + "step": 3132 + }, + { + "epoch": 0.3849367244133186, + "grad_norm": 1.4923328543661019, + "learning_rate": 2.3172732595296727e-05, + "loss": 0.5986, + "step": 3133 + }, + { + "epoch": 0.3850595896301757, + "grad_norm": 1.2551929518285199, + "learning_rate": 2.316733730494881e-05, + "loss": 0.7245, + "step": 3134 + }, + { + "epoch": 0.3851824548470328, + "grad_norm": 1.808330218641065, + "learning_rate": 2.316194051227216e-05, + "loss": 0.6572, + "step": 3135 + }, + { + "epoch": 0.3853053200638899, + "grad_norm": 1.2338044807744633, + "learning_rate": 2.3156542218259485e-05, + "loss": 0.6245, + "step": 3136 + }, + { + "epoch": 0.385428185280747, + "grad_norm": 1.4663039728017604, + "learning_rate": 2.3151142423903765e-05, + "loss": 0.627, + "step": 3137 + }, + { + "epoch": 0.38555105049760413, + "grad_norm": 1.2747617512773768, + "learning_rate": 2.314574113019826e-05, + "loss": 0.6249, + "step": 3138 + }, + { + "epoch": 0.38567391571446125, + "grad_norm": 1.1351158354518418, + "learning_rate": 2.3140338338136505e-05, + "loss": 0.5741, + "step": 3139 + }, + { + "epoch": 0.38579678093131836, + "grad_norm": 1.3033155620339516, + "learning_rate": 2.31349340487123e-05, + "loss": 0.6037, + "step": 3140 + }, + { + "epoch": 0.3859196461481755, + "grad_norm": 1.203321248600611, + "learning_rate": 2.312952826291973e-05, + "loss": 0.5665, + "step": 3141 + }, + { + "epoch": 0.38604251136503254, + "grad_norm": 1.3544156349488452, + "learning_rate": 2.3124120981753164e-05, + "loss": 0.6239, + "step": 3142 + }, + { + "epoch": 0.38616537658188965, + "grad_norm": 1.3796261399412184, + "learning_rate": 2.311871220620723e-05, + "loss": 0.6914, + "step": 3143 + }, + { + "epoch": 0.38628824179874677, + "grad_norm": 1.3395557851223567, + "learning_rate": 2.3113301937276834e-05, + "loss": 0.6225, + "step": 3144 + }, + { + "epoch": 0.3864111070156039, + "grad_norm": 1.2130202009382083, + "learning_rate": 2.310789017595717e-05, + "loss": 0.5927, + "step": 3145 + }, + { + "epoch": 0.386533972232461, + "grad_norm": 1.2531027051100638, + "learning_rate": 2.310247692324368e-05, + "loss": 0.6578, + "step": 3146 + }, + { + "epoch": 0.3866568374493181, + "grad_norm": 1.1206652091879596, + "learning_rate": 2.3097062180132113e-05, + "loss": 0.7087, + "step": 3147 + }, + { + "epoch": 0.38677970266617523, + "grad_norm": 1.294485282229125, + "learning_rate": 2.3091645947618463e-05, + "loss": 0.6285, + "step": 3148 + }, + { + "epoch": 0.3869025678830323, + "grad_norm": 1.2529010878354239, + "learning_rate": 2.3086228226699023e-05, + "loss": 0.6203, + "step": 3149 + }, + { + "epoch": 0.3870254330998894, + "grad_norm": 1.326038525572919, + "learning_rate": 2.3080809018370338e-05, + "loss": 0.639, + "step": 3150 + }, + { + "epoch": 0.3871482983167465, + "grad_norm": 1.1958554236348393, + "learning_rate": 2.3075388323629242e-05, + "loss": 0.6997, + "step": 3151 + }, + { + "epoch": 0.38727116353360364, + "grad_norm": 1.2403462107313898, + "learning_rate": 2.3069966143472837e-05, + "loss": 0.6336, + "step": 3152 + }, + { + "epoch": 0.38739402875046075, + "grad_norm": 1.2804649917446485, + "learning_rate": 2.3064542478898494e-05, + "loss": 0.5514, + "step": 3153 + }, + { + "epoch": 0.38751689396731787, + "grad_norm": 1.2006425101010036, + "learning_rate": 
2.305911733090386e-05, + "loss": 0.6396, + "step": 3154 + }, + { + "epoch": 0.387639759184175, + "grad_norm": 1.6255698371566016, + "learning_rate": 2.305369070048686e-05, + "loss": 0.69, + "step": 3155 + }, + { + "epoch": 0.38776262440103204, + "grad_norm": 1.242586095370747, + "learning_rate": 2.304826258864569e-05, + "loss": 0.7377, + "step": 3156 + }, + { + "epoch": 0.38788548961788916, + "grad_norm": 1.336222000861688, + "learning_rate": 2.30428329963788e-05, + "loss": 0.5447, + "step": 3157 + }, + { + "epoch": 0.38800835483474627, + "grad_norm": 1.6041166700461131, + "learning_rate": 2.303740192468495e-05, + "loss": 0.6048, + "step": 3158 + }, + { + "epoch": 0.3881312200516034, + "grad_norm": 1.6130857755177876, + "learning_rate": 2.3031969374563137e-05, + "loss": 0.6089, + "step": 3159 + }, + { + "epoch": 0.3882540852684605, + "grad_norm": 1.3807524813772005, + "learning_rate": 2.302653534701265e-05, + "loss": 0.5442, + "step": 3160 + }, + { + "epoch": 0.3883769504853176, + "grad_norm": 1.2756225956443825, + "learning_rate": 2.3021099843033037e-05, + "loss": 0.7019, + "step": 3161 + }, + { + "epoch": 0.38849981570217473, + "grad_norm": 1.3308797797025091, + "learning_rate": 2.3015662863624124e-05, + "loss": 0.5988, + "step": 3162 + }, + { + "epoch": 0.38862268091903185, + "grad_norm": 1.262070107988154, + "learning_rate": 2.3010224409786016e-05, + "loss": 0.7164, + "step": 3163 + }, + { + "epoch": 0.3887455461358889, + "grad_norm": 1.0689202286246584, + "learning_rate": 2.300478448251907e-05, + "loss": 0.6055, + "step": 3164 + }, + { + "epoch": 0.388868411352746, + "grad_norm": 1.2023740944534262, + "learning_rate": 2.299934308282393e-05, + "loss": 0.6403, + "step": 3165 + }, + { + "epoch": 0.38899127656960314, + "grad_norm": 1.312873842398042, + "learning_rate": 2.2993900211701516e-05, + "loss": 0.6819, + "step": 3166 + }, + { + "epoch": 0.38911414178646025, + "grad_norm": 1.2862506598929728, + "learning_rate": 2.2988455870152995e-05, + "loss": 0.6167, + "step": 3167 + }, + { + "epoch": 0.38923700700331737, + "grad_norm": 1.5285739473499709, + "learning_rate": 2.2983010059179824e-05, + "loss": 0.6872, + "step": 3168 + }, + { + "epoch": 0.3893598722201745, + "grad_norm": 2.3820159916983856, + "learning_rate": 2.2977562779783726e-05, + "loss": 0.6585, + "step": 3169 + }, + { + "epoch": 0.3894827374370316, + "grad_norm": 1.302537875958053, + "learning_rate": 2.297211403296669e-05, + "loss": 0.7127, + "step": 3170 + }, + { + "epoch": 0.38960560265388866, + "grad_norm": 1.2526793558939684, + "learning_rate": 2.296666381973098e-05, + "loss": 0.6875, + "step": 3171 + }, + { + "epoch": 0.3897284678707458, + "grad_norm": 1.4077678438476966, + "learning_rate": 2.2961212141079123e-05, + "loss": 0.6735, + "step": 3172 + }, + { + "epoch": 0.3898513330876029, + "grad_norm": 1.3784288548771246, + "learning_rate": 2.2955758998013924e-05, + "loss": 0.6559, + "step": 3173 + }, + { + "epoch": 0.38997419830446, + "grad_norm": 1.456774300127324, + "learning_rate": 2.2950304391538453e-05, + "loss": 0.6592, + "step": 3174 + }, + { + "epoch": 0.3900970635213171, + "grad_norm": 1.0901773012356553, + "learning_rate": 2.2944848322656048e-05, + "loss": 0.6065, + "step": 3175 + }, + { + "epoch": 0.39021992873817424, + "grad_norm": 1.255412454356825, + "learning_rate": 2.2939390792370315e-05, + "loss": 0.6686, + "step": 3176 + }, + { + "epoch": 0.39034279395503135, + "grad_norm": 1.1986719650477806, + "learning_rate": 2.2933931801685137e-05, + "loss": 0.5304, + "step": 3177 + }, + { + "epoch": 
0.3904656591718884, + "grad_norm": 1.3724783116947679, + "learning_rate": 2.292847135160466e-05, + "loss": 0.6078, + "step": 3178 + }, + { + "epoch": 0.39058852438874553, + "grad_norm": 1.7342458172935287, + "learning_rate": 2.2923009443133294e-05, + "loss": 0.7165, + "step": 3179 + }, + { + "epoch": 0.39071138960560264, + "grad_norm": 1.2725680176165868, + "learning_rate": 2.2917546077275725e-05, + "loss": 0.6399, + "step": 3180 + }, + { + "epoch": 0.39083425482245976, + "grad_norm": 1.3734963812446788, + "learning_rate": 2.29120812550369e-05, + "loss": 0.6286, + "step": 3181 + }, + { + "epoch": 0.3909571200393169, + "grad_norm": 1.4049282046205358, + "learning_rate": 2.290661497742204e-05, + "loss": 0.5864, + "step": 3182 + }, + { + "epoch": 0.391079985256174, + "grad_norm": 1.238852097215092, + "learning_rate": 2.2901147245436635e-05, + "loss": 0.5276, + "step": 3183 + }, + { + "epoch": 0.3912028504730311, + "grad_norm": 1.404362352342043, + "learning_rate": 2.2895678060086432e-05, + "loss": 0.7803, + "step": 3184 + }, + { + "epoch": 0.39132571568988817, + "grad_norm": 1.3260638315433146, + "learning_rate": 2.289020742237745e-05, + "loss": 0.5056, + "step": 3185 + }, + { + "epoch": 0.3914485809067453, + "grad_norm": 1.3063314905135048, + "learning_rate": 2.288473533331599e-05, + "loss": 0.5841, + "step": 3186 + }, + { + "epoch": 0.3915714461236024, + "grad_norm": 1.369956406120543, + "learning_rate": 2.2879261793908596e-05, + "loss": 0.6445, + "step": 3187 + }, + { + "epoch": 0.3916943113404595, + "grad_norm": 1.2877857913658093, + "learning_rate": 2.2873786805162096e-05, + "loss": 0.6085, + "step": 3188 + }, + { + "epoch": 0.3918171765573166, + "grad_norm": 1.2441992800576496, + "learning_rate": 2.2868310368083578e-05, + "loss": 0.5688, + "step": 3189 + }, + { + "epoch": 0.39194004177417374, + "grad_norm": 1.1009481005316666, + "learning_rate": 2.2862832483680392e-05, + "loss": 0.6587, + "step": 3190 + }, + { + "epoch": 0.39206290699103086, + "grad_norm": 1.4990053466945141, + "learning_rate": 2.2857353152960165e-05, + "loss": 0.7823, + "step": 3191 + }, + { + "epoch": 0.392185772207888, + "grad_norm": 1.3398430480027306, + "learning_rate": 2.2851872376930777e-05, + "loss": 0.5784, + "step": 3192 + }, + { + "epoch": 0.39230863742474503, + "grad_norm": 1.186626427690252, + "learning_rate": 2.2846390156600395e-05, + "loss": 0.6655, + "step": 3193 + }, + { + "epoch": 0.39243150264160215, + "grad_norm": 1.4149618633784187, + "learning_rate": 2.284090649297742e-05, + "loss": 0.5873, + "step": 3194 + }, + { + "epoch": 0.39255436785845926, + "grad_norm": 1.4185054124790508, + "learning_rate": 2.2835421387070556e-05, + "loss": 0.6246, + "step": 3195 + }, + { + "epoch": 0.3926772330753164, + "grad_norm": 1.2675950034982193, + "learning_rate": 2.2829934839888732e-05, + "loss": 0.6652, + "step": 3196 + }, + { + "epoch": 0.3928000982921735, + "grad_norm": 1.3797889140399258, + "learning_rate": 2.2824446852441182e-05, + "loss": 0.6901, + "step": 3197 + }, + { + "epoch": 0.3929229635090306, + "grad_norm": 1.4499497710343199, + "learning_rate": 2.281895742573737e-05, + "loss": 0.582, + "step": 3198 + }, + { + "epoch": 0.3930458287258877, + "grad_norm": 1.2306390838652088, + "learning_rate": 2.281346656078705e-05, + "loss": 0.5767, + "step": 3199 + }, + { + "epoch": 0.3931686939427448, + "grad_norm": 1.1352118156419178, + "learning_rate": 2.2807974258600227e-05, + "loss": 0.576, + "step": 3200 + }, + { + "epoch": 0.3932915591596019, + "grad_norm": 1.3560556355186917, + "learning_rate": 
2.280248052018718e-05, + "loss": 0.6573, + "step": 3201 + }, + { + "epoch": 0.393414424376459, + "grad_norm": 1.3093441197502143, + "learning_rate": 2.2796985346558436e-05, + "loss": 0.5844, + "step": 3202 + }, + { + "epoch": 0.39353728959331613, + "grad_norm": 1.0949837376604656, + "learning_rate": 2.2791488738724807e-05, + "loss": 0.5613, + "step": 3203 + }, + { + "epoch": 0.39366015481017325, + "grad_norm": 1.1780485338935327, + "learning_rate": 2.2785990697697353e-05, + "loss": 0.5273, + "step": 3204 + }, + { + "epoch": 0.39378302002703036, + "grad_norm": 1.336867865788004, + "learning_rate": 2.2780491224487402e-05, + "loss": 0.6233, + "step": 3205 + }, + { + "epoch": 0.3939058852438875, + "grad_norm": 1.380179394459063, + "learning_rate": 2.2774990320106552e-05, + "loss": 0.6375, + "step": 3206 + }, + { + "epoch": 0.39402875046074454, + "grad_norm": 1.3226846765482512, + "learning_rate": 2.2769487985566653e-05, + "loss": 0.6798, + "step": 3207 + }, + { + "epoch": 0.39415161567760165, + "grad_norm": 1.2131165238090367, + "learning_rate": 2.2763984221879827e-05, + "loss": 0.6447, + "step": 3208 + }, + { + "epoch": 0.39427448089445877, + "grad_norm": 1.1948172570697202, + "learning_rate": 2.2758479030058453e-05, + "loss": 0.6115, + "step": 3209 + }, + { + "epoch": 0.3943973461113159, + "grad_norm": 1.2357057381140435, + "learning_rate": 2.275297241111518e-05, + "loss": 0.7365, + "step": 3210 + }, + { + "epoch": 0.394520211328173, + "grad_norm": 1.1149240727215048, + "learning_rate": 2.274746436606291e-05, + "loss": 0.5601, + "step": 3211 + }, + { + "epoch": 0.3946430765450301, + "grad_norm": 1.4218882875605054, + "learning_rate": 2.2741954895914813e-05, + "loss": 0.7394, + "step": 3212 + }, + { + "epoch": 0.39476594176188723, + "grad_norm": 1.1924781255813701, + "learning_rate": 2.273644400168432e-05, + "loss": 0.6534, + "step": 3213 + }, + { + "epoch": 0.39488880697874434, + "grad_norm": 1.2153778763577558, + "learning_rate": 2.273093168438513e-05, + "loss": 0.5918, + "step": 3214 + }, + { + "epoch": 0.3950116721956014, + "grad_norm": 1.3455511955649004, + "learning_rate": 2.272541794503119e-05, + "loss": 0.6062, + "step": 3215 + }, + { + "epoch": 0.3951345374124585, + "grad_norm": 1.2899533518816655, + "learning_rate": 2.271990278463672e-05, + "loss": 0.653, + "step": 3216 + }, + { + "epoch": 0.39525740262931564, + "grad_norm": 1.2139759356648276, + "learning_rate": 2.27143862042162e-05, + "loss": 0.5891, + "step": 3217 + }, + { + "epoch": 0.39538026784617275, + "grad_norm": 1.0789412445450366, + "learning_rate": 2.270886820478437e-05, + "loss": 0.6616, + "step": 3218 + }, + { + "epoch": 0.39550313306302987, + "grad_norm": 1.4744508666038183, + "learning_rate": 2.270334878735622e-05, + "loss": 0.6547, + "step": 3219 + }, + { + "epoch": 0.395625998279887, + "grad_norm": 1.2058387923570915, + "learning_rate": 2.2697827952947023e-05, + "loss": 0.5986, + "step": 3220 + }, + { + "epoch": 0.3957488634967441, + "grad_norm": 1.3127742272889749, + "learning_rate": 2.2692305702572295e-05, + "loss": 0.5398, + "step": 3221 + }, + { + "epoch": 0.39587172871360116, + "grad_norm": 1.1800218007309389, + "learning_rate": 2.268678203724782e-05, + "loss": 0.6745, + "step": 3222 + }, + { + "epoch": 0.39599459393045827, + "grad_norm": 1.2361848533247268, + "learning_rate": 2.268125695798964e-05, + "loss": 0.6423, + "step": 3223 + }, + { + "epoch": 0.3961174591473154, + "grad_norm": 1.4638412437777095, + "learning_rate": 2.2675730465814056e-05, + "loss": 0.7075, + "step": 3224 + }, + { + "epoch": 
0.3962403243641725, + "grad_norm": 1.195781149496282, + "learning_rate": 2.2670202561737635e-05, + "loss": 0.7178, + "step": 3225 + }, + { + "epoch": 0.3963631895810296, + "grad_norm": 1.3046837414849843, + "learning_rate": 2.2664673246777197e-05, + "loss": 0.6841, + "step": 3226 + }, + { + "epoch": 0.39648605479788673, + "grad_norm": 1.4220067382927764, + "learning_rate": 2.265914252194982e-05, + "loss": 0.7508, + "step": 3227 + }, + { + "epoch": 0.39660892001474385, + "grad_norm": 1.5038192682488487, + "learning_rate": 2.2653610388272842e-05, + "loss": 0.6757, + "step": 3228 + }, + { + "epoch": 0.3967317852316009, + "grad_norm": 1.5999611967983256, + "learning_rate": 2.2648076846763877e-05, + "loss": 0.6354, + "step": 3229 + }, + { + "epoch": 0.396854650448458, + "grad_norm": 1.4872140648984473, + "learning_rate": 2.2642541898440764e-05, + "loss": 0.6915, + "step": 3230 + }, + { + "epoch": 0.39697751566531514, + "grad_norm": 1.3775721360788562, + "learning_rate": 2.2637005544321645e-05, + "loss": 0.634, + "step": 3231 + }, + { + "epoch": 0.39710038088217225, + "grad_norm": 1.466706034094729, + "learning_rate": 2.2631467785424875e-05, + "loss": 0.5249, + "step": 3232 + }, + { + "epoch": 0.39722324609902937, + "grad_norm": 1.4567987451465292, + "learning_rate": 2.2625928622769105e-05, + "loss": 0.5729, + "step": 3233 + }, + { + "epoch": 0.3973461113158865, + "grad_norm": 1.247859831548399, + "learning_rate": 2.2620388057373216e-05, + "loss": 0.6488, + "step": 3234 + }, + { + "epoch": 0.3974689765327436, + "grad_norm": 1.154418462409934, + "learning_rate": 2.2614846090256366e-05, + "loss": 0.564, + "step": 3235 + }, + { + "epoch": 0.39759184174960066, + "grad_norm": 1.3963176369738308, + "learning_rate": 2.2609302722437958e-05, + "loss": 0.6263, + "step": 3236 + }, + { + "epoch": 0.3977147069664578, + "grad_norm": 1.3693709243698375, + "learning_rate": 2.2603757954937668e-05, + "loss": 0.706, + "step": 3237 + }, + { + "epoch": 0.3978375721833149, + "grad_norm": 1.236617387839573, + "learning_rate": 2.259821178877541e-05, + "loss": 0.6136, + "step": 3238 + }, + { + "epoch": 0.397960437400172, + "grad_norm": 1.5315521117059792, + "learning_rate": 2.259266422497137e-05, + "loss": 0.6277, + "step": 3239 + }, + { + "epoch": 0.3980833026170291, + "grad_norm": 1.312997781795286, + "learning_rate": 2.2587115264545984e-05, + "loss": 0.6497, + "step": 3240 + }, + { + "epoch": 0.39820616783388624, + "grad_norm": 1.2615721886938012, + "learning_rate": 2.2581564908519952e-05, + "loss": 0.71, + "step": 3241 + }, + { + "epoch": 0.39832903305074335, + "grad_norm": 1.1856660065793647, + "learning_rate": 2.2576013157914224e-05, + "loss": 0.6756, + "step": 3242 + }, + { + "epoch": 0.39845189826760047, + "grad_norm": 1.3359619961614464, + "learning_rate": 2.2570460013750012e-05, + "loss": 0.5679, + "step": 3243 + }, + { + "epoch": 0.39857476348445753, + "grad_norm": 1.4101835325816554, + "learning_rate": 2.2564905477048768e-05, + "loss": 0.672, + "step": 3244 + }, + { + "epoch": 0.39869762870131464, + "grad_norm": 1.3416340918874878, + "learning_rate": 2.2559349548832227e-05, + "loss": 0.6618, + "step": 3245 + }, + { + "epoch": 0.39882049391817176, + "grad_norm": 1.1949058258818597, + "learning_rate": 2.2553792230122357e-05, + "loss": 0.6133, + "step": 3246 + }, + { + "epoch": 0.3989433591350289, + "grad_norm": 1.2947800251575485, + "learning_rate": 2.25482335219414e-05, + "loss": 0.6409, + "step": 3247 + }, + { + "epoch": 0.399066224351886, + "grad_norm": 1.326907311998871, + "learning_rate": 
2.2542673425311834e-05, + "loss": 0.6191, + "step": 3248 + }, + { + "epoch": 0.3991890895687431, + "grad_norm": 1.2655865325521907, + "learning_rate": 2.2537111941256406e-05, + "loss": 0.6746, + "step": 3249 + }, + { + "epoch": 0.3993119547856002, + "grad_norm": 1.03690816188133, + "learning_rate": 2.2531549070798117e-05, + "loss": 0.616, + "step": 3250 + }, + { + "epoch": 0.3994348200024573, + "grad_norm": 1.6701155222644832, + "learning_rate": 2.252598481496022e-05, + "loss": 0.6188, + "step": 3251 + }, + { + "epoch": 0.3995576852193144, + "grad_norm": 1.4711854734380423, + "learning_rate": 2.252041917476623e-05, + "loss": 0.6402, + "step": 3252 + }, + { + "epoch": 0.3996805504361715, + "grad_norm": 1.2848058775409943, + "learning_rate": 2.2514852151239897e-05, + "loss": 0.573, + "step": 3253 + }, + { + "epoch": 0.3998034156530286, + "grad_norm": 1.1420749003535782, + "learning_rate": 2.250928374540525e-05, + "loss": 0.5564, + "step": 3254 + }, + { + "epoch": 0.39992628086988574, + "grad_norm": 1.4042175784174427, + "learning_rate": 2.250371395828656e-05, + "loss": 0.5693, + "step": 3255 + }, + { + "epoch": 0.40004914608674286, + "grad_norm": 1.2449548731695756, + "learning_rate": 2.2498142790908346e-05, + "loss": 0.6172, + "step": 3256 + }, + { + "epoch": 0.4001720113036, + "grad_norm": 1.203581423835913, + "learning_rate": 2.2492570244295395e-05, + "loss": 0.6144, + "step": 3257 + }, + { + "epoch": 0.40029487652045703, + "grad_norm": 1.3395528462472013, + "learning_rate": 2.248699631947274e-05, + "loss": 0.7415, + "step": 3258 + }, + { + "epoch": 0.40041774173731415, + "grad_norm": 1.2742380692823234, + "learning_rate": 2.2481421017465662e-05, + "loss": 0.564, + "step": 3259 + }, + { + "epoch": 0.40054060695417126, + "grad_norm": 1.3373372087386748, + "learning_rate": 2.2475844339299714e-05, + "loss": 0.6277, + "step": 3260 + }, + { + "epoch": 0.4006634721710284, + "grad_norm": 1.3764700481567107, + "learning_rate": 2.2470266286000672e-05, + "loss": 0.5544, + "step": 3261 + }, + { + "epoch": 0.4007863373878855, + "grad_norm": 1.2196638764476968, + "learning_rate": 2.24646868585946e-05, + "loss": 0.6219, + "step": 3262 + }, + { + "epoch": 0.4009092026047426, + "grad_norm": 1.0478105806511462, + "learning_rate": 2.2459106058107788e-05, + "loss": 0.6405, + "step": 3263 + }, + { + "epoch": 0.4010320678215997, + "grad_norm": 1.3118165149275716, + "learning_rate": 2.2453523885566794e-05, + "loss": 0.6238, + "step": 3264 + }, + { + "epoch": 0.40115493303845684, + "grad_norm": 1.5699851473408712, + "learning_rate": 2.244794034199842e-05, + "loss": 0.643, + "step": 3265 + }, + { + "epoch": 0.4012777982553139, + "grad_norm": 1.5145323255496057, + "learning_rate": 2.244235542842972e-05, + "loss": 0.6878, + "step": 3266 + }, + { + "epoch": 0.401400663472171, + "grad_norm": 1.5159057065925594, + "learning_rate": 2.2436769145888e-05, + "loss": 0.5359, + "step": 3267 + }, + { + "epoch": 0.40152352868902813, + "grad_norm": 1.5078388344358378, + "learning_rate": 2.243118149540083e-05, + "loss": 0.5959, + "step": 3268 + }, + { + "epoch": 0.40164639390588525, + "grad_norm": 1.146619347693364, + "learning_rate": 2.2425592477996012e-05, + "loss": 0.6745, + "step": 3269 + }, + { + "epoch": 0.40176925912274236, + "grad_norm": 1.767482303724687, + "learning_rate": 2.2420002094701615e-05, + "loss": 0.637, + "step": 3270 + }, + { + "epoch": 0.4018921243395995, + "grad_norm": 1.2166597745732455, + "learning_rate": 2.241441034654596e-05, + "loss": 0.5667, + "step": 3271 + }, + { + "epoch": 0.4020149895564566, + 
"grad_norm": 1.2376417054449578, + "learning_rate": 2.24088172345576e-05, + "loss": 0.7022, + "step": 3272 + }, + { + "epoch": 0.40213785477331365, + "grad_norm": 1.1907550450806428, + "learning_rate": 2.2403222759765358e-05, + "loss": 0.6045, + "step": 3273 + }, + { + "epoch": 0.40226071999017077, + "grad_norm": 1.5349718993399744, + "learning_rate": 2.23976269231983e-05, + "loss": 0.7598, + "step": 3274 + }, + { + "epoch": 0.4023835852070279, + "grad_norm": 1.4444610665867792, + "learning_rate": 2.239202972588575e-05, + "loss": 0.6682, + "step": 3275 + }, + { + "epoch": 0.402506450423885, + "grad_norm": 1.3220353503530338, + "learning_rate": 2.2386431168857263e-05, + "loss": 0.613, + "step": 3276 + }, + { + "epoch": 0.4026293156407421, + "grad_norm": 1.2983445936803282, + "learning_rate": 2.2380831253142673e-05, + "loss": 0.6995, + "step": 3277 + }, + { + "epoch": 0.40275218085759923, + "grad_norm": 1.0340427808167714, + "learning_rate": 2.2375229979772034e-05, + "loss": 0.588, + "step": 3278 + }, + { + "epoch": 0.40287504607445634, + "grad_norm": 1.2036746156684257, + "learning_rate": 2.2369627349775673e-05, + "loss": 0.519, + "step": 3279 + }, + { + "epoch": 0.4029979112913134, + "grad_norm": 1.1995868213116705, + "learning_rate": 2.2364023364184154e-05, + "loss": 0.5328, + "step": 3280 + }, + { + "epoch": 0.4031207765081705, + "grad_norm": 1.340386025255557, + "learning_rate": 2.2358418024028294e-05, + "loss": 0.6464, + "step": 3281 + }, + { + "epoch": 0.40324364172502764, + "grad_norm": 1.3726389374293992, + "learning_rate": 2.2352811330339164e-05, + "loss": 0.6609, + "step": 3282 + }, + { + "epoch": 0.40336650694188475, + "grad_norm": 1.2788012353103457, + "learning_rate": 2.234720328414807e-05, + "loss": 0.5462, + "step": 3283 + }, + { + "epoch": 0.40348937215874187, + "grad_norm": 1.254837863480527, + "learning_rate": 2.2341593886486584e-05, + "loss": 0.624, + "step": 3284 + }, + { + "epoch": 0.403612237375599, + "grad_norm": 1.2853586091814015, + "learning_rate": 2.2335983138386513e-05, + "loss": 0.6424, + "step": 3285 + }, + { + "epoch": 0.4037351025924561, + "grad_norm": 1.2200355675824899, + "learning_rate": 2.2330371040879914e-05, + "loss": 0.5413, + "step": 3286 + }, + { + "epoch": 0.40385796780931316, + "grad_norm": 1.2878276398279134, + "learning_rate": 2.232475759499911e-05, + "loss": 0.7391, + "step": 3287 + }, + { + "epoch": 0.40398083302617027, + "grad_norm": 1.218015649223548, + "learning_rate": 2.2319142801776637e-05, + "loss": 0.6272, + "step": 3288 + }, + { + "epoch": 0.4041036982430274, + "grad_norm": 1.1986055074291175, + "learning_rate": 2.2313526662245324e-05, + "loss": 0.5546, + "step": 3289 + }, + { + "epoch": 0.4042265634598845, + "grad_norm": 1.1198276669215759, + "learning_rate": 2.2307909177438205e-05, + "loss": 0.5789, + "step": 3290 + }, + { + "epoch": 0.4043494286767416, + "grad_norm": 1.4579383955347107, + "learning_rate": 2.230229034838859e-05, + "loss": 0.7434, + "step": 3291 + }, + { + "epoch": 0.40447229389359873, + "grad_norm": 1.7663899913204821, + "learning_rate": 2.229667017613002e-05, + "loss": 0.6948, + "step": 3292 + }, + { + "epoch": 0.40459515911045585, + "grad_norm": 1.5490562994452994, + "learning_rate": 2.229104866169629e-05, + "loss": 0.5501, + "step": 3293 + }, + { + "epoch": 0.40471802432731296, + "grad_norm": 1.3236869479302555, + "learning_rate": 2.2285425806121446e-05, + "loss": 0.7112, + "step": 3294 + }, + { + "epoch": 0.40484088954417, + "grad_norm": 1.4691450430078279, + "learning_rate": 2.2279801610439768e-05, + "loss": 
0.6875, + "step": 3295 + }, + { + "epoch": 0.40496375476102714, + "grad_norm": 1.336442650731409, + "learning_rate": 2.22741760756858e-05, + "loss": 0.7052, + "step": 3296 + }, + { + "epoch": 0.40508661997788425, + "grad_norm": 1.2275347467613416, + "learning_rate": 2.2268549202894314e-05, + "loss": 0.5856, + "step": 3297 + }, + { + "epoch": 0.40520948519474137, + "grad_norm": 1.4220932076997166, + "learning_rate": 2.2262920993100345e-05, + "loss": 0.613, + "step": 3298 + }, + { + "epoch": 0.4053323504115985, + "grad_norm": 1.485283443779952, + "learning_rate": 2.2257291447339157e-05, + "loss": 0.6244, + "step": 3299 + }, + { + "epoch": 0.4054552156284556, + "grad_norm": 1.5237046155388518, + "learning_rate": 2.2251660566646275e-05, + "loss": 0.7001, + "step": 3300 + }, + { + "epoch": 0.4055780808453127, + "grad_norm": 1.501783904885649, + "learning_rate": 2.2246028352057457e-05, + "loss": 0.7254, + "step": 3301 + }, + { + "epoch": 0.4057009460621698, + "grad_norm": 3.033988770552471, + "learning_rate": 2.224039480460872e-05, + "loss": 0.6636, + "step": 3302 + }, + { + "epoch": 0.4058238112790269, + "grad_norm": 1.35346960946082, + "learning_rate": 2.2234759925336312e-05, + "loss": 0.6745, + "step": 3303 + }, + { + "epoch": 0.405946676495884, + "grad_norm": 1.3120245620804625, + "learning_rate": 2.222912371527674e-05, + "loss": 0.7097, + "step": 3304 + }, + { + "epoch": 0.4060695417127411, + "grad_norm": 1.4771858510826843, + "learning_rate": 2.2223486175466734e-05, + "loss": 0.6112, + "step": 3305 + }, + { + "epoch": 0.40619240692959824, + "grad_norm": 1.1790424136623359, + "learning_rate": 2.2217847306943298e-05, + "loss": 0.6156, + "step": 3306 + }, + { + "epoch": 0.40631527214645535, + "grad_norm": 1.3462765175868245, + "learning_rate": 2.2212207110743655e-05, + "loss": 0.592, + "step": 3307 + }, + { + "epoch": 0.40643813736331247, + "grad_norm": 1.2257372200965577, + "learning_rate": 2.220656558790529e-05, + "loss": 0.701, + "step": 3308 + }, + { + "epoch": 0.40656100258016953, + "grad_norm": 1.6782583017049542, + "learning_rate": 2.2200922739465915e-05, + "loss": 0.7204, + "step": 3309 + }, + { + "epoch": 0.40668386779702664, + "grad_norm": 1.3454425566535098, + "learning_rate": 2.219527856646351e-05, + "loss": 0.7044, + "step": 3310 + }, + { + "epoch": 0.40680673301388376, + "grad_norm": 1.3718040496771688, + "learning_rate": 2.2189633069936273e-05, + "loss": 0.7009, + "step": 3311 + }, + { + "epoch": 0.4069295982307409, + "grad_norm": 1.203467900709806, + "learning_rate": 2.2183986250922663e-05, + "loss": 0.5794, + "step": 3312 + }, + { + "epoch": 0.407052463447598, + "grad_norm": 1.3560079022897504, + "learning_rate": 2.2178338110461365e-05, + "loss": 0.6078, + "step": 3313 + }, + { + "epoch": 0.4071753286644551, + "grad_norm": 1.4533278317230367, + "learning_rate": 2.2172688649591325e-05, + "loss": 0.7051, + "step": 3314 + }, + { + "epoch": 0.4072981938813122, + "grad_norm": 1.3760954883701098, + "learning_rate": 2.2167037869351728e-05, + "loss": 0.6702, + "step": 3315 + }, + { + "epoch": 0.40742105909816934, + "grad_norm": 1.1818077196663601, + "learning_rate": 2.2161385770781994e-05, + "loss": 0.5981, + "step": 3316 + }, + { + "epoch": 0.4075439243150264, + "grad_norm": 1.4784620306465266, + "learning_rate": 2.215573235492179e-05, + "loss": 0.5883, + "step": 3317 + }, + { + "epoch": 0.4076667895318835, + "grad_norm": 1.5499146609981964, + "learning_rate": 2.2150077622811024e-05, + "loss": 0.6884, + "step": 3318 + }, + { + "epoch": 0.4077896547487406, + "grad_norm": 
1.5208026898802873, + "learning_rate": 2.2144421575489853e-05, + "loss": 0.558, + "step": 3319 + }, + { + "epoch": 0.40791251996559774, + "grad_norm": 1.2553723609396739, + "learning_rate": 2.2138764213998666e-05, + "loss": 0.5621, + "step": 3320 + }, + { + "epoch": 0.40803538518245486, + "grad_norm": 1.4505094275698664, + "learning_rate": 2.2133105539378103e-05, + "loss": 0.6816, + "step": 3321 + }, + { + "epoch": 0.408158250399312, + "grad_norm": 1.37000915940128, + "learning_rate": 2.212744555266903e-05, + "loss": 0.7299, + "step": 3322 + }, + { + "epoch": 0.4082811156161691, + "grad_norm": 1.3584347803342223, + "learning_rate": 2.2121784254912568e-05, + "loss": 0.7081, + "step": 3323 + }, + { + "epoch": 0.40840398083302615, + "grad_norm": 1.135022144859272, + "learning_rate": 2.211612164715008e-05, + "loss": 0.5862, + "step": 3324 + }, + { + "epoch": 0.40852684604988326, + "grad_norm": 1.2060920024357518, + "learning_rate": 2.211045773042317e-05, + "loss": 0.6277, + "step": 3325 + }, + { + "epoch": 0.4086497112667404, + "grad_norm": 1.2447633650053305, + "learning_rate": 2.2104792505773666e-05, + "loss": 0.5815, + "step": 3326 + }, + { + "epoch": 0.4087725764835975, + "grad_norm": 1.2954537472941674, + "learning_rate": 2.209912597424366e-05, + "loss": 0.6568, + "step": 3327 + }, + { + "epoch": 0.4088954417004546, + "grad_norm": 1.601495140382785, + "learning_rate": 2.209345813687547e-05, + "loss": 0.5562, + "step": 3328 + }, + { + "epoch": 0.4090183069173117, + "grad_norm": 1.49247132385456, + "learning_rate": 2.208778899471166e-05, + "loss": 0.7212, + "step": 3329 + }, + { + "epoch": 0.40914117213416884, + "grad_norm": 1.3380562287583588, + "learning_rate": 2.2082118548795034e-05, + "loss": 0.6611, + "step": 3330 + }, + { + "epoch": 0.4092640373510259, + "grad_norm": 1.27705228659089, + "learning_rate": 2.2076446800168624e-05, + "loss": 0.6419, + "step": 3331 + }, + { + "epoch": 0.409386902567883, + "grad_norm": 1.1785651779798292, + "learning_rate": 2.207077374987572e-05, + "loss": 0.4905, + "step": 3332 + }, + { + "epoch": 0.40950976778474013, + "grad_norm": 1.3599174398547398, + "learning_rate": 2.2065099398959837e-05, + "loss": 0.7168, + "step": 3333 + }, + { + "epoch": 0.40963263300159725, + "grad_norm": 1.3669693097557376, + "learning_rate": 2.205942374846474e-05, + "loss": 0.7672, + "step": 3334 + }, + { + "epoch": 0.40975549821845436, + "grad_norm": 1.3169453862622456, + "learning_rate": 2.205374679943443e-05, + "loss": 0.5095, + "step": 3335 + }, + { + "epoch": 0.4098783634353115, + "grad_norm": 1.5135642124710704, + "learning_rate": 2.2048068552913136e-05, + "loss": 0.6174, + "step": 3336 + }, + { + "epoch": 0.4100012286521686, + "grad_norm": 1.2634133604573028, + "learning_rate": 2.204238900994534e-05, + "loss": 0.5551, + "step": 3337 + }, + { + "epoch": 0.41012409386902565, + "grad_norm": 1.1351072817720993, + "learning_rate": 2.2036708171575763e-05, + "loss": 0.6424, + "step": 3338 + }, + { + "epoch": 0.41024695908588277, + "grad_norm": 1.4369111819309996, + "learning_rate": 2.2031026038849353e-05, + "loss": 0.5755, + "step": 3339 + }, + { + "epoch": 0.4103698243027399, + "grad_norm": 1.2223499057182563, + "learning_rate": 2.2025342612811297e-05, + "loss": 0.5662, + "step": 3340 + }, + { + "epoch": 0.410492689519597, + "grad_norm": 1.4028540062997914, + "learning_rate": 2.2019657894507027e-05, + "loss": 0.5935, + "step": 3341 + }, + { + "epoch": 0.4106155547364541, + "grad_norm": 1.4288790133396314, + "learning_rate": 2.2013971884982212e-05, + "loss": 0.614, + "step": 
3342 + }, + { + "epoch": 0.41073841995331123, + "grad_norm": 1.3997114604287628, + "learning_rate": 2.200828458528276e-05, + "loss": 0.5798, + "step": 3343 + }, + { + "epoch": 0.41086128517016834, + "grad_norm": 1.4587483833203176, + "learning_rate": 2.2002595996454805e-05, + "loss": 0.6589, + "step": 3344 + }, + { + "epoch": 0.41098415038702546, + "grad_norm": 1.777131129814559, + "learning_rate": 2.199690611954473e-05, + "loss": 0.6547, + "step": 3345 + }, + { + "epoch": 0.4111070156038825, + "grad_norm": 1.3646672372065574, + "learning_rate": 2.199121495559915e-05, + "loss": 0.6188, + "step": 3346 + }, + { + "epoch": 0.41122988082073964, + "grad_norm": 1.2692354754169843, + "learning_rate": 2.198552250566492e-05, + "loss": 0.6258, + "step": 3347 + }, + { + "epoch": 0.41135274603759675, + "grad_norm": 1.3169315900194807, + "learning_rate": 2.197982877078913e-05, + "loss": 0.7404, + "step": 3348 + }, + { + "epoch": 0.41147561125445387, + "grad_norm": 1.40143445944332, + "learning_rate": 2.19741337520191e-05, + "loss": 0.7362, + "step": 3349 + }, + { + "epoch": 0.411598476471311, + "grad_norm": 1.3859193247092367, + "learning_rate": 2.19684374504024e-05, + "loss": 0.6806, + "step": 3350 + }, + { + "epoch": 0.4117213416881681, + "grad_norm": 1.33280325276953, + "learning_rate": 2.1962739866986816e-05, + "loss": 0.5798, + "step": 3351 + }, + { + "epoch": 0.4118442069050252, + "grad_norm": 1.3256621988653419, + "learning_rate": 2.195704100282039e-05, + "loss": 0.544, + "step": 3352 + }, + { + "epoch": 0.41196707212188227, + "grad_norm": 1.1064679260853545, + "learning_rate": 2.1951340858951392e-05, + "loss": 0.5967, + "step": 3353 + }, + { + "epoch": 0.4120899373387394, + "grad_norm": 1.5492112091747514, + "learning_rate": 2.1945639436428324e-05, + "loss": 0.6985, + "step": 3354 + }, + { + "epoch": 0.4122128025555965, + "grad_norm": 1.5057146060254767, + "learning_rate": 2.1939936736299925e-05, + "loss": 0.6458, + "step": 3355 + }, + { + "epoch": 0.4123356677724536, + "grad_norm": 1.2693686722541415, + "learning_rate": 2.1934232759615168e-05, + "loss": 0.6546, + "step": 3356 + }, + { + "epoch": 0.41245853298931073, + "grad_norm": 1.5489566543222992, + "learning_rate": 2.192852750742327e-05, + "loss": 0.6547, + "step": 3357 + }, + { + "epoch": 0.41258139820616785, + "grad_norm": 1.6644369613343506, + "learning_rate": 2.1922820980773667e-05, + "loss": 0.5725, + "step": 3358 + }, + { + "epoch": 0.41270426342302496, + "grad_norm": 1.3714662477151225, + "learning_rate": 2.1917113180716044e-05, + "loss": 0.6029, + "step": 3359 + }, + { + "epoch": 0.412827128639882, + "grad_norm": 1.4498702737426121, + "learning_rate": 2.1911404108300307e-05, + "loss": 0.586, + "step": 3360 + }, + { + "epoch": 0.41294999385673914, + "grad_norm": 1.3331092975969738, + "learning_rate": 2.1905693764576608e-05, + "loss": 0.6558, + "step": 3361 + }, + { + "epoch": 0.41307285907359625, + "grad_norm": 1.3440295030078897, + "learning_rate": 2.1899982150595324e-05, + "loss": 0.7367, + "step": 3362 + }, + { + "epoch": 0.41319572429045337, + "grad_norm": 1.360115067429757, + "learning_rate": 2.189426926740707e-05, + "loss": 0.6462, + "step": 3363 + }, + { + "epoch": 0.4133185895073105, + "grad_norm": 1.5413844042630853, + "learning_rate": 2.18885551160627e-05, + "loss": 0.6352, + "step": 3364 + }, + { + "epoch": 0.4134414547241676, + "grad_norm": 1.4236456054226783, + "learning_rate": 2.1882839697613286e-05, + "loss": 0.6409, + "step": 3365 + }, + { + "epoch": 0.4135643199410247, + "grad_norm": 1.4972867769243716, + 
"learning_rate": 2.1877123013110146e-05, + "loss": 0.6043, + "step": 3366 + }, + { + "epoch": 0.4136871851578818, + "grad_norm": 1.3072007078254688, + "learning_rate": 2.187140506360483e-05, + "loss": 0.5098, + "step": 3367 + }, + { + "epoch": 0.4138100503747389, + "grad_norm": 1.8102966396025628, + "learning_rate": 2.186568585014912e-05, + "loss": 0.6173, + "step": 3368 + }, + { + "epoch": 0.413932915591596, + "grad_norm": 1.3244758532255911, + "learning_rate": 2.1859965373795018e-05, + "loss": 0.5819, + "step": 3369 + }, + { + "epoch": 0.4140557808084531, + "grad_norm": 1.340206361974584, + "learning_rate": 2.185424363559477e-05, + "loss": 0.6917, + "step": 3370 + }, + { + "epoch": 0.41417864602531024, + "grad_norm": 1.2939069265616907, + "learning_rate": 2.1848520636600863e-05, + "loss": 0.6667, + "step": 3371 + }, + { + "epoch": 0.41430151124216735, + "grad_norm": 1.504537074123858, + "learning_rate": 2.1842796377865995e-05, + "loss": 0.6434, + "step": 3372 + }, + { + "epoch": 0.41442437645902447, + "grad_norm": 1.4332700733871364, + "learning_rate": 2.1837070860443115e-05, + "loss": 0.5452, + "step": 3373 + }, + { + "epoch": 0.4145472416758816, + "grad_norm": 1.3219964455846296, + "learning_rate": 2.1831344085385386e-05, + "loss": 0.6845, + "step": 3374 + }, + { + "epoch": 0.41467010689273864, + "grad_norm": 1.4780072827624873, + "learning_rate": 2.182561605374622e-05, + "loss": 0.693, + "step": 3375 + }, + { + "epoch": 0.41479297210959576, + "grad_norm": 1.3665358934227405, + "learning_rate": 2.181988676657924e-05, + "loss": 0.5896, + "step": 3376 + }, + { + "epoch": 0.4149158373264529, + "grad_norm": 1.280974580048406, + "learning_rate": 2.1814156224938322e-05, + "loss": 0.5872, + "step": 3377 + }, + { + "epoch": 0.41503870254331, + "grad_norm": 1.4332057151140534, + "learning_rate": 2.1808424429877557e-05, + "loss": 0.5861, + "step": 3378 + }, + { + "epoch": 0.4151615677601671, + "grad_norm": 1.233056371667068, + "learning_rate": 2.1802691382451272e-05, + "loss": 0.5773, + "step": 3379 + }, + { + "epoch": 0.4152844329770242, + "grad_norm": 1.2729009992526377, + "learning_rate": 2.1796957083714022e-05, + "loss": 0.564, + "step": 3380 + }, + { + "epoch": 0.41540729819388134, + "grad_norm": 1.5072817817307798, + "learning_rate": 2.17912215347206e-05, + "loss": 0.7989, + "step": 3381 + }, + { + "epoch": 0.4155301634107384, + "grad_norm": 1.3245118975015144, + "learning_rate": 2.1785484736526017e-05, + "loss": 0.5678, + "step": 3382 + }, + { + "epoch": 0.4156530286275955, + "grad_norm": 1.7904570669143907, + "learning_rate": 2.1779746690185522e-05, + "loss": 0.7071, + "step": 3383 + }, + { + "epoch": 0.4157758938444526, + "grad_norm": 1.3961431957083748, + "learning_rate": 2.1774007396754594e-05, + "loss": 0.5765, + "step": 3384 + }, + { + "epoch": 0.41589875906130974, + "grad_norm": 1.3560826516302678, + "learning_rate": 2.1768266857288934e-05, + "loss": 0.5877, + "step": 3385 + }, + { + "epoch": 0.41602162427816686, + "grad_norm": 1.4253043070269829, + "learning_rate": 2.176252507284448e-05, + "loss": 0.665, + "step": 3386 + }, + { + "epoch": 0.416144489495024, + "grad_norm": 1.3409621135554424, + "learning_rate": 2.1756782044477397e-05, + "loss": 0.7059, + "step": 3387 + }, + { + "epoch": 0.4162673547118811, + "grad_norm": 1.2565212619731896, + "learning_rate": 2.1751037773244075e-05, + "loss": 0.5764, + "step": 3388 + }, + { + "epoch": 0.41639021992873815, + "grad_norm": 1.163030535878867, + "learning_rate": 2.1745292260201137e-05, + "loss": 0.5689, + "step": 3389 + }, + { + 
"epoch": 0.41651308514559526, + "grad_norm": 1.291670492490088, + "learning_rate": 2.173954550640543e-05, + "loss": 0.6251, + "step": 3390 + }, + { + "epoch": 0.4166359503624524, + "grad_norm": 1.5480672066997876, + "learning_rate": 2.1733797512914035e-05, + "loss": 0.6169, + "step": 3391 + }, + { + "epoch": 0.4167588155793095, + "grad_norm": 1.2568186418753893, + "learning_rate": 2.1728048280784264e-05, + "loss": 0.6651, + "step": 3392 + }, + { + "epoch": 0.4168816807961666, + "grad_norm": 1.5434019733246809, + "learning_rate": 2.172229781107364e-05, + "loss": 0.716, + "step": 3393 + }, + { + "epoch": 0.4170045460130237, + "grad_norm": 1.3383956630595795, + "learning_rate": 2.1716546104839928e-05, + "loss": 0.6713, + "step": 3394 + }, + { + "epoch": 0.41712741122988084, + "grad_norm": 1.0622292640538367, + "learning_rate": 2.1710793163141117e-05, + "loss": 0.5943, + "step": 3395 + }, + { + "epoch": 0.41725027644673796, + "grad_norm": 1.3123530742270229, + "learning_rate": 2.170503898703543e-05, + "loss": 0.6173, + "step": 3396 + }, + { + "epoch": 0.417373141663595, + "grad_norm": 1.3353647895636316, + "learning_rate": 2.1699283577581302e-05, + "loss": 0.6253, + "step": 3397 + }, + { + "epoch": 0.41749600688045213, + "grad_norm": 1.0919582417341764, + "learning_rate": 2.1693526935837405e-05, + "loss": 0.7299, + "step": 3398 + }, + { + "epoch": 0.41761887209730925, + "grad_norm": 1.3949519704704174, + "learning_rate": 2.168776906286264e-05, + "loss": 0.5954, + "step": 3399 + }, + { + "epoch": 0.41774173731416636, + "grad_norm": 1.2770639765182688, + "learning_rate": 2.1682009959716127e-05, + "loss": 0.6256, + "step": 3400 + }, + { + "epoch": 0.4178646025310235, + "grad_norm": 1.1621149078660744, + "learning_rate": 2.1676249627457218e-05, + "loss": 0.5682, + "step": 3401 + }, + { + "epoch": 0.4179874677478806, + "grad_norm": 1.2223743131288431, + "learning_rate": 2.167048806714548e-05, + "loss": 0.5749, + "step": 3402 + }, + { + "epoch": 0.4181103329647377, + "grad_norm": 1.3940481688979862, + "learning_rate": 2.1664725279840727e-05, + "loss": 0.5716, + "step": 3403 + }, + { + "epoch": 0.41823319818159477, + "grad_norm": 2.4849102096996627, + "learning_rate": 2.1658961266602984e-05, + "loss": 0.4899, + "step": 3404 + }, + { + "epoch": 0.4183560633984519, + "grad_norm": 1.499803879770401, + "learning_rate": 2.1653196028492495e-05, + "loss": 0.6463, + "step": 3405 + }, + { + "epoch": 0.418478928615309, + "grad_norm": 1.3694226653077874, + "learning_rate": 2.1647429566569745e-05, + "loss": 0.754, + "step": 3406 + }, + { + "epoch": 0.4186017938321661, + "grad_norm": 1.7315060066167258, + "learning_rate": 2.164166188189544e-05, + "loss": 0.7452, + "step": 3407 + }, + { + "epoch": 0.41872465904902323, + "grad_norm": 1.4569230185600441, + "learning_rate": 2.16358929755305e-05, + "loss": 0.6706, + "step": 3408 + }, + { + "epoch": 0.41884752426588034, + "grad_norm": 1.4491424340004655, + "learning_rate": 2.1630122848536087e-05, + "loss": 0.5865, + "step": 3409 + }, + { + "epoch": 0.41897038948273746, + "grad_norm": 1.2376531672144617, + "learning_rate": 2.162435150197357e-05, + "loss": 0.5798, + "step": 3410 + }, + { + "epoch": 0.4190932546995945, + "grad_norm": 1.3110796923503247, + "learning_rate": 2.1618578936904552e-05, + "loss": 0.7114, + "step": 3411 + }, + { + "epoch": 0.41921611991645163, + "grad_norm": 1.3800602090839078, + "learning_rate": 2.1612805154390868e-05, + "loss": 0.555, + "step": 3412 + }, + { + "epoch": 0.41933898513330875, + "grad_norm": 1.2511616870094646, + "learning_rate": 
2.160703015549456e-05, + "loss": 0.7556, + "step": 3413 + }, + { + "epoch": 0.41946185035016587, + "grad_norm": 1.2472848015219806, + "learning_rate": 2.1601253941277906e-05, + "loss": 0.6803, + "step": 3414 + }, + { + "epoch": 0.419584715567023, + "grad_norm": 1.343935405178687, + "learning_rate": 2.1595476512803397e-05, + "loss": 0.5542, + "step": 3415 + }, + { + "epoch": 0.4197075807838801, + "grad_norm": 1.2314846800941737, + "learning_rate": 2.158969787113375e-05, + "loss": 0.5618, + "step": 3416 + }, + { + "epoch": 0.4198304460007372, + "grad_norm": 1.1608513718561941, + "learning_rate": 2.1583918017331925e-05, + "loss": 0.7079, + "step": 3417 + }, + { + "epoch": 0.41995331121759427, + "grad_norm": 1.2781343848868714, + "learning_rate": 2.1578136952461073e-05, + "loss": 0.5892, + "step": 3418 + }, + { + "epoch": 0.4200761764344514, + "grad_norm": 1.34345790062968, + "learning_rate": 2.157235467758459e-05, + "loss": 0.8701, + "step": 3419 + }, + { + "epoch": 0.4201990416513085, + "grad_norm": 1.1791554080197288, + "learning_rate": 2.156657119376609e-05, + "loss": 0.5343, + "step": 3420 + }, + { + "epoch": 0.4203219068681656, + "grad_norm": 1.2500766333049922, + "learning_rate": 2.1560786502069398e-05, + "loss": 0.6451, + "step": 3421 + }, + { + "epoch": 0.42044477208502273, + "grad_norm": 1.344315547591989, + "learning_rate": 2.1555000603558588e-05, + "loss": 0.7266, + "step": 3422 + }, + { + "epoch": 0.42056763730187985, + "grad_norm": 1.30963704647146, + "learning_rate": 2.154921349929792e-05, + "loss": 0.8358, + "step": 3423 + }, + { + "epoch": 0.42069050251873696, + "grad_norm": 1.1505459951032426, + "learning_rate": 2.1543425190351908e-05, + "loss": 0.6363, + "step": 3424 + }, + { + "epoch": 0.4208133677355941, + "grad_norm": 1.223232236262694, + "learning_rate": 2.153763567778526e-05, + "loss": 0.6237, + "step": 3425 + }, + { + "epoch": 0.42093623295245114, + "grad_norm": 1.3456357796447265, + "learning_rate": 2.1531844962662933e-05, + "loss": 0.6217, + "step": 3426 + }, + { + "epoch": 0.42105909816930825, + "grad_norm": 1.3688335997056595, + "learning_rate": 2.152605304605008e-05, + "loss": 0.7334, + "step": 3427 + }, + { + "epoch": 0.42118196338616537, + "grad_norm": 1.4548641596744412, + "learning_rate": 2.15202599290121e-05, + "loss": 0.5867, + "step": 3428 + }, + { + "epoch": 0.4213048286030225, + "grad_norm": 1.651687531284175, + "learning_rate": 2.1514465612614583e-05, + "loss": 0.536, + "step": 3429 + }, + { + "epoch": 0.4214276938198796, + "grad_norm": 1.3287960741023768, + "learning_rate": 2.150867009792337e-05, + "loss": 0.6307, + "step": 3430 + }, + { + "epoch": 0.4215505590367367, + "grad_norm": 1.7871094662108038, + "learning_rate": 2.1502873386004498e-05, + "loss": 0.6904, + "step": 3431 + }, + { + "epoch": 0.42167342425359383, + "grad_norm": 1.2022194499429109, + "learning_rate": 2.1497075477924245e-05, + "loss": 0.6226, + "step": 3432 + }, + { + "epoch": 0.4217962894704509, + "grad_norm": 1.2532615433842964, + "learning_rate": 2.149127637474909e-05, + "loss": 0.5678, + "step": 3433 + }, + { + "epoch": 0.421919154687308, + "grad_norm": 1.462332733116907, + "learning_rate": 2.1485476077545745e-05, + "loss": 0.6775, + "step": 3434 + }, + { + "epoch": 0.4220420199041651, + "grad_norm": 1.3394230906627682, + "learning_rate": 2.1479674587381136e-05, + "loss": 0.671, + "step": 3435 + }, + { + "epoch": 0.42216488512102224, + "grad_norm": 2.04789210122639, + "learning_rate": 2.1473871905322406e-05, + "loss": 0.5973, + "step": 3436 + }, + { + "epoch": 0.42228775033787935, 
+ "grad_norm": 1.3532358978264498, + "learning_rate": 2.146806803243692e-05, + "loss": 0.6536, + "step": 3437 + }, + { + "epoch": 0.42241061555473647, + "grad_norm": 1.6904486181712066, + "learning_rate": 2.1462262969792272e-05, + "loss": 0.7115, + "step": 3438 + }, + { + "epoch": 0.4225334807715936, + "grad_norm": 1.447254507414375, + "learning_rate": 2.1456456718456256e-05, + "loss": 0.6739, + "step": 3439 + }, + { + "epoch": 0.42265634598845064, + "grad_norm": 1.1858845413896495, + "learning_rate": 2.1450649279496903e-05, + "loss": 0.6731, + "step": 3440 + }, + { + "epoch": 0.42277921120530776, + "grad_norm": 1.4475692848098725, + "learning_rate": 2.1444840653982447e-05, + "loss": 0.6204, + "step": 3441 + }, + { + "epoch": 0.4229020764221649, + "grad_norm": 1.4112078253892182, + "learning_rate": 2.143903084298135e-05, + "loss": 0.6835, + "step": 3442 + }, + { + "epoch": 0.423024941639022, + "grad_norm": 1.0785855388379113, + "learning_rate": 2.1433219847562287e-05, + "loss": 0.5242, + "step": 3443 + }, + { + "epoch": 0.4231478068558791, + "grad_norm": 1.2738292461621952, + "learning_rate": 2.1427407668794152e-05, + "loss": 0.68, + "step": 3444 + }, + { + "epoch": 0.4232706720727362, + "grad_norm": 1.275580692813145, + "learning_rate": 2.1421594307746062e-05, + "loss": 0.7372, + "step": 3445 + }, + { + "epoch": 0.42339353728959334, + "grad_norm": 1.5429549084875198, + "learning_rate": 2.1415779765487342e-05, + "loss": 0.6353, + "step": 3446 + }, + { + "epoch": 0.42351640250645045, + "grad_norm": 1.179510628987097, + "learning_rate": 2.1409964043087548e-05, + "loss": 0.6266, + "step": 3447 + }, + { + "epoch": 0.4236392677233075, + "grad_norm": 1.5222910131290042, + "learning_rate": 2.140414714161643e-05, + "loss": 0.591, + "step": 3448 + }, + { + "epoch": 0.4237621329401646, + "grad_norm": 1.4094096818381, + "learning_rate": 2.1398329062143982e-05, + "loss": 0.5634, + "step": 3449 + }, + { + "epoch": 0.42388499815702174, + "grad_norm": 1.3062676697169266, + "learning_rate": 2.1392509805740396e-05, + "loss": 0.6144, + "step": 3450 + }, + { + "epoch": 0.42400786337387886, + "grad_norm": 1.1483753458772148, + "learning_rate": 2.138668937347609e-05, + "loss": 0.6418, + "step": 3451 + }, + { + "epoch": 0.424130728590736, + "grad_norm": 1.1712850690879462, + "learning_rate": 2.1380867766421693e-05, + "loss": 0.6378, + "step": 3452 + }, + { + "epoch": 0.4242535938075931, + "grad_norm": 1.1650030137688014, + "learning_rate": 2.137504498564805e-05, + "loss": 0.6171, + "step": 3453 + }, + { + "epoch": 0.4243764590244502, + "grad_norm": 1.5431132340873495, + "learning_rate": 2.136922103222623e-05, + "loss": 0.637, + "step": 3454 + }, + { + "epoch": 0.42449932424130726, + "grad_norm": 1.4291176347121837, + "learning_rate": 2.1363395907227502e-05, + "loss": 0.5367, + "step": 3455 + }, + { + "epoch": 0.4246221894581644, + "grad_norm": 1.2750110421586105, + "learning_rate": 2.1357569611723365e-05, + "loss": 0.5859, + "step": 3456 + }, + { + "epoch": 0.4247450546750215, + "grad_norm": 1.2118502046914408, + "learning_rate": 2.135174214678553e-05, + "loss": 0.5842, + "step": 3457 + }, + { + "epoch": 0.4248679198918786, + "grad_norm": 1.3494988255942384, + "learning_rate": 2.134591351348592e-05, + "loss": 0.5466, + "step": 3458 + }, + { + "epoch": 0.4249907851087357, + "grad_norm": 1.4186921478681067, + "learning_rate": 2.1340083712896674e-05, + "loss": 0.5611, + "step": 3459 + }, + { + "epoch": 0.42511365032559284, + "grad_norm": 1.2900068078744247, + "learning_rate": 2.1334252746090142e-05, + "loss": 
0.544, + "step": 3460 + }, + { + "epoch": 0.42523651554244996, + "grad_norm": 1.1790126200337352, + "learning_rate": 2.1328420614138903e-05, + "loss": 0.628, + "step": 3461 + }, + { + "epoch": 0.425359380759307, + "grad_norm": 1.4201331352323003, + "learning_rate": 2.1322587318115728e-05, + "loss": 0.6863, + "step": 3462 + }, + { + "epoch": 0.42548224597616413, + "grad_norm": 1.3952737159974218, + "learning_rate": 2.131675285909362e-05, + "loss": 0.5533, + "step": 3463 + }, + { + "epoch": 0.42560511119302125, + "grad_norm": 1.3731822515889234, + "learning_rate": 2.1310917238145793e-05, + "loss": 0.5844, + "step": 3464 + }, + { + "epoch": 0.42572797640987836, + "grad_norm": 1.1259352509729847, + "learning_rate": 2.130508045634566e-05, + "loss": 0.5402, + "step": 3465 + }, + { + "epoch": 0.4258508416267355, + "grad_norm": 1.242487156386763, + "learning_rate": 2.1299242514766875e-05, + "loss": 0.7108, + "step": 3466 + }, + { + "epoch": 0.4259737068435926, + "grad_norm": 1.3507853747514502, + "learning_rate": 2.1293403414483277e-05, + "loss": 0.6303, + "step": 3467 + }, + { + "epoch": 0.4260965720604497, + "grad_norm": 1.4693490228998327, + "learning_rate": 2.128756315656894e-05, + "loss": 0.6299, + "step": 3468 + }, + { + "epoch": 0.42621943727730677, + "grad_norm": 1.1937378536426377, + "learning_rate": 2.128172174209813e-05, + "loss": 0.6623, + "step": 3469 + }, + { + "epoch": 0.4263423024941639, + "grad_norm": 1.1289603434924278, + "learning_rate": 2.127587917214535e-05, + "loss": 0.5223, + "step": 3470 + }, + { + "epoch": 0.426465167711021, + "grad_norm": 1.3695411084313474, + "learning_rate": 2.127003544778529e-05, + "loss": 0.6019, + "step": 3471 + }, + { + "epoch": 0.4265880329278781, + "grad_norm": 1.2946048095233342, + "learning_rate": 2.126419057009288e-05, + "loss": 0.7382, + "step": 3472 + }, + { + "epoch": 0.42671089814473523, + "grad_norm": 1.4163518093859047, + "learning_rate": 2.1258344540143234e-05, + "loss": 0.5762, + "step": 3473 + }, + { + "epoch": 0.42683376336159234, + "grad_norm": 1.116491306654738, + "learning_rate": 2.1252497359011698e-05, + "loss": 0.6106, + "step": 3474 + }, + { + "epoch": 0.42695662857844946, + "grad_norm": 1.5549779299348403, + "learning_rate": 2.1246649027773815e-05, + "loss": 0.6282, + "step": 3475 + }, + { + "epoch": 0.4270794937953066, + "grad_norm": 1.1575593920734617, + "learning_rate": 2.1240799547505365e-05, + "loss": 0.5321, + "step": 3476 + }, + { + "epoch": 0.42720235901216363, + "grad_norm": 1.2864142345359124, + "learning_rate": 2.1234948919282303e-05, + "loss": 0.6346, + "step": 3477 + }, + { + "epoch": 0.42732522422902075, + "grad_norm": 1.273332383644958, + "learning_rate": 2.1229097144180832e-05, + "loss": 0.637, + "step": 3478 + }, + { + "epoch": 0.42744808944587787, + "grad_norm": 1.251676796771201, + "learning_rate": 2.122324422327733e-05, + "loss": 0.5353, + "step": 3479 + }, + { + "epoch": 0.427570954662735, + "grad_norm": 1.2701431657948747, + "learning_rate": 2.1217390157648414e-05, + "loss": 0.5508, + "step": 3480 + }, + { + "epoch": 0.4276938198795921, + "grad_norm": 1.3266420778729997, + "learning_rate": 2.1211534948370903e-05, + "loss": 0.6394, + "step": 3481 + }, + { + "epoch": 0.4278166850964492, + "grad_norm": 1.4627882353558082, + "learning_rate": 2.1205678596521817e-05, + "loss": 0.7113, + "step": 3482 + }, + { + "epoch": 0.4279395503133063, + "grad_norm": 1.4261389085707894, + "learning_rate": 2.1199821103178402e-05, + "loss": 0.5973, + "step": 3483 + }, + { + "epoch": 0.4280624155301634, + "grad_norm": 
1.2957586514949393, + "learning_rate": 2.11939624694181e-05, + "loss": 0.5254, + "step": 3484 + }, + { + "epoch": 0.4281852807470205, + "grad_norm": 1.2991919424946516, + "learning_rate": 2.1188102696318573e-05, + "loss": 0.6874, + "step": 3485 + }, + { + "epoch": 0.4283081459638776, + "grad_norm": 1.3683165180127044, + "learning_rate": 2.118224178495768e-05, + "loss": 0.6245, + "step": 3486 + }, + { + "epoch": 0.42843101118073473, + "grad_norm": 1.2315167285915316, + "learning_rate": 2.1176379736413513e-05, + "loss": 0.5204, + "step": 3487 + }, + { + "epoch": 0.42855387639759185, + "grad_norm": 1.3019592707928342, + "learning_rate": 2.1170516551764343e-05, + "loss": 0.5801, + "step": 3488 + }, + { + "epoch": 0.42867674161444896, + "grad_norm": 1.4876652120151799, + "learning_rate": 2.1164652232088674e-05, + "loss": 0.6455, + "step": 3489 + }, + { + "epoch": 0.4287996068313061, + "grad_norm": 1.4653019550760753, + "learning_rate": 2.1158786778465206e-05, + "loss": 0.7316, + "step": 3490 + }, + { + "epoch": 0.42892247204816314, + "grad_norm": 1.3358511966309072, + "learning_rate": 2.1152920191972848e-05, + "loss": 0.6844, + "step": 3491 + }, + { + "epoch": 0.42904533726502025, + "grad_norm": 1.1623128264287934, + "learning_rate": 2.1147052473690726e-05, + "loss": 0.6497, + "step": 3492 + }, + { + "epoch": 0.42916820248187737, + "grad_norm": 1.4629359956351053, + "learning_rate": 2.1141183624698166e-05, + "loss": 0.6236, + "step": 3493 + }, + { + "epoch": 0.4292910676987345, + "grad_norm": 1.4269132231777633, + "learning_rate": 2.1135313646074702e-05, + "loss": 0.6466, + "step": 3494 + }, + { + "epoch": 0.4294139329155916, + "grad_norm": 1.2967068741201084, + "learning_rate": 2.1129442538900087e-05, + "loss": 0.6207, + "step": 3495 + }, + { + "epoch": 0.4295367981324487, + "grad_norm": 1.1081636786839073, + "learning_rate": 2.1123570304254265e-05, + "loss": 0.5955, + "step": 3496 + }, + { + "epoch": 0.42965966334930583, + "grad_norm": 1.2408979060011305, + "learning_rate": 2.11176969432174e-05, + "loss": 0.6014, + "step": 3497 + }, + { + "epoch": 0.42978252856616295, + "grad_norm": 1.4705562773061758, + "learning_rate": 2.1111822456869853e-05, + "loss": 0.5885, + "step": 3498 + }, + { + "epoch": 0.42990539378302, + "grad_norm": 1.1270580161368646, + "learning_rate": 2.1105946846292207e-05, + "loss": 0.6394, + "step": 3499 + }, + { + "epoch": 0.4300282589998771, + "grad_norm": 1.3680656516654517, + "learning_rate": 2.1100070112565237e-05, + "loss": 0.7149, + "step": 3500 + }, + { + "epoch": 0.43015112421673424, + "grad_norm": 1.5119150884567174, + "learning_rate": 2.1094192256769927e-05, + "loss": 0.683, + "step": 3501 + }, + { + "epoch": 0.43027398943359135, + "grad_norm": 1.1882053786060716, + "learning_rate": 2.108831327998747e-05, + "loss": 0.6852, + "step": 3502 + }, + { + "epoch": 0.43039685465044847, + "grad_norm": 1.3467092595731702, + "learning_rate": 2.108243318329928e-05, + "loss": 0.5993, + "step": 3503 + }, + { + "epoch": 0.4305197198673056, + "grad_norm": 1.2252654449193163, + "learning_rate": 2.107655196778694e-05, + "loss": 0.6428, + "step": 3504 + }, + { + "epoch": 0.4306425850841627, + "grad_norm": 1.4843352012868505, + "learning_rate": 2.1070669634532276e-05, + "loss": 0.6756, + "step": 3505 + }, + { + "epoch": 0.43076545030101976, + "grad_norm": 1.228734169144027, + "learning_rate": 2.1064786184617306e-05, + "loss": 0.552, + "step": 3506 + }, + { + "epoch": 0.4308883155178769, + "grad_norm": 1.3860589304386575, + "learning_rate": 2.1058901619124247e-05, + "loss": 0.5642, + 
"step": 3507 + }, + { + "epoch": 0.431011180734734, + "grad_norm": 1.5264744128359216, + "learning_rate": 2.1053015939135533e-05, + "loss": 0.7452, + "step": 3508 + }, + { + "epoch": 0.4311340459515911, + "grad_norm": 1.418921177946135, + "learning_rate": 2.1047129145733787e-05, + "loss": 0.6867, + "step": 3509 + }, + { + "epoch": 0.4312569111684482, + "grad_norm": 1.2808596740830946, + "learning_rate": 2.1041241240001856e-05, + "loss": 0.7363, + "step": 3510 + }, + { + "epoch": 0.43137977638530534, + "grad_norm": 1.2464491864666372, + "learning_rate": 2.1035352223022773e-05, + "loss": 0.7, + "step": 3511 + }, + { + "epoch": 0.43150264160216245, + "grad_norm": 1.4428272361157695, + "learning_rate": 2.1029462095879795e-05, + "loss": 0.5306, + "step": 3512 + }, + { + "epoch": 0.4316255068190195, + "grad_norm": 1.2847832048025258, + "learning_rate": 2.1023570859656358e-05, + "loss": 0.6244, + "step": 3513 + }, + { + "epoch": 0.4317483720358766, + "grad_norm": 1.1897331667663213, + "learning_rate": 2.1017678515436134e-05, + "loss": 0.6292, + "step": 3514 + }, + { + "epoch": 0.43187123725273374, + "grad_norm": 1.2578260351544996, + "learning_rate": 2.1011785064302967e-05, + "loss": 0.5592, + "step": 3515 + }, + { + "epoch": 0.43199410246959086, + "grad_norm": 1.4942458314074962, + "learning_rate": 2.100589050734093e-05, + "loss": 0.6732, + "step": 3516 + }, + { + "epoch": 0.432116967686448, + "grad_norm": 1.2956015986861527, + "learning_rate": 2.0999994845634285e-05, + "loss": 0.6269, + "step": 3517 + }, + { + "epoch": 0.4322398329033051, + "grad_norm": 1.3175340532920685, + "learning_rate": 2.0994098080267496e-05, + "loss": 0.6093, + "step": 3518 + }, + { + "epoch": 0.4323626981201622, + "grad_norm": 1.3453191037459895, + "learning_rate": 2.0988200212325237e-05, + "loss": 0.6281, + "step": 3519 + }, + { + "epoch": 0.43248556333701926, + "grad_norm": 1.1100017930976023, + "learning_rate": 2.0982301242892386e-05, + "loss": 0.5573, + "step": 3520 + }, + { + "epoch": 0.4326084285538764, + "grad_norm": 1.3691318020400036, + "learning_rate": 2.0976401173054016e-05, + "loss": 0.6505, + "step": 3521 + }, + { + "epoch": 0.4327312937707335, + "grad_norm": 1.3761625591021982, + "learning_rate": 2.0970500003895408e-05, + "loss": 0.6689, + "step": 3522 + }, + { + "epoch": 0.4328541589875906, + "grad_norm": 1.2238728358041044, + "learning_rate": 2.0964597736502043e-05, + "loss": 0.4826, + "step": 3523 + }, + { + "epoch": 0.4329770242044477, + "grad_norm": 1.2760978775899323, + "learning_rate": 2.0958694371959614e-05, + "loss": 0.7522, + "step": 3524 + }, + { + "epoch": 0.43309988942130484, + "grad_norm": 1.2735162659376245, + "learning_rate": 2.095278991135399e-05, + "loss": 0.7004, + "step": 3525 + }, + { + "epoch": 0.43322275463816196, + "grad_norm": 1.2865401142458193, + "learning_rate": 2.0946884355771274e-05, + "loss": 0.6095, + "step": 3526 + }, + { + "epoch": 0.43334561985501907, + "grad_norm": 1.4271250460710811, + "learning_rate": 2.0940977706297747e-05, + "loss": 0.6103, + "step": 3527 + }, + { + "epoch": 0.43346848507187613, + "grad_norm": 1.3118349745397995, + "learning_rate": 2.0935069964019897e-05, + "loss": 0.6236, + "step": 3528 + }, + { + "epoch": 0.43359135028873325, + "grad_norm": 1.2641029968273507, + "learning_rate": 2.0929161130024415e-05, + "loss": 0.6141, + "step": 3529 + }, + { + "epoch": 0.43371421550559036, + "grad_norm": 1.1963873737381538, + "learning_rate": 2.0923251205398198e-05, + "loss": 0.5964, + "step": 3530 + }, + { + "epoch": 0.4338370807224475, + "grad_norm": 
1.447872104245257, + "learning_rate": 2.0917340191228337e-05, + "loss": 0.6553, + "step": 3531 + }, + { + "epoch": 0.4339599459393046, + "grad_norm": 1.5704484172987203, + "learning_rate": 2.091142808860212e-05, + "loss": 0.684, + "step": 3532 + }, + { + "epoch": 0.4340828111561617, + "grad_norm": 1.119433282395661, + "learning_rate": 2.0905514898607045e-05, + "loss": 0.5585, + "step": 3533 + }, + { + "epoch": 0.4342056763730188, + "grad_norm": 1.4462673565174435, + "learning_rate": 2.0899600622330802e-05, + "loss": 0.6424, + "step": 3534 + }, + { + "epoch": 0.4343285415898759, + "grad_norm": 1.6351129460811595, + "learning_rate": 2.0893685260861288e-05, + "loss": 0.4973, + "step": 3535 + }, + { + "epoch": 0.434451406806733, + "grad_norm": 1.3040331531009521, + "learning_rate": 2.0887768815286585e-05, + "loss": 0.7803, + "step": 3536 + }, + { + "epoch": 0.4345742720235901, + "grad_norm": 1.4574794729512208, + "learning_rate": 2.0881851286694998e-05, + "loss": 0.592, + "step": 3537 + }, + { + "epoch": 0.43469713724044723, + "grad_norm": 1.2444503311016155, + "learning_rate": 2.0875932676175013e-05, + "loss": 0.6506, + "step": 3538 + }, + { + "epoch": 0.43482000245730434, + "grad_norm": 1.1924768567215682, + "learning_rate": 2.0870012984815312e-05, + "loss": 0.5625, + "step": 3539 + }, + { + "epoch": 0.43494286767416146, + "grad_norm": 1.3777776442672132, + "learning_rate": 2.0864092213704797e-05, + "loss": 0.6166, + "step": 3540 + }, + { + "epoch": 0.4350657328910186, + "grad_norm": 1.3031490973736857, + "learning_rate": 2.0858170363932545e-05, + "loss": 0.5435, + "step": 3541 + }, + { + "epoch": 0.43518859810787563, + "grad_norm": 1.7107670355891171, + "learning_rate": 2.0852247436587847e-05, + "loss": 0.7049, + "step": 3542 + }, + { + "epoch": 0.43531146332473275, + "grad_norm": 1.1516773573194925, + "learning_rate": 2.0846323432760192e-05, + "loss": 0.6324, + "step": 3543 + }, + { + "epoch": 0.43543432854158987, + "grad_norm": 1.152072180773817, + "learning_rate": 2.084039835353925e-05, + "loss": 0.6264, + "step": 3544 + }, + { + "epoch": 0.435557193758447, + "grad_norm": 1.0773374208395947, + "learning_rate": 2.0834472200014906e-05, + "loss": 0.6413, + "step": 3545 + }, + { + "epoch": 0.4356800589753041, + "grad_norm": 1.1382554743279147, + "learning_rate": 2.0828544973277244e-05, + "loss": 0.623, + "step": 3546 + }, + { + "epoch": 0.4358029241921612, + "grad_norm": 1.351263416899528, + "learning_rate": 2.0822616674416533e-05, + "loss": 0.6132, + "step": 3547 + }, + { + "epoch": 0.4359257894090183, + "grad_norm": 1.1337283245204235, + "learning_rate": 2.0816687304523243e-05, + "loss": 0.6389, + "step": 3548 + }, + { + "epoch": 0.4360486546258754, + "grad_norm": 1.386060331131095, + "learning_rate": 2.0810756864688045e-05, + "loss": 0.748, + "step": 3549 + }, + { + "epoch": 0.4361715198427325, + "grad_norm": 1.0210122980684389, + "learning_rate": 2.080482535600181e-05, + "loss": 0.5665, + "step": 3550 + }, + { + "epoch": 0.4362943850595896, + "grad_norm": 1.2743636725868617, + "learning_rate": 2.0798892779555592e-05, + "loss": 0.6544, + "step": 3551 + }, + { + "epoch": 0.43641725027644673, + "grad_norm": 1.2063966035705025, + "learning_rate": 2.079295913644066e-05, + "loss": 0.6131, + "step": 3552 + }, + { + "epoch": 0.43654011549330385, + "grad_norm": 1.209310452380257, + "learning_rate": 2.0787024427748455e-05, + "loss": 0.5675, + "step": 3553 + }, + { + "epoch": 0.43666298071016096, + "grad_norm": 1.2966964900213451, + "learning_rate": 2.078108865457064e-05, + "loss": 0.6714, + 
"step": 3554 + }, + { + "epoch": 0.4367858459270181, + "grad_norm": 1.2008599360903591, + "learning_rate": 2.0775151817999063e-05, + "loss": 0.6704, + "step": 3555 + }, + { + "epoch": 0.4369087111438752, + "grad_norm": 1.0400466061083526, + "learning_rate": 2.0769213919125764e-05, + "loss": 0.5177, + "step": 3556 + }, + { + "epoch": 0.43703157636073225, + "grad_norm": 1.1393015265189632, + "learning_rate": 2.0763274959042972e-05, + "loss": 0.5468, + "step": 3557 + }, + { + "epoch": 0.43715444157758937, + "grad_norm": 1.229536238099014, + "learning_rate": 2.0757334938843135e-05, + "loss": 0.6055, + "step": 3558 + }, + { + "epoch": 0.4372773067944465, + "grad_norm": 1.2007466956340924, + "learning_rate": 2.075139385961886e-05, + "loss": 0.5369, + "step": 3559 + }, + { + "epoch": 0.4374001720113036, + "grad_norm": 1.0828416064117952, + "learning_rate": 2.0745451722462996e-05, + "loss": 0.654, + "step": 3560 + }, + { + "epoch": 0.4375230372281607, + "grad_norm": 1.2401853254845558, + "learning_rate": 2.0739508528468544e-05, + "loss": 0.6407, + "step": 3561 + }, + { + "epoch": 0.43764590244501783, + "grad_norm": 1.3709376423885293, + "learning_rate": 2.0733564278728723e-05, + "loss": 0.647, + "step": 3562 + }, + { + "epoch": 0.43776876766187495, + "grad_norm": 1.3701327279378608, + "learning_rate": 2.072761897433693e-05, + "loss": 0.6621, + "step": 3563 + }, + { + "epoch": 0.437891632878732, + "grad_norm": 1.286140118392058, + "learning_rate": 2.072167261638678e-05, + "loss": 0.5152, + "step": 3564 + }, + { + "epoch": 0.4380144980955891, + "grad_norm": 1.363175860850957, + "learning_rate": 2.0715725205972054e-05, + "loss": 0.5822, + "step": 3565 + }, + { + "epoch": 0.43813736331244624, + "grad_norm": 1.1993207260321368, + "learning_rate": 2.070977674418675e-05, + "loss": 0.6987, + "step": 3566 + }, + { + "epoch": 0.43826022852930335, + "grad_norm": 1.4861955833302396, + "learning_rate": 2.0703827232125033e-05, + "loss": 0.6389, + "step": 3567 + }, + { + "epoch": 0.43838309374616047, + "grad_norm": 1.1792785668228283, + "learning_rate": 2.069787667088129e-05, + "loss": 0.6862, + "step": 3568 + }, + { + "epoch": 0.4385059589630176, + "grad_norm": 1.3568363291150614, + "learning_rate": 2.069192506155009e-05, + "loss": 0.5728, + "step": 3569 + }, + { + "epoch": 0.4386288241798747, + "grad_norm": 1.5954392799221275, + "learning_rate": 2.068597240522618e-05, + "loss": 0.7716, + "step": 3570 + }, + { + "epoch": 0.43875168939673176, + "grad_norm": 1.168970342508959, + "learning_rate": 2.068001870300453e-05, + "loss": 0.5904, + "step": 3571 + }, + { + "epoch": 0.4388745546135889, + "grad_norm": 1.2184237657991792, + "learning_rate": 2.067406395598027e-05, + "loss": 0.5849, + "step": 3572 + }, + { + "epoch": 0.438997419830446, + "grad_norm": 1.2053715822610087, + "learning_rate": 2.0668108165248747e-05, + "loss": 0.5025, + "step": 3573 + }, + { + "epoch": 0.4391202850473031, + "grad_norm": 1.3292318428606023, + "learning_rate": 2.0662151331905486e-05, + "loss": 0.6254, + "step": 3574 + }, + { + "epoch": 0.4392431502641602, + "grad_norm": 1.3748585063043088, + "learning_rate": 2.0656193457046206e-05, + "loss": 0.4897, + "step": 3575 + }, + { + "epoch": 0.43936601548101734, + "grad_norm": 1.3308241304257984, + "learning_rate": 2.065023454176682e-05, + "loss": 0.682, + "step": 3576 + }, + { + "epoch": 0.43948888069787445, + "grad_norm": 1.6031579596144083, + "learning_rate": 2.064427458716344e-05, + "loss": 0.7342, + "step": 3577 + }, + { + "epoch": 0.43961174591473157, + "grad_norm": 1.6034968033814327, + 
"learning_rate": 2.0638313594332344e-05, + "loss": 0.5246, + "step": 3578 + }, + { + "epoch": 0.4397346111315886, + "grad_norm": 1.148235864436196, + "learning_rate": 2.0632351564370035e-05, + "loss": 0.5508, + "step": 3579 + }, + { + "epoch": 0.43985747634844574, + "grad_norm": 1.141131474918344, + "learning_rate": 2.062638849837318e-05, + "loss": 0.6139, + "step": 3580 + }, + { + "epoch": 0.43998034156530286, + "grad_norm": 1.4464687481745717, + "learning_rate": 2.0620424397438646e-05, + "loss": 0.5841, + "step": 3581 + }, + { + "epoch": 0.44010320678216, + "grad_norm": 1.2235385362703148, + "learning_rate": 2.06144592626635e-05, + "loss": 0.6308, + "step": 3582 + }, + { + "epoch": 0.4402260719990171, + "grad_norm": 1.478486063299159, + "learning_rate": 2.060849309514498e-05, + "loss": 0.6587, + "step": 3583 + }, + { + "epoch": 0.4403489372158742, + "grad_norm": 1.5184577482791952, + "learning_rate": 2.0602525895980528e-05, + "loss": 0.6423, + "step": 3584 + }, + { + "epoch": 0.4404718024327313, + "grad_norm": 1.3135871539553015, + "learning_rate": 2.0596557666267776e-05, + "loss": 0.5182, + "step": 3585 + }, + { + "epoch": 0.4405946676495884, + "grad_norm": 1.3000995568174691, + "learning_rate": 2.0590588407104532e-05, + "loss": 0.5481, + "step": 3586 + }, + { + "epoch": 0.4407175328664455, + "grad_norm": 1.2714251339203333, + "learning_rate": 2.0584618119588806e-05, + "loss": 0.5812, + "step": 3587 + }, + { + "epoch": 0.4408403980833026, + "grad_norm": 1.6919445942176656, + "learning_rate": 2.0578646804818793e-05, + "loss": 0.6534, + "step": 3588 + }, + { + "epoch": 0.4409632633001597, + "grad_norm": 1.3104436716938188, + "learning_rate": 2.0572674463892883e-05, + "loss": 0.6328, + "step": 3589 + }, + { + "epoch": 0.44108612851701684, + "grad_norm": 1.302723041980785, + "learning_rate": 2.0566701097909643e-05, + "loss": 0.6413, + "step": 3590 + }, + { + "epoch": 0.44120899373387396, + "grad_norm": 1.2510691571512815, + "learning_rate": 2.0560726707967836e-05, + "loss": 0.5479, + "step": 3591 + }, + { + "epoch": 0.44133185895073107, + "grad_norm": 1.4895279549387337, + "learning_rate": 2.0554751295166412e-05, + "loss": 0.5594, + "step": 3592 + }, + { + "epoch": 0.44145472416758813, + "grad_norm": 1.963769478316416, + "learning_rate": 2.054877486060452e-05, + "loss": 0.6617, + "step": 3593 + }, + { + "epoch": 0.44157758938444525, + "grad_norm": 1.3754972396567011, + "learning_rate": 2.0542797405381476e-05, + "loss": 0.6361, + "step": 3594 + }, + { + "epoch": 0.44170045460130236, + "grad_norm": 1.4466093772937216, + "learning_rate": 2.0536818930596785e-05, + "loss": 0.5859, + "step": 3595 + }, + { + "epoch": 0.4418233198181595, + "grad_norm": 1.4113580549715155, + "learning_rate": 2.053083943735017e-05, + "loss": 0.6837, + "step": 3596 + }, + { + "epoch": 0.4419461850350166, + "grad_norm": 1.3087318655254474, + "learning_rate": 2.0524858926741505e-05, + "loss": 0.4933, + "step": 3597 + }, + { + "epoch": 0.4420690502518737, + "grad_norm": 1.1740542181453728, + "learning_rate": 2.051887739987087e-05, + "loss": 0.6128, + "step": 3598 + }, + { + "epoch": 0.4421919154687308, + "grad_norm": 1.144889549007166, + "learning_rate": 2.0512894857838528e-05, + "loss": 0.5785, + "step": 3599 + }, + { + "epoch": 0.4423147806855879, + "grad_norm": 1.7498999054171522, + "learning_rate": 2.050691130174493e-05, + "loss": 0.6747, + "step": 3600 + }, + { + "epoch": 0.442437645902445, + "grad_norm": 1.417112458018011, + "learning_rate": 2.0500926732690713e-05, + "loss": 0.8139, + "step": 3601 + }, + { + 
"epoch": 0.4425605111193021, + "grad_norm": 1.631690852708493, + "learning_rate": 2.0494941151776698e-05, + "loss": 0.6746, + "step": 3602 + }, + { + "epoch": 0.44268337633615923, + "grad_norm": 1.3054974075286592, + "learning_rate": 2.0488954560103895e-05, + "loss": 0.5305, + "step": 3603 + }, + { + "epoch": 0.44280624155301634, + "grad_norm": 1.2082922427885265, + "learning_rate": 2.0482966958773494e-05, + "loss": 0.7933, + "step": 3604 + }, + { + "epoch": 0.44292910676987346, + "grad_norm": 1.2328659124677859, + "learning_rate": 2.047697834888688e-05, + "loss": 0.6267, + "step": 3605 + }, + { + "epoch": 0.4430519719867306, + "grad_norm": 1.6593836235354964, + "learning_rate": 2.047098873154562e-05, + "loss": 0.6938, + "step": 3606 + }, + { + "epoch": 0.4431748372035877, + "grad_norm": 1.4302238196496895, + "learning_rate": 2.0464998107851464e-05, + "loss": 0.6579, + "step": 3607 + }, + { + "epoch": 0.44329770242044475, + "grad_norm": 1.3055432580985649, + "learning_rate": 2.0459006478906348e-05, + "loss": 0.568, + "step": 3608 + }, + { + "epoch": 0.44342056763730187, + "grad_norm": 1.2139579767226087, + "learning_rate": 2.045301384581239e-05, + "loss": 0.703, + "step": 3609 + }, + { + "epoch": 0.443543432854159, + "grad_norm": 1.5718974968447121, + "learning_rate": 2.0447020209671904e-05, + "loss": 0.6374, + "step": 3610 + }, + { + "epoch": 0.4436662980710161, + "grad_norm": 1.3595250223046766, + "learning_rate": 2.044102557158737e-05, + "loss": 0.534, + "step": 3611 + }, + { + "epoch": 0.4437891632878732, + "grad_norm": 1.4430962889133303, + "learning_rate": 2.0435029932661472e-05, + "loss": 0.6416, + "step": 3612 + }, + { + "epoch": 0.4439120285047303, + "grad_norm": 1.550653633024119, + "learning_rate": 2.0429033293997066e-05, + "loss": 0.6806, + "step": 3613 + }, + { + "epoch": 0.44403489372158744, + "grad_norm": 1.384290570440926, + "learning_rate": 2.042303565669719e-05, + "loss": 0.6624, + "step": 3614 + }, + { + "epoch": 0.4441577589384445, + "grad_norm": 1.2460722997927791, + "learning_rate": 2.0417037021865077e-05, + "loss": 0.6273, + "step": 3615 + }, + { + "epoch": 0.4442806241553016, + "grad_norm": 1.5439007294976186, + "learning_rate": 2.0411037390604134e-05, + "loss": 0.6251, + "step": 3616 + }, + { + "epoch": 0.44440348937215873, + "grad_norm": 1.2217201508346363, + "learning_rate": 2.0405036764017956e-05, + "loss": 0.6874, + "step": 3617 + }, + { + "epoch": 0.44452635458901585, + "grad_norm": 1.4584471212306531, + "learning_rate": 2.0399035143210315e-05, + "loss": 0.6447, + "step": 3618 + }, + { + "epoch": 0.44464921980587296, + "grad_norm": 1.5881319656746442, + "learning_rate": 2.039303252928518e-05, + "loss": 0.6115, + "step": 3619 + }, + { + "epoch": 0.4447720850227301, + "grad_norm": 1.2141820201050342, + "learning_rate": 2.038702892334668e-05, + "loss": 0.6064, + "step": 3620 + }, + { + "epoch": 0.4448949502395872, + "grad_norm": 1.3490284841524718, + "learning_rate": 2.038102432649915e-05, + "loss": 0.7269, + "step": 3621 + }, + { + "epoch": 0.44501781545644425, + "grad_norm": 1.356387828704352, + "learning_rate": 2.0375018739847087e-05, + "loss": 0.5842, + "step": 3622 + }, + { + "epoch": 0.44514068067330137, + "grad_norm": 1.3993363946042585, + "learning_rate": 2.0369012164495195e-05, + "loss": 0.6596, + "step": 3623 + }, + { + "epoch": 0.4452635458901585, + "grad_norm": 1.3262606798213514, + "learning_rate": 2.036300460154832e-05, + "loss": 0.5481, + "step": 3624 + }, + { + "epoch": 0.4453864111070156, + "grad_norm": 1.4058479917869016, + "learning_rate": 
2.035699605211154e-05, + "loss": 0.558, + "step": 3625 + }, + { + "epoch": 0.4455092763238727, + "grad_norm": 1.0222869065140592, + "learning_rate": 2.0350986517290072e-05, + "loss": 0.6424, + "step": 3626 + }, + { + "epoch": 0.44563214154072983, + "grad_norm": 1.35387698569259, + "learning_rate": 2.034497599818934e-05, + "loss": 0.6011, + "step": 3627 + }, + { + "epoch": 0.44575500675758695, + "grad_norm": 1.4201669877050878, + "learning_rate": 2.0338964495914932e-05, + "loss": 0.6691, + "step": 3628 + }, + { + "epoch": 0.44587787197444406, + "grad_norm": 1.253579019338957, + "learning_rate": 2.0332952011572634e-05, + "loss": 0.6333, + "step": 3629 + }, + { + "epoch": 0.4460007371913011, + "grad_norm": 1.6179076771203558, + "learning_rate": 2.0326938546268398e-05, + "loss": 0.644, + "step": 3630 + }, + { + "epoch": 0.44612360240815824, + "grad_norm": 1.4392364954793477, + "learning_rate": 2.0320924101108364e-05, + "loss": 0.659, + "step": 3631 + }, + { + "epoch": 0.44624646762501535, + "grad_norm": 1.2718568319133743, + "learning_rate": 2.0314908677198846e-05, + "loss": 0.7068, + "step": 3632 + }, + { + "epoch": 0.44636933284187247, + "grad_norm": 1.2601608476331965, + "learning_rate": 2.0308892275646343e-05, + "loss": 0.6256, + "step": 3633 + }, + { + "epoch": 0.4464921980587296, + "grad_norm": 1.0918091047581853, + "learning_rate": 2.0302874897557545e-05, + "loss": 0.5978, + "step": 3634 + }, + { + "epoch": 0.4466150632755867, + "grad_norm": 1.278563615500784, + "learning_rate": 2.029685654403929e-05, + "loss": 0.6068, + "step": 3635 + }, + { + "epoch": 0.4467379284924438, + "grad_norm": 1.1908491006723305, + "learning_rate": 2.029083721619863e-05, + "loss": 0.5899, + "step": 3636 + }, + { + "epoch": 0.4468607937093009, + "grad_norm": 1.36335015215458, + "learning_rate": 2.0284816915142775e-05, + "loss": 0.782, + "step": 3637 + }, + { + "epoch": 0.446983658926158, + "grad_norm": 1.1472550301864122, + "learning_rate": 2.027879564197912e-05, + "loss": 0.6343, + "step": 3638 + }, + { + "epoch": 0.4471065241430151, + "grad_norm": 1.3940365937179287, + "learning_rate": 2.0272773397815247e-05, + "loss": 0.7331, + "step": 3639 + }, + { + "epoch": 0.4472293893598722, + "grad_norm": 1.2677159944857712, + "learning_rate": 2.02667501837589e-05, + "loss": 0.6363, + "step": 3640 + }, + { + "epoch": 0.44735225457672934, + "grad_norm": 1.5426917768645334, + "learning_rate": 2.0260726000918006e-05, + "loss": 0.7541, + "step": 3641 + }, + { + "epoch": 0.44747511979358645, + "grad_norm": 1.4850028110107596, + "learning_rate": 2.025470085040069e-05, + "loss": 0.6509, + "step": 3642 + }, + { + "epoch": 0.44759798501044357, + "grad_norm": 1.8031594989339639, + "learning_rate": 2.0248674733315224e-05, + "loss": 0.6243, + "step": 3643 + }, + { + "epoch": 0.4477208502273006, + "grad_norm": 1.1021554617678455, + "learning_rate": 2.0242647650770084e-05, + "loss": 0.5468, + "step": 3644 + }, + { + "epoch": 0.44784371544415774, + "grad_norm": 1.3550325042329932, + "learning_rate": 2.0236619603873905e-05, + "loss": 0.7536, + "step": 3645 + }, + { + "epoch": 0.44796658066101486, + "grad_norm": 1.3232798983527259, + "learning_rate": 2.0230590593735515e-05, + "loss": 0.5793, + "step": 3646 + }, + { + "epoch": 0.448089445877872, + "grad_norm": 1.4676840465092558, + "learning_rate": 2.02245606214639e-05, + "loss": 0.7422, + "step": 3647 + }, + { + "epoch": 0.4482123110947291, + "grad_norm": 1.3887119222202766, + "learning_rate": 2.0218529688168244e-05, + "loss": 0.6347, + "step": 3648 + }, + { + "epoch": 
0.4483351763115862, + "grad_norm": 1.3772483119447994, + "learning_rate": 2.02124977949579e-05, + "loss": 0.5996, + "step": 3649 + }, + { + "epoch": 0.4484580415284433, + "grad_norm": 1.2993344381193968, + "learning_rate": 2.0206464942942388e-05, + "loss": 0.5611, + "step": 3650 + }, + { + "epoch": 0.4485809067453004, + "grad_norm": 1.131376215645332, + "learning_rate": 2.0200431133231414e-05, + "loss": 0.523, + "step": 3651 + }, + { + "epoch": 0.4487037719621575, + "grad_norm": 1.2981366021004856, + "learning_rate": 2.0194396366934863e-05, + "loss": 0.6278, + "step": 3652 + }, + { + "epoch": 0.4488266371790146, + "grad_norm": 1.2301986669159366, + "learning_rate": 2.018836064516278e-05, + "loss": 0.5217, + "step": 3653 + }, + { + "epoch": 0.4489495023958717, + "grad_norm": 1.3841404068176357, + "learning_rate": 2.018232396902541e-05, + "loss": 0.6049, + "step": 3654 + }, + { + "epoch": 0.44907236761272884, + "grad_norm": 1.296572262075572, + "learning_rate": 2.0176286339633148e-05, + "loss": 0.5866, + "step": 3655 + }, + { + "epoch": 0.44919523282958596, + "grad_norm": 1.3856732084259802, + "learning_rate": 2.0170247758096586e-05, + "loss": 0.6829, + "step": 3656 + }, + { + "epoch": 0.44931809804644307, + "grad_norm": 1.353419333497641, + "learning_rate": 2.016420822552648e-05, + "loss": 0.6269, + "step": 3657 + }, + { + "epoch": 0.4494409632633002, + "grad_norm": 1.2953919556798261, + "learning_rate": 2.0158167743033764e-05, + "loss": 0.5509, + "step": 3658 + }, + { + "epoch": 0.44956382848015725, + "grad_norm": 1.345605625985593, + "learning_rate": 2.0152126311729542e-05, + "loss": 0.5877, + "step": 3659 + }, + { + "epoch": 0.44968669369701436, + "grad_norm": 1.2057993768284678, + "learning_rate": 2.0146083932725096e-05, + "loss": 0.6886, + "step": 3660 + }, + { + "epoch": 0.4498095589138715, + "grad_norm": 1.3175371906896025, + "learning_rate": 2.0140040607131888e-05, + "loss": 0.5779, + "step": 3661 + }, + { + "epoch": 0.4499324241307286, + "grad_norm": 1.1957163696985378, + "learning_rate": 2.0133996336061538e-05, + "loss": 0.5579, + "step": 3662 + }, + { + "epoch": 0.4500552893475857, + "grad_norm": 1.3490837194344436, + "learning_rate": 2.0127951120625864e-05, + "loss": 0.5597, + "step": 3663 + }, + { + "epoch": 0.4501781545644428, + "grad_norm": 1.3549790814528637, + "learning_rate": 2.0121904961936835e-05, + "loss": 0.6071, + "step": 3664 + }, + { + "epoch": 0.45030101978129994, + "grad_norm": 1.2987560426529738, + "learning_rate": 2.0115857861106604e-05, + "loss": 0.7209, + "step": 3665 + }, + { + "epoch": 0.450423884998157, + "grad_norm": 1.1021450883703203, + "learning_rate": 2.0109809819247498e-05, + "loss": 0.649, + "step": 3666 + }, + { + "epoch": 0.4505467502150141, + "grad_norm": 1.174820536257423, + "learning_rate": 2.010376083747201e-05, + "loss": 0.7063, + "step": 3667 + }, + { + "epoch": 0.45066961543187123, + "grad_norm": 1.4396062751727705, + "learning_rate": 2.0097710916892823e-05, + "loss": 0.6355, + "step": 3668 + }, + { + "epoch": 0.45079248064872834, + "grad_norm": 1.0816226089212564, + "learning_rate": 2.0091660058622767e-05, + "loss": 0.6383, + "step": 3669 + }, + { + "epoch": 0.45091534586558546, + "grad_norm": 1.296232076205354, + "learning_rate": 2.0085608263774864e-05, + "loss": 0.5588, + "step": 3670 + }, + { + "epoch": 0.4510382110824426, + "grad_norm": 1.5590898714911274, + "learning_rate": 2.0079555533462306e-05, + "loss": 0.6342, + "step": 3671 + }, + { + "epoch": 0.4511610762992997, + "grad_norm": 1.3247259302682552, + "learning_rate": 
2.0073501868798444e-05, + "loss": 0.7013, + "step": 3672 + }, + { + "epoch": 0.45128394151615675, + "grad_norm": 1.2154716104760397, + "learning_rate": 2.0067447270896822e-05, + "loss": 0.5339, + "step": 3673 + }, + { + "epoch": 0.45140680673301387, + "grad_norm": 1.114222000982366, + "learning_rate": 2.0061391740871133e-05, + "loss": 0.6528, + "step": 3674 + }, + { + "epoch": 0.451529671949871, + "grad_norm": 1.15977197514122, + "learning_rate": 2.0055335279835257e-05, + "loss": 0.5696, + "step": 3675 + }, + { + "epoch": 0.4516525371667281, + "grad_norm": 1.2994001558667179, + "learning_rate": 2.0049277888903244e-05, + "loss": 0.5775, + "step": 3676 + }, + { + "epoch": 0.4517754023835852, + "grad_norm": 1.4055986357515828, + "learning_rate": 2.0043219569189312e-05, + "loss": 0.6377, + "step": 3677 + }, + { + "epoch": 0.4518982676004423, + "grad_norm": 1.2861334440730345, + "learning_rate": 2.0037160321807846e-05, + "loss": 0.7786, + "step": 3678 + }, + { + "epoch": 0.45202113281729944, + "grad_norm": 1.1967032964961712, + "learning_rate": 2.00311001478734e-05, + "loss": 0.5507, + "step": 3679 + }, + { + "epoch": 0.45214399803415656, + "grad_norm": 1.5310305814525436, + "learning_rate": 2.0025039048500712e-05, + "loss": 0.5561, + "step": 3680 + }, + { + "epoch": 0.4522668632510136, + "grad_norm": 1.6039050442525151, + "learning_rate": 2.0018977024804682e-05, + "loss": 0.6803, + "step": 3681 + }, + { + "epoch": 0.45238972846787073, + "grad_norm": 1.425255523701518, + "learning_rate": 2.0012914077900374e-05, + "loss": 0.6021, + "step": 3682 + }, + { + "epoch": 0.45251259368472785, + "grad_norm": 1.2639826232576261, + "learning_rate": 2.0006850208903034e-05, + "loss": 0.5973, + "step": 3683 + }, + { + "epoch": 0.45263545890158496, + "grad_norm": 1.1627560978684526, + "learning_rate": 2.000078541892807e-05, + "loss": 0.6714, + "step": 3684 + }, + { + "epoch": 0.4527583241184421, + "grad_norm": 1.154607282256261, + "learning_rate": 1.9994719709091052e-05, + "loss": 0.7082, + "step": 3685 + }, + { + "epoch": 0.4528811893352992, + "grad_norm": 1.2373389347740353, + "learning_rate": 1.9988653080507743e-05, + "loss": 0.6576, + "step": 3686 + }, + { + "epoch": 0.4530040545521563, + "grad_norm": 1.3886700522407687, + "learning_rate": 1.9982585534294054e-05, + "loss": 0.6471, + "step": 3687 + }, + { + "epoch": 0.45312691976901337, + "grad_norm": 1.4109106976038837, + "learning_rate": 1.9976517071566065e-05, + "loss": 0.5998, + "step": 3688 + }, + { + "epoch": 0.4532497849858705, + "grad_norm": 1.4516845833112728, + "learning_rate": 1.9970447693440036e-05, + "loss": 0.599, + "step": 3689 + }, + { + "epoch": 0.4533726502027276, + "grad_norm": 1.3072646731374264, + "learning_rate": 1.9964377401032386e-05, + "loss": 0.6001, + "step": 3690 + }, + { + "epoch": 0.4534955154195847, + "grad_norm": 1.3378503973997127, + "learning_rate": 1.9958306195459708e-05, + "loss": 0.7439, + "step": 3691 + }, + { + "epoch": 0.45361838063644183, + "grad_norm": 1.6228780187193528, + "learning_rate": 1.995223407783877e-05, + "loss": 0.6618, + "step": 3692 + }, + { + "epoch": 0.45374124585329895, + "grad_norm": 1.2390147392240438, + "learning_rate": 1.9946161049286474e-05, + "loss": 0.6153, + "step": 3693 + }, + { + "epoch": 0.45386411107015606, + "grad_norm": 1.3843263909989256, + "learning_rate": 1.994008711091994e-05, + "loss": 0.5373, + "step": 3694 + }, + { + "epoch": 0.4539869762870131, + "grad_norm": 1.2880044681607195, + "learning_rate": 1.9934012263856417e-05, + "loss": 0.5557, + "step": 3695 + }, + { + "epoch": 
0.45410984150387024, + "grad_norm": 1.2750745525386145, + "learning_rate": 1.992793650921334e-05, + "loss": 0.628, + "step": 3696 + }, + { + "epoch": 0.45423270672072735, + "grad_norm": 1.0745362173817081, + "learning_rate": 1.99218598481083e-05, + "loss": 0.6217, + "step": 3697 + }, + { + "epoch": 0.45435557193758447, + "grad_norm": 1.4687755714889328, + "learning_rate": 1.9915782281659052e-05, + "loss": 0.6717, + "step": 3698 + }, + { + "epoch": 0.4544784371544416, + "grad_norm": 1.1911644193582225, + "learning_rate": 1.9909703810983542e-05, + "loss": 0.6516, + "step": 3699 + }, + { + "epoch": 0.4546013023712987, + "grad_norm": 1.3717092150771575, + "learning_rate": 1.9903624437199853e-05, + "loss": 0.6041, + "step": 3700 + }, + { + "epoch": 0.4547241675881558, + "grad_norm": 1.473235670777995, + "learning_rate": 1.9897544161426252e-05, + "loss": 0.7015, + "step": 3701 + }, + { + "epoch": 0.4548470328050129, + "grad_norm": 1.251566173090998, + "learning_rate": 1.9891462984781162e-05, + "loss": 0.6571, + "step": 3702 + }, + { + "epoch": 0.45496989802187, + "grad_norm": 1.1473380659866728, + "learning_rate": 1.988538090838318e-05, + "loss": 0.5992, + "step": 3703 + }, + { + "epoch": 0.4550927632387271, + "grad_norm": 1.3568890183383173, + "learning_rate": 1.987929793335106e-05, + "loss": 0.6078, + "step": 3704 + }, + { + "epoch": 0.4552156284555842, + "grad_norm": 1.198861842845574, + "learning_rate": 1.987321406080373e-05, + "loss": 0.6082, + "step": 3705 + }, + { + "epoch": 0.45533849367244134, + "grad_norm": 1.7384048791408333, + "learning_rate": 1.9867129291860283e-05, + "loss": 0.6645, + "step": 3706 + }, + { + "epoch": 0.45546135888929845, + "grad_norm": 1.3572765356707082, + "learning_rate": 1.986104362763996e-05, + "loss": 0.597, + "step": 3707 + }, + { + "epoch": 0.45558422410615557, + "grad_norm": 1.1978982998013663, + "learning_rate": 1.985495706926219e-05, + "loss": 0.6041, + "step": 3708 + }, + { + "epoch": 0.4557070893230127, + "grad_norm": 1.1341290043769148, + "learning_rate": 1.984886961784655e-05, + "loss": 0.6731, + "step": 3709 + }, + { + "epoch": 0.45582995453986974, + "grad_norm": 1.2886893888930566, + "learning_rate": 1.984278127451279e-05, + "loss": 0.6579, + "step": 3710 + }, + { + "epoch": 0.45595281975672686, + "grad_norm": 1.322680233079386, + "learning_rate": 1.9836692040380826e-05, + "loss": 0.654, + "step": 3711 + }, + { + "epoch": 0.456075684973584, + "grad_norm": 1.4401124143781512, + "learning_rate": 1.9830601916570722e-05, + "loss": 0.5738, + "step": 3712 + }, + { + "epoch": 0.4561985501904411, + "grad_norm": 1.243888099544029, + "learning_rate": 1.9824510904202725e-05, + "loss": 0.6981, + "step": 3713 + }, + { + "epoch": 0.4563214154072982, + "grad_norm": 1.2110228764802953, + "learning_rate": 1.9818419004397234e-05, + "loss": 0.5488, + "step": 3714 + }, + { + "epoch": 0.4564442806241553, + "grad_norm": 1.2592549023493207, + "learning_rate": 1.981232621827482e-05, + "loss": 0.6423, + "step": 3715 + }, + { + "epoch": 0.45656714584101243, + "grad_norm": 1.1448693333504705, + "learning_rate": 1.980623254695621e-05, + "loss": 0.8262, + "step": 3716 + }, + { + "epoch": 0.4566900110578695, + "grad_norm": 1.2140376234421097, + "learning_rate": 1.9800137991562286e-05, + "loss": 0.5382, + "step": 3717 + }, + { + "epoch": 0.4568128762747266, + "grad_norm": 1.2736906900887848, + "learning_rate": 1.9794042553214106e-05, + "loss": 0.6918, + "step": 3718 + }, + { + "epoch": 0.4569357414915837, + "grad_norm": 1.3932240844962591, + "learning_rate": 
1.9787946233032896e-05, + "loss": 0.692, + "step": 3719 + }, + { + "epoch": 0.45705860670844084, + "grad_norm": 1.1606858854489719, + "learning_rate": 1.978184903214002e-05, + "loss": 0.5459, + "step": 3720 + }, + { + "epoch": 0.45718147192529796, + "grad_norm": 1.0359673180926554, + "learning_rate": 1.977575095165703e-05, + "loss": 0.6175, + "step": 3721 + }, + { + "epoch": 0.45730433714215507, + "grad_norm": 1.5578775028924268, + "learning_rate": 1.9769651992705627e-05, + "loss": 0.7043, + "step": 3722 + }, + { + "epoch": 0.4574272023590122, + "grad_norm": 1.4408694250576741, + "learning_rate": 1.9763552156407666e-05, + "loss": 0.7181, + "step": 3723 + }, + { + "epoch": 0.45755006757586925, + "grad_norm": 1.2521246379512494, + "learning_rate": 1.9757451443885184e-05, + "loss": 0.6042, + "step": 3724 + }, + { + "epoch": 0.45767293279272636, + "grad_norm": 1.290472233646275, + "learning_rate": 1.9751349856260357e-05, + "loss": 0.5695, + "step": 3725 + }, + { + "epoch": 0.4577957980095835, + "grad_norm": 1.258548104034713, + "learning_rate": 1.9745247394655544e-05, + "loss": 0.5503, + "step": 3726 + }, + { + "epoch": 0.4579186632264406, + "grad_norm": 1.215852457342528, + "learning_rate": 1.973914406019324e-05, + "loss": 0.5684, + "step": 3727 + }, + { + "epoch": 0.4580415284432977, + "grad_norm": 1.2416447280128335, + "learning_rate": 1.9733039853996126e-05, + "loss": 0.5681, + "step": 3728 + }, + { + "epoch": 0.4581643936601548, + "grad_norm": 1.2747921926988273, + "learning_rate": 1.9726934777187023e-05, + "loss": 0.6212, + "step": 3729 + }, + { + "epoch": 0.45828725887701194, + "grad_norm": 1.3585046595335897, + "learning_rate": 1.9720828830888922e-05, + "loss": 0.6303, + "step": 3730 + }, + { + "epoch": 0.458410124093869, + "grad_norm": 1.61069838460963, + "learning_rate": 1.9714722016224977e-05, + "loss": 0.6449, + "step": 3731 + }, + { + "epoch": 0.4585329893107261, + "grad_norm": 1.5184298098804947, + "learning_rate": 1.970861433431849e-05, + "loss": 0.5386, + "step": 3732 + }, + { + "epoch": 0.45865585452758323, + "grad_norm": 1.1162760646650856, + "learning_rate": 1.970250578629293e-05, + "loss": 0.5641, + "step": 3733 + }, + { + "epoch": 0.45877871974444034, + "grad_norm": 1.098717743414248, + "learning_rate": 1.9696396373271935e-05, + "loss": 0.5954, + "step": 3734 + }, + { + "epoch": 0.45890158496129746, + "grad_norm": 1.3096202519613267, + "learning_rate": 1.9690286096379277e-05, + "loss": 0.5636, + "step": 3735 + }, + { + "epoch": 0.4590244501781546, + "grad_norm": 1.151746113371294, + "learning_rate": 1.9684174956738912e-05, + "loss": 0.6505, + "step": 3736 + }, + { + "epoch": 0.4591473153950117, + "grad_norm": 1.3319707268626126, + "learning_rate": 1.9678062955474943e-05, + "loss": 0.5278, + "step": 3737 + }, + { + "epoch": 0.4592701806118688, + "grad_norm": 1.1796552079905098, + "learning_rate": 1.9671950093711633e-05, + "loss": 0.5871, + "step": 3738 + }, + { + "epoch": 0.45939304582872587, + "grad_norm": 1.561430907926399, + "learning_rate": 1.9665836372573397e-05, + "loss": 0.5791, + "step": 3739 + }, + { + "epoch": 0.459515911045583, + "grad_norm": 1.1960841023618576, + "learning_rate": 1.965972179318482e-05, + "loss": 0.6526, + "step": 3740 + }, + { + "epoch": 0.4596387762624401, + "grad_norm": 1.218789779812076, + "learning_rate": 1.965360635667064e-05, + "loss": 0.5007, + "step": 3741 + }, + { + "epoch": 0.4597616414792972, + "grad_norm": 1.3203182736302792, + "learning_rate": 1.964749006415575e-05, + "loss": 0.5807, + "step": 3742 + }, + { + "epoch": 
0.4598845066961543, + "grad_norm": 1.2717629415883203, + "learning_rate": 1.9641372916765207e-05, + "loss": 0.6308, + "step": 3743 + }, + { + "epoch": 0.46000737191301144, + "grad_norm": 1.281574065848736, + "learning_rate": 1.963525491562421e-05, + "loss": 0.6638, + "step": 3744 + }, + { + "epoch": 0.46013023712986856, + "grad_norm": 1.3397532176789233, + "learning_rate": 1.962913606185814e-05, + "loss": 0.6202, + "step": 3745 + }, + { + "epoch": 0.4602531023467256, + "grad_norm": 1.3580139994482554, + "learning_rate": 1.9623016356592504e-05, + "loss": 0.4735, + "step": 3746 + }, + { + "epoch": 0.46037596756358273, + "grad_norm": 1.1892680773124515, + "learning_rate": 1.9616895800952994e-05, + "loss": 0.5591, + "step": 3747 + }, + { + "epoch": 0.46049883278043985, + "grad_norm": 1.190529255668072, + "learning_rate": 1.961077439606544e-05, + "loss": 0.5957, + "step": 3748 + }, + { + "epoch": 0.46062169799729696, + "grad_norm": 1.2386781273918077, + "learning_rate": 1.9604652143055843e-05, + "loss": 0.6015, + "step": 3749 + }, + { + "epoch": 0.4607445632141541, + "grad_norm": 1.3390762694138658, + "learning_rate": 1.9598529043050343e-05, + "loss": 0.6863, + "step": 3750 + }, + { + "epoch": 0.4608674284310112, + "grad_norm": 1.6510267020480978, + "learning_rate": 1.9592405097175248e-05, + "loss": 0.684, + "step": 3751 + }, + { + "epoch": 0.4609902936478683, + "grad_norm": 1.3381448099549105, + "learning_rate": 1.958628030655702e-05, + "loss": 0.5401, + "step": 3752 + }, + { + "epoch": 0.46111315886472537, + "grad_norm": 1.2803738222763699, + "learning_rate": 1.958015467232227e-05, + "loss": 0.6338, + "step": 3753 + }, + { + "epoch": 0.4612360240815825, + "grad_norm": 1.3839873291308296, + "learning_rate": 1.9574028195597776e-05, + "loss": 0.6562, + "step": 3754 + }, + { + "epoch": 0.4613588892984396, + "grad_norm": 1.2829827792894752, + "learning_rate": 1.9567900877510456e-05, + "loss": 0.6404, + "step": 3755 + }, + { + "epoch": 0.4614817545152967, + "grad_norm": 1.1809218325535302, + "learning_rate": 1.9561772719187394e-05, + "loss": 0.6138, + "step": 3756 + }, + { + "epoch": 0.46160461973215383, + "grad_norm": 1.6879531517714317, + "learning_rate": 1.9555643721755826e-05, + "loss": 0.7663, + "step": 3757 + }, + { + "epoch": 0.46172748494901095, + "grad_norm": 1.149127655615933, + "learning_rate": 1.9549513886343135e-05, + "loss": 0.6626, + "step": 3758 + }, + { + "epoch": 0.46185035016586806, + "grad_norm": 1.2412182580528726, + "learning_rate": 1.9543383214076874e-05, + "loss": 0.5754, + "step": 3759 + }, + { + "epoch": 0.4619732153827252, + "grad_norm": 1.1649984197532444, + "learning_rate": 1.9537251706084733e-05, + "loss": 0.631, + "step": 3760 + }, + { + "epoch": 0.46209608059958224, + "grad_norm": 1.0601781212343289, + "learning_rate": 1.9531119363494566e-05, + "loss": 0.7062, + "step": 3761 + }, + { + "epoch": 0.46221894581643935, + "grad_norm": 1.2157223454954063, + "learning_rate": 1.952498618743438e-05, + "loss": 0.4988, + "step": 3762 + }, + { + "epoch": 0.46234181103329647, + "grad_norm": 1.38230037475248, + "learning_rate": 1.9518852179032325e-05, + "loss": 0.763, + "step": 3763 + }, + { + "epoch": 0.4624646762501536, + "grad_norm": 1.4139891712596242, + "learning_rate": 1.9512717339416724e-05, + "loss": 0.5702, + "step": 3764 + }, + { + "epoch": 0.4625875414670107, + "grad_norm": 1.3227720502833038, + "learning_rate": 1.950658166971603e-05, + "loss": 0.679, + "step": 3765 + }, + { + "epoch": 0.4627104066838678, + "grad_norm": 1.0302683673686026, + "learning_rate": 
1.9500445171058866e-05, + "loss": 0.6118, + "step": 3766 + }, + { + "epoch": 0.46283327190072493, + "grad_norm": 1.2677348840814187, + "learning_rate": 1.9494307844573997e-05, + "loss": 0.6256, + "step": 3767 + }, + { + "epoch": 0.462956137117582, + "grad_norm": 1.2768898060533036, + "learning_rate": 1.9488169691390348e-05, + "loss": 0.5321, + "step": 3768 + }, + { + "epoch": 0.4630790023344391, + "grad_norm": 0.9528117595121683, + "learning_rate": 1.948203071263699e-05, + "loss": 0.5518, + "step": 3769 + }, + { + "epoch": 0.4632018675512962, + "grad_norm": 1.3014750816793834, + "learning_rate": 1.947589090944315e-05, + "loss": 0.5846, + "step": 3770 + }, + { + "epoch": 0.46332473276815334, + "grad_norm": 1.2810220849076575, + "learning_rate": 1.9469750282938208e-05, + "loss": 0.7296, + "step": 3771 + }, + { + "epoch": 0.46344759798501045, + "grad_norm": 1.4634174632782038, + "learning_rate": 1.9463608834251687e-05, + "loss": 0.7016, + "step": 3772 + }, + { + "epoch": 0.46357046320186757, + "grad_norm": 1.352038168508579, + "learning_rate": 1.9457466564513268e-05, + "loss": 0.6406, + "step": 3773 + }, + { + "epoch": 0.4636933284187247, + "grad_norm": 0.9929995844471909, + "learning_rate": 1.945132347485278e-05, + "loss": 0.4639, + "step": 3774 + }, + { + "epoch": 0.46381619363558174, + "grad_norm": 1.4605747791875108, + "learning_rate": 1.9445179566400206e-05, + "loss": 0.6709, + "step": 3775 + }, + { + "epoch": 0.46393905885243886, + "grad_norm": 1.3641331514214992, + "learning_rate": 1.943903484028568e-05, + "loss": 0.5931, + "step": 3776 + }, + { + "epoch": 0.464061924069296, + "grad_norm": 1.3700958863068857, + "learning_rate": 1.9432889297639485e-05, + "loss": 0.7184, + "step": 3777 + }, + { + "epoch": 0.4641847892861531, + "grad_norm": 1.3889907957773826, + "learning_rate": 1.9426742939592052e-05, + "loss": 0.6643, + "step": 3778 + }, + { + "epoch": 0.4643076545030102, + "grad_norm": 1.6335137487396247, + "learning_rate": 1.942059576727396e-05, + "loss": 0.6309, + "step": 3779 + }, + { + "epoch": 0.4644305197198673, + "grad_norm": 1.2657516068331456, + "learning_rate": 1.941444778181595e-05, + "loss": 0.737, + "step": 3780 + }, + { + "epoch": 0.46455338493672443, + "grad_norm": 1.2545993595888654, + "learning_rate": 1.94082989843489e-05, + "loss": 0.4973, + "step": 3781 + }, + { + "epoch": 0.4646762501535815, + "grad_norm": 1.3654930877391755, + "learning_rate": 1.9402149376003837e-05, + "loss": 0.653, + "step": 3782 + }, + { + "epoch": 0.4647991153704386, + "grad_norm": 1.1264959459315358, + "learning_rate": 1.9395998957911945e-05, + "loss": 0.6475, + "step": 3783 + }, + { + "epoch": 0.4649219805872957, + "grad_norm": 1.4074164695526388, + "learning_rate": 1.938984773120455e-05, + "loss": 0.5942, + "step": 3784 + }, + { + "epoch": 0.46504484580415284, + "grad_norm": 1.1488133025699512, + "learning_rate": 1.938369569701314e-05, + "loss": 0.6057, + "step": 3785 + }, + { + "epoch": 0.46516771102100996, + "grad_norm": 1.1493092857007638, + "learning_rate": 1.9377542856469335e-05, + "loss": 0.6902, + "step": 3786 + }, + { + "epoch": 0.46529057623786707, + "grad_norm": 1.700424576299404, + "learning_rate": 1.937138921070491e-05, + "loss": 0.5651, + "step": 3787 + }, + { + "epoch": 0.4654134414547242, + "grad_norm": 1.3052921795858181, + "learning_rate": 1.9365234760851792e-05, + "loss": 0.721, + "step": 3788 + }, + { + "epoch": 0.4655363066715813, + "grad_norm": 1.4643499055674702, + "learning_rate": 1.9359079508042046e-05, + "loss": 0.6033, + "step": 3789 + }, + { + "epoch": 
0.46565917188843836, + "grad_norm": 1.6635211743791694, + "learning_rate": 1.9352923453407896e-05, + "loss": 0.7195, + "step": 3790 + }, + { + "epoch": 0.4657820371052955, + "grad_norm": 1.1950497775249331, + "learning_rate": 1.934676659808171e-05, + "loss": 0.6337, + "step": 3791 + }, + { + "epoch": 0.4659049023221526, + "grad_norm": 1.2240223318366525, + "learning_rate": 1.934060894319599e-05, + "loss": 0.5517, + "step": 3792 + }, + { + "epoch": 0.4660277675390097, + "grad_norm": 1.0876730607570364, + "learning_rate": 1.933445048988341e-05, + "loss": 0.6187, + "step": 3793 + }, + { + "epoch": 0.4661506327558668, + "grad_norm": 1.4896757462740706, + "learning_rate": 1.932829123927677e-05, + "loss": 0.6179, + "step": 3794 + }, + { + "epoch": 0.46627349797272394, + "grad_norm": 1.7634194147032483, + "learning_rate": 1.9322131192509028e-05, + "loss": 0.6759, + "step": 3795 + }, + { + "epoch": 0.46639636318958105, + "grad_norm": 1.2542152950416465, + "learning_rate": 1.9315970350713278e-05, + "loss": 0.6996, + "step": 3796 + }, + { + "epoch": 0.4665192284064381, + "grad_norm": 1.5079041862957505, + "learning_rate": 1.930980871502278e-05, + "loss": 0.6142, + "step": 3797 + }, + { + "epoch": 0.46664209362329523, + "grad_norm": 1.4182529581695562, + "learning_rate": 1.9303646286570913e-05, + "loss": 0.6602, + "step": 3798 + }, + { + "epoch": 0.46676495884015234, + "grad_norm": 1.271319533093495, + "learning_rate": 1.9297483066491222e-05, + "loss": 0.6356, + "step": 3799 + }, + { + "epoch": 0.46688782405700946, + "grad_norm": 1.3221681207996063, + "learning_rate": 1.9291319055917393e-05, + "loss": 0.6204, + "step": 3800 + }, + { + "epoch": 0.4670106892738666, + "grad_norm": 1.1898691118673697, + "learning_rate": 1.9285154255983257e-05, + "loss": 0.6151, + "step": 3801 + }, + { + "epoch": 0.4671335544907237, + "grad_norm": 2.004111280533378, + "learning_rate": 1.927898866782278e-05, + "loss": 0.6515, + "step": 3802 + }, + { + "epoch": 0.4672564197075808, + "grad_norm": 1.3491662009494219, + "learning_rate": 1.9272822292570092e-05, + "loss": 0.6157, + "step": 3803 + }, + { + "epoch": 0.46737928492443787, + "grad_norm": 1.1863502750744326, + "learning_rate": 1.926665513135945e-05, + "loss": 0.5614, + "step": 3804 + }, + { + "epoch": 0.467502150141295, + "grad_norm": 1.2406472768872499, + "learning_rate": 1.9260487185325267e-05, + "loss": 0.6041, + "step": 3805 + }, + { + "epoch": 0.4676250153581521, + "grad_norm": 1.5208541268136644, + "learning_rate": 1.92543184556021e-05, + "loss": 0.602, + "step": 3806 + }, + { + "epoch": 0.4677478805750092, + "grad_norm": 1.653283243747774, + "learning_rate": 1.924814894332464e-05, + "loss": 0.7092, + "step": 3807 + }, + { + "epoch": 0.4678707457918663, + "grad_norm": 1.0472641279347328, + "learning_rate": 1.9241978649627738e-05, + "loss": 0.5967, + "step": 3808 + }, + { + "epoch": 0.46799361100872344, + "grad_norm": 1.396801622017612, + "learning_rate": 1.9235807575646368e-05, + "loss": 0.6205, + "step": 3809 + }, + { + "epoch": 0.46811647622558056, + "grad_norm": 1.4386213988666408, + "learning_rate": 1.9229635722515667e-05, + "loss": 0.7292, + "step": 3810 + }, + { + "epoch": 0.4682393414424377, + "grad_norm": 1.3045528248863723, + "learning_rate": 1.9223463091370903e-05, + "loss": 0.551, + "step": 3811 + }, + { + "epoch": 0.46836220665929473, + "grad_norm": 1.200766541261277, + "learning_rate": 1.9217289683347496e-05, + "loss": 0.6229, + "step": 3812 + }, + { + "epoch": 0.46848507187615185, + "grad_norm": 1.3989599598443423, + "learning_rate": 
1.9211115499580995e-05, + "loss": 0.6407, + "step": 3813 + }, + { + "epoch": 0.46860793709300896, + "grad_norm": 1.1915537422818472, + "learning_rate": 1.9204940541207113e-05, + "loss": 0.5981, + "step": 3814 + }, + { + "epoch": 0.4687308023098661, + "grad_norm": 1.3462727399978962, + "learning_rate": 1.919876480936169e-05, + "loss": 0.6542, + "step": 3815 + }, + { + "epoch": 0.4688536675267232, + "grad_norm": 1.2386780017279388, + "learning_rate": 1.919258830518071e-05, + "loss": 0.5555, + "step": 3816 + }, + { + "epoch": 0.4689765327435803, + "grad_norm": 1.4495370448771092, + "learning_rate": 1.91864110298003e-05, + "loss": 0.8099, + "step": 3817 + }, + { + "epoch": 0.4690993979604374, + "grad_norm": 1.4352042039153425, + "learning_rate": 1.918023298435673e-05, + "loss": 0.6357, + "step": 3818 + }, + { + "epoch": 0.4692222631772945, + "grad_norm": 1.2802503446938998, + "learning_rate": 1.9174054169986415e-05, + "loss": 0.5493, + "step": 3819 + }, + { + "epoch": 0.4693451283941516, + "grad_norm": 1.3373062680926957, + "learning_rate": 1.9167874587825902e-05, + "loss": 0.6193, + "step": 3820 + }, + { + "epoch": 0.4694679936110087, + "grad_norm": 1.360734048908169, + "learning_rate": 1.916169423901189e-05, + "loss": 0.6583, + "step": 3821 + }, + { + "epoch": 0.46959085882786583, + "grad_norm": 1.2438042689672133, + "learning_rate": 1.9155513124681216e-05, + "loss": 0.7309, + "step": 3822 + }, + { + "epoch": 0.46971372404472295, + "grad_norm": 1.1078731485563078, + "learning_rate": 1.9149331245970844e-05, + "loss": 0.5816, + "step": 3823 + }, + { + "epoch": 0.46983658926158006, + "grad_norm": 1.1250145983570483, + "learning_rate": 1.91431486040179e-05, + "loss": 0.5534, + "step": 3824 + }, + { + "epoch": 0.4699594544784372, + "grad_norm": 1.2459069303696255, + "learning_rate": 1.913696519995964e-05, + "loss": 0.6332, + "step": 3825 + }, + { + "epoch": 0.47008231969529424, + "grad_norm": 1.2969302619450613, + "learning_rate": 1.9130781034933463e-05, + "loss": 0.5694, + "step": 3826 + }, + { + "epoch": 0.47020518491215135, + "grad_norm": 1.2350086332234365, + "learning_rate": 1.9124596110076908e-05, + "loss": 0.5915, + "step": 3827 + }, + { + "epoch": 0.47032805012900847, + "grad_norm": 1.4477504729911732, + "learning_rate": 1.911841042652764e-05, + "loss": 0.6547, + "step": 3828 + }, + { + "epoch": 0.4704509153458656, + "grad_norm": 1.4374148151895036, + "learning_rate": 1.911222398542349e-05, + "loss": 0.6946, + "step": 3829 + }, + { + "epoch": 0.4705737805627227, + "grad_norm": 1.1278006503290798, + "learning_rate": 1.91060367879024e-05, + "loss": 0.6139, + "step": 3830 + }, + { + "epoch": 0.4706966457795798, + "grad_norm": 1.1292263964994338, + "learning_rate": 1.9099848835102476e-05, + "loss": 0.607, + "step": 3831 + }, + { + "epoch": 0.47081951099643693, + "grad_norm": 1.306383412549856, + "learning_rate": 1.9093660128161943e-05, + "loss": 0.6231, + "step": 3832 + }, + { + "epoch": 0.470942376213294, + "grad_norm": 1.3271496958927365, + "learning_rate": 1.908747066821918e-05, + "loss": 0.6352, + "step": 3833 + }, + { + "epoch": 0.4710652414301511, + "grad_norm": 1.7351260736050274, + "learning_rate": 1.908128045641269e-05, + "loss": 0.6636, + "step": 3834 + }, + { + "epoch": 0.4711881066470082, + "grad_norm": 1.1554619042014789, + "learning_rate": 1.9075089493881137e-05, + "loss": 0.578, + "step": 3835 + }, + { + "epoch": 0.47131097186386534, + "grad_norm": 1.579208006198566, + "learning_rate": 1.9068897781763294e-05, + "loss": 0.5996, + "step": 3836 + }, + { + "epoch": 
0.47143383708072245, + "grad_norm": 1.1189354842550163, + "learning_rate": 1.9062705321198095e-05, + "loss": 0.7431, + "step": 3837 + }, + { + "epoch": 0.47155670229757957, + "grad_norm": 1.1851703991136269, + "learning_rate": 1.90565121133246e-05, + "loss": 0.5675, + "step": 3838 + }, + { + "epoch": 0.4716795675144367, + "grad_norm": 1.189395990193646, + "learning_rate": 1.905031815928201e-05, + "loss": 0.5666, + "step": 3839 + }, + { + "epoch": 0.4718024327312938, + "grad_norm": 1.1323045826251998, + "learning_rate": 1.9044123460209655e-05, + "loss": 0.5559, + "step": 3840 + }, + { + "epoch": 0.47192529794815086, + "grad_norm": 1.2095688872997774, + "learning_rate": 1.9037928017247023e-05, + "loss": 0.6432, + "step": 3841 + }, + { + "epoch": 0.472048163165008, + "grad_norm": 1.1425337711530468, + "learning_rate": 1.9031731831533716e-05, + "loss": 0.5268, + "step": 3842 + }, + { + "epoch": 0.4721710283818651, + "grad_norm": 1.1967060198454642, + "learning_rate": 1.902553490420949e-05, + "loss": 0.5558, + "step": 3843 + }, + { + "epoch": 0.4722938935987222, + "grad_norm": 5.929724944469359, + "learning_rate": 1.9019337236414218e-05, + "loss": 0.6937, + "step": 3844 + }, + { + "epoch": 0.4724167588155793, + "grad_norm": 1.4035200851847638, + "learning_rate": 1.9013138829287932e-05, + "loss": 0.5517, + "step": 3845 + }, + { + "epoch": 0.47253962403243643, + "grad_norm": 1.9565600568143076, + "learning_rate": 1.900693968397078e-05, + "loss": 0.7654, + "step": 3846 + }, + { + "epoch": 0.47266248924929355, + "grad_norm": 1.215992595253324, + "learning_rate": 1.9000739801603066e-05, + "loss": 0.6724, + "step": 3847 + }, + { + "epoch": 0.4727853544661506, + "grad_norm": 1.3089493401745964, + "learning_rate": 1.8994539183325207e-05, + "loss": 0.5212, + "step": 3848 + }, + { + "epoch": 0.4729082196830077, + "grad_norm": 1.3454363918809649, + "learning_rate": 1.8988337830277772e-05, + "loss": 0.5585, + "step": 3849 + }, + { + "epoch": 0.47303108489986484, + "grad_norm": 2.2578457010290274, + "learning_rate": 1.898213574360146e-05, + "loss": 0.5749, + "step": 3850 + }, + { + "epoch": 0.47315395011672196, + "grad_norm": 1.199849069490536, + "learning_rate": 1.8975932924437098e-05, + "loss": 0.7011, + "step": 3851 + }, + { + "epoch": 0.47327681533357907, + "grad_norm": 1.1679738950183993, + "learning_rate": 1.8969729373925668e-05, + "loss": 0.5525, + "step": 3852 + }, + { + "epoch": 0.4733996805504362, + "grad_norm": 1.2374539239496998, + "learning_rate": 1.896352509320825e-05, + "loss": 0.6659, + "step": 3853 + }, + { + "epoch": 0.4735225457672933, + "grad_norm": 1.0888289679307717, + "learning_rate": 1.8957320083426108e-05, + "loss": 0.7005, + "step": 3854 + }, + { + "epoch": 0.47364541098415036, + "grad_norm": 1.3099238607809705, + "learning_rate": 1.8951114345720598e-05, + "loss": 0.547, + "step": 3855 + }, + { + "epoch": 0.4737682762010075, + "grad_norm": 1.3122879693491132, + "learning_rate": 1.8944907881233225e-05, + "loss": 0.5819, + "step": 3856 + }, + { + "epoch": 0.4738911414178646, + "grad_norm": 1.389670071645873, + "learning_rate": 1.8938700691105632e-05, + "loss": 0.5933, + "step": 3857 + }, + { + "epoch": 0.4740140066347217, + "grad_norm": 1.1342834924475842, + "learning_rate": 1.8932492776479596e-05, + "loss": 0.6408, + "step": 3858 + }, + { + "epoch": 0.4741368718515788, + "grad_norm": 1.183331606933879, + "learning_rate": 1.892628413849701e-05, + "loss": 0.7883, + "step": 3859 + }, + { + "epoch": 0.47425973706843594, + "grad_norm": 1.1461954674078751, + "learning_rate": 
1.892007477829992e-05, + "loss": 0.6126, + "step": 3860 + }, + { + "epoch": 0.47438260228529305, + "grad_norm": 1.0041573143812426, + "learning_rate": 1.8913864697030497e-05, + "loss": 0.5689, + "step": 3861 + }, + { + "epoch": 0.47450546750215017, + "grad_norm": 1.2589390496285016, + "learning_rate": 1.8907653895831047e-05, + "loss": 0.6554, + "step": 3862 + }, + { + "epoch": 0.47462833271900723, + "grad_norm": 1.2688369093449443, + "learning_rate": 1.8901442375844006e-05, + "loss": 0.5972, + "step": 3863 + }, + { + "epoch": 0.47475119793586434, + "grad_norm": 1.1773339844840642, + "learning_rate": 1.8895230138211942e-05, + "loss": 0.7355, + "step": 3864 + }, + { + "epoch": 0.47487406315272146, + "grad_norm": 1.4232526457491783, + "learning_rate": 1.8889017184077554e-05, + "loss": 0.6642, + "step": 3865 + }, + { + "epoch": 0.4749969283695786, + "grad_norm": 1.2387041624722668, + "learning_rate": 1.8882803514583676e-05, + "loss": 0.5916, + "step": 3866 + }, + { + "epoch": 0.4751197935864357, + "grad_norm": 1.2014310627429468, + "learning_rate": 1.8876589130873273e-05, + "loss": 0.6093, + "step": 3867 + }, + { + "epoch": 0.4752426588032928, + "grad_norm": 1.468685839038577, + "learning_rate": 1.8870374034089434e-05, + "loss": 0.6477, + "step": 3868 + }, + { + "epoch": 0.4753655240201499, + "grad_norm": 1.1716285743240171, + "learning_rate": 1.8864158225375403e-05, + "loss": 0.4885, + "step": 3869 + }, + { + "epoch": 0.475488389237007, + "grad_norm": 1.212137782523293, + "learning_rate": 1.8857941705874514e-05, + "loss": 0.6683, + "step": 3870 + }, + { + "epoch": 0.4756112544538641, + "grad_norm": 1.0855627580319434, + "learning_rate": 1.8851724476730275e-05, + "loss": 0.5891, + "step": 3871 + }, + { + "epoch": 0.4757341196707212, + "grad_norm": 0.994494879277994, + "learning_rate": 1.88455065390863e-05, + "loss": 0.6229, + "step": 3872 + }, + { + "epoch": 0.4758569848875783, + "grad_norm": 1.3576949677231227, + "learning_rate": 1.8839287894086334e-05, + "loss": 0.6177, + "step": 3873 + }, + { + "epoch": 0.47597985010443544, + "grad_norm": 1.229682497151213, + "learning_rate": 1.8833068542874258e-05, + "loss": 0.6387, + "step": 3874 + }, + { + "epoch": 0.47610271532129256, + "grad_norm": 1.2766634456115593, + "learning_rate": 1.882684848659408e-05, + "loss": 0.7477, + "step": 3875 + }, + { + "epoch": 0.4762255805381497, + "grad_norm": 1.2280841408300218, + "learning_rate": 1.8820627726389944e-05, + "loss": 0.6304, + "step": 3876 + }, + { + "epoch": 0.47634844575500673, + "grad_norm": 1.1794192925905904, + "learning_rate": 1.8814406263406115e-05, + "loss": 0.662, + "step": 3877 + }, + { + "epoch": 0.47647131097186385, + "grad_norm": 1.2696634327089864, + "learning_rate": 1.880818409878699e-05, + "loss": 0.4927, + "step": 3878 + }, + { + "epoch": 0.47659417618872096, + "grad_norm": 1.2643467182076942, + "learning_rate": 1.8801961233677095e-05, + "loss": 0.7021, + "step": 3879 + }, + { + "epoch": 0.4767170414055781, + "grad_norm": 1.120866202006693, + "learning_rate": 1.879573766922109e-05, + "loss": 0.5069, + "step": 3880 + }, + { + "epoch": 0.4768399066224352, + "grad_norm": 1.238310712323139, + "learning_rate": 1.878951340656376e-05, + "loss": 0.5573, + "step": 3881 + }, + { + "epoch": 0.4769627718392923, + "grad_norm": 1.2055907768366447, + "learning_rate": 1.8783288446850006e-05, + "loss": 0.6603, + "step": 3882 + }, + { + "epoch": 0.4770856370561494, + "grad_norm": 1.2474755951350283, + "learning_rate": 1.8777062791224883e-05, + "loss": 0.6108, + "step": 3883 + }, + { + "epoch": 
0.4772085022730065, + "grad_norm": 1.1493208629770617, + "learning_rate": 1.877083644083356e-05, + "loss": 0.4694, + "step": 3884 + }, + { + "epoch": 0.4773313674898636, + "grad_norm": 1.2759934911300765, + "learning_rate": 1.876460939682132e-05, + "loss": 0.6714, + "step": 3885 + }, + { + "epoch": 0.4774542327067207, + "grad_norm": 1.3292483996972644, + "learning_rate": 1.8758381660333595e-05, + "loss": 0.6958, + "step": 3886 + }, + { + "epoch": 0.47757709792357783, + "grad_norm": 1.3099601239690601, + "learning_rate": 1.8752153232515946e-05, + "loss": 0.5689, + "step": 3887 + }, + { + "epoch": 0.47769996314043495, + "grad_norm": 1.1998058695201876, + "learning_rate": 1.874592411451404e-05, + "loss": 0.5783, + "step": 3888 + }, + { + "epoch": 0.47782282835729206, + "grad_norm": 1.221999083778615, + "learning_rate": 1.873969430747368e-05, + "loss": 0.5579, + "step": 3889 + }, + { + "epoch": 0.4779456935741492, + "grad_norm": 1.425148751396884, + "learning_rate": 1.8733463812540812e-05, + "loss": 0.711, + "step": 3890 + }, + { + "epoch": 0.4780685587910063, + "grad_norm": 1.157917943643076, + "learning_rate": 1.8727232630861483e-05, + "loss": 0.526, + "step": 3891 + }, + { + "epoch": 0.47819142400786335, + "grad_norm": 1.2431070054618614, + "learning_rate": 1.8721000763581888e-05, + "loss": 0.6659, + "step": 3892 + }, + { + "epoch": 0.47831428922472047, + "grad_norm": 1.2273812670605957, + "learning_rate": 1.8714768211848336e-05, + "loss": 0.6081, + "step": 3893 + }, + { + "epoch": 0.4784371544415776, + "grad_norm": 1.2195187676532657, + "learning_rate": 1.870853497680726e-05, + "loss": 0.5924, + "step": 3894 + }, + { + "epoch": 0.4785600196584347, + "grad_norm": 1.1930906241096562, + "learning_rate": 1.8702301059605226e-05, + "loss": 0.5347, + "step": 3895 + }, + { + "epoch": 0.4786828848752918, + "grad_norm": 1.4670961216467175, + "learning_rate": 1.869606646138892e-05, + "loss": 0.6944, + "step": 3896 + }, + { + "epoch": 0.47880575009214893, + "grad_norm": 1.2560203286670037, + "learning_rate": 1.8689831183305157e-05, + "loss": 0.5865, + "step": 3897 + }, + { + "epoch": 0.47892861530900604, + "grad_norm": 1.1317849276282275, + "learning_rate": 1.8683595226500884e-05, + "loss": 0.6399, + "step": 3898 + }, + { + "epoch": 0.4790514805258631, + "grad_norm": 1.334795555914241, + "learning_rate": 1.867735859212315e-05, + "loss": 0.6431, + "step": 3899 + }, + { + "epoch": 0.4791743457427202, + "grad_norm": 1.2379064088776899, + "learning_rate": 1.8671121281319156e-05, + "loss": 0.5823, + "step": 3900 + }, + { + "epoch": 0.47929721095957734, + "grad_norm": 1.2052324367348517, + "learning_rate": 1.866488329523621e-05, + "loss": 0.6312, + "step": 3901 + }, + { + "epoch": 0.47942007617643445, + "grad_norm": 1.4151659137131156, + "learning_rate": 1.865864463502175e-05, + "loss": 0.6978, + "step": 3902 + }, + { + "epoch": 0.47954294139329157, + "grad_norm": 1.0919430001731778, + "learning_rate": 1.8652405301823333e-05, + "loss": 0.5815, + "step": 3903 + }, + { + "epoch": 0.4796658066101487, + "grad_norm": 1.1501855124572116, + "learning_rate": 1.8646165296788654e-05, + "loss": 0.5219, + "step": 3904 + }, + { + "epoch": 0.4797886718270058, + "grad_norm": 1.2920686359944922, + "learning_rate": 1.863992462106551e-05, + "loss": 0.561, + "step": 3905 + }, + { + "epoch": 0.47991153704386286, + "grad_norm": 1.3054348030246359, + "learning_rate": 1.863368327580184e-05, + "loss": 0.672, + "step": 3906 + }, + { + "epoch": 0.48003440226072, + "grad_norm": 1.3704183117422477, + "learning_rate": 
1.8627441262145692e-05, + "loss": 0.5322, + "step": 3907 + }, + { + "epoch": 0.4801572674775771, + "grad_norm": 1.32338813938963, + "learning_rate": 1.8621198581245255e-05, + "loss": 0.5301, + "step": 3908 + }, + { + "epoch": 0.4802801326944342, + "grad_norm": 1.393926224069048, + "learning_rate": 1.8614955234248816e-05, + "loss": 0.6182, + "step": 3909 + }, + { + "epoch": 0.4804029979112913, + "grad_norm": 1.232028524975572, + "learning_rate": 1.8608711222304814e-05, + "loss": 0.6223, + "step": 3910 + }, + { + "epoch": 0.48052586312814843, + "grad_norm": 1.3480236031868313, + "learning_rate": 1.8602466546561776e-05, + "loss": 0.5617, + "step": 3911 + }, + { + "epoch": 0.48064872834500555, + "grad_norm": 1.402248561510517, + "learning_rate": 1.859622120816839e-05, + "loss": 0.6883, + "step": 3912 + }, + { + "epoch": 0.4807715935618626, + "grad_norm": 1.4119192842469264, + "learning_rate": 1.858997520827343e-05, + "loss": 0.5531, + "step": 3913 + }, + { + "epoch": 0.4808944587787197, + "grad_norm": 1.3824768791455402, + "learning_rate": 1.858372854802581e-05, + "loss": 0.6134, + "step": 3914 + }, + { + "epoch": 0.48101732399557684, + "grad_norm": 1.5161934896528293, + "learning_rate": 1.857748122857457e-05, + "loss": 0.5843, + "step": 3915 + }, + { + "epoch": 0.48114018921243396, + "grad_norm": 1.265571303039805, + "learning_rate": 1.8571233251068853e-05, + "loss": 0.686, + "step": 3916 + }, + { + "epoch": 0.48126305442929107, + "grad_norm": 1.6204475975658281, + "learning_rate": 1.856498461665795e-05, + "loss": 0.7124, + "step": 3917 + }, + { + "epoch": 0.4813859196461482, + "grad_norm": 1.1998188612031968, + "learning_rate": 1.8558735326491233e-05, + "loss": 0.5955, + "step": 3918 + }, + { + "epoch": 0.4815087848630053, + "grad_norm": 1.3732887495346038, + "learning_rate": 1.855248538171824e-05, + "loss": 0.5736, + "step": 3919 + }, + { + "epoch": 0.4816316500798624, + "grad_norm": 1.1315451835794066, + "learning_rate": 1.85462347834886e-05, + "loss": 0.6889, + "step": 3920 + }, + { + "epoch": 0.4817545152967195, + "grad_norm": 1.706709807023273, + "learning_rate": 1.8539983532952065e-05, + "loss": 0.5909, + "step": 3921 + }, + { + "epoch": 0.4818773805135766, + "grad_norm": 1.3553359706327686, + "learning_rate": 1.853373163125852e-05, + "loss": 0.5591, + "step": 3922 + }, + { + "epoch": 0.4820002457304337, + "grad_norm": 1.4481804493459811, + "learning_rate": 1.852747907955796e-05, + "loss": 0.6079, + "step": 3923 + }, + { + "epoch": 0.4821231109472908, + "grad_norm": 1.3157924787753428, + "learning_rate": 1.8521225879000496e-05, + "loss": 0.6669, + "step": 3924 + }, + { + "epoch": 0.48224597616414794, + "grad_norm": 1.1915106817460905, + "learning_rate": 1.851497203073637e-05, + "loss": 0.6423, + "step": 3925 + }, + { + "epoch": 0.48236884138100505, + "grad_norm": 1.1908432484732963, + "learning_rate": 1.850871753591593e-05, + "loss": 0.6364, + "step": 3926 + }, + { + "epoch": 0.48249170659786217, + "grad_norm": 1.5026907797533768, + "learning_rate": 1.8502462395689663e-05, + "loss": 0.7041, + "step": 3927 + }, + { + "epoch": 0.48261457181471923, + "grad_norm": 1.4761198190557943, + "learning_rate": 1.8496206611208144e-05, + "loss": 0.5851, + "step": 3928 + }, + { + "epoch": 0.48273743703157634, + "grad_norm": 1.1941401008463572, + "learning_rate": 1.8489950183622097e-05, + "loss": 0.5064, + "step": 3929 + }, + { + "epoch": 0.48286030224843346, + "grad_norm": 1.3488540416466381, + "learning_rate": 1.8483693114082346e-05, + "loss": 0.6037, + "step": 3930 + }, + { + "epoch": 
0.4829831674652906, + "grad_norm": 1.3235984888584975, + "learning_rate": 1.847743540373984e-05, + "loss": 0.5638, + "step": 3931 + }, + { + "epoch": 0.4831060326821477, + "grad_norm": 1.1979134729963379, + "learning_rate": 1.8471177053745644e-05, + "loss": 0.6182, + "step": 3932 + }, + { + "epoch": 0.4832288978990048, + "grad_norm": 1.0980091273154853, + "learning_rate": 1.8464918065250935e-05, + "loss": 0.5257, + "step": 3933 + }, + { + "epoch": 0.4833517631158619, + "grad_norm": 1.5124262816052743, + "learning_rate": 1.8458658439407024e-05, + "loss": 0.6918, + "step": 3934 + }, + { + "epoch": 0.483474628332719, + "grad_norm": 1.1759437393722867, + "learning_rate": 1.845239817736532e-05, + "loss": 0.6792, + "step": 3935 + }, + { + "epoch": 0.4835974935495761, + "grad_norm": 1.4256831239603953, + "learning_rate": 1.8446137280277362e-05, + "loss": 0.5355, + "step": 3936 + }, + { + "epoch": 0.4837203587664332, + "grad_norm": 1.1390764772394804, + "learning_rate": 1.84398757492948e-05, + "loss": 0.726, + "step": 3937 + }, + { + "epoch": 0.4838432239832903, + "grad_norm": 1.2160134173766664, + "learning_rate": 1.8433613585569406e-05, + "loss": 0.6016, + "step": 3938 + }, + { + "epoch": 0.48396608920014744, + "grad_norm": 1.3570052914943052, + "learning_rate": 1.8427350790253055e-05, + "loss": 0.5868, + "step": 3939 + }, + { + "epoch": 0.48408895441700456, + "grad_norm": 1.1710670953754942, + "learning_rate": 1.8421087364497756e-05, + "loss": 0.5463, + "step": 3940 + }, + { + "epoch": 0.4842118196338617, + "grad_norm": 1.329549011304685, + "learning_rate": 1.8414823309455625e-05, + "loss": 0.6259, + "step": 3941 + }, + { + "epoch": 0.4843346848507188, + "grad_norm": 1.1701940018542463, + "learning_rate": 1.8408558626278892e-05, + "loss": 0.6886, + "step": 3942 + }, + { + "epoch": 0.48445755006757585, + "grad_norm": 1.4394459639078374, + "learning_rate": 1.84022933161199e-05, + "loss": 0.5924, + "step": 3943 + }, + { + "epoch": 0.48458041528443296, + "grad_norm": 1.2501543049753259, + "learning_rate": 1.8396027380131123e-05, + "loss": 0.7162, + "step": 3944 + }, + { + "epoch": 0.4847032805012901, + "grad_norm": 1.423710296190459, + "learning_rate": 1.838976081946513e-05, + "loss": 0.7092, + "step": 3945 + }, + { + "epoch": 0.4848261457181472, + "grad_norm": 1.4242589093647058, + "learning_rate": 1.8383493635274618e-05, + "loss": 0.665, + "step": 3946 + }, + { + "epoch": 0.4849490109350043, + "grad_norm": 1.852403357008487, + "learning_rate": 1.8377225828712393e-05, + "loss": 0.7168, + "step": 3947 + }, + { + "epoch": 0.4850718761518614, + "grad_norm": 1.4061737541307144, + "learning_rate": 1.8370957400931383e-05, + "loss": 0.6926, + "step": 3948 + }, + { + "epoch": 0.48519474136871854, + "grad_norm": 1.2899695409584422, + "learning_rate": 1.8364688353084614e-05, + "loss": 0.7261, + "step": 3949 + }, + { + "epoch": 0.4853176065855756, + "grad_norm": 1.3927430913034793, + "learning_rate": 1.835841868632525e-05, + "loss": 0.619, + "step": 3950 + }, + { + "epoch": 0.4854404718024327, + "grad_norm": 1.31743166084498, + "learning_rate": 1.8352148401806546e-05, + "loss": 0.5976, + "step": 3951 + }, + { + "epoch": 0.48556333701928983, + "grad_norm": 1.6535920027805293, + "learning_rate": 1.8345877500681887e-05, + "loss": 0.678, + "step": 3952 + }, + { + "epoch": 0.48568620223614695, + "grad_norm": 1.2331135111180023, + "learning_rate": 1.8339605984104755e-05, + "loss": 0.7656, + "step": 3953 + }, + { + "epoch": 0.48580906745300406, + "grad_norm": 1.351550056655753, + "learning_rate": 
1.833333385322876e-05, + "loss": 0.5604, + "step": 3954 + }, + { + "epoch": 0.4859319326698612, + "grad_norm": 1.2685867639261659, + "learning_rate": 1.8327061109207622e-05, + "loss": 0.5686, + "step": 3955 + }, + { + "epoch": 0.4860547978867183, + "grad_norm": 1.3979412470588533, + "learning_rate": 1.8320787753195168e-05, + "loss": 0.525, + "step": 3956 + }, + { + "epoch": 0.48617766310357535, + "grad_norm": 1.2735549819428555, + "learning_rate": 1.8314513786345345e-05, + "loss": 0.6477, + "step": 3957 + }, + { + "epoch": 0.48630052832043247, + "grad_norm": 1.3398615456611416, + "learning_rate": 1.8308239209812204e-05, + "loss": 0.4973, + "step": 3958 + }, + { + "epoch": 0.4864233935372896, + "grad_norm": 1.5892238156837684, + "learning_rate": 1.8301964024749917e-05, + "loss": 0.6681, + "step": 3959 + }, + { + "epoch": 0.4865462587541467, + "grad_norm": 1.264851310366645, + "learning_rate": 1.8295688232312764e-05, + "loss": 0.7201, + "step": 3960 + }, + { + "epoch": 0.4866691239710038, + "grad_norm": 1.2717837861154042, + "learning_rate": 1.8289411833655134e-05, + "loss": 0.5433, + "step": 3961 + }, + { + "epoch": 0.48679198918786093, + "grad_norm": 1.3604225561234031, + "learning_rate": 1.8283134829931526e-05, + "loss": 0.6688, + "step": 3962 + }, + { + "epoch": 0.48691485440471804, + "grad_norm": 1.2857383491276482, + "learning_rate": 1.827685722229656e-05, + "loss": 0.6983, + "step": 3963 + }, + { + "epoch": 0.4870377196215751, + "grad_norm": 1.1498216041913718, + "learning_rate": 1.8270579011904957e-05, + "loss": 0.6248, + "step": 3964 + }, + { + "epoch": 0.4871605848384322, + "grad_norm": 1.2424510714226107, + "learning_rate": 1.8264300199911557e-05, + "loss": 0.6813, + "step": 3965 + }, + { + "epoch": 0.48728345005528934, + "grad_norm": 1.3468098537277904, + "learning_rate": 1.8258020787471307e-05, + "loss": 0.5722, + "step": 3966 + }, + { + "epoch": 0.48740631527214645, + "grad_norm": 1.1878259532198152, + "learning_rate": 1.8251740775739258e-05, + "loss": 0.644, + "step": 3967 + }, + { + "epoch": 0.48752918048900357, + "grad_norm": 1.1792922568071738, + "learning_rate": 1.824546016587058e-05, + "loss": 0.5821, + "step": 3968 + }, + { + "epoch": 0.4876520457058607, + "grad_norm": 1.3019137287072324, + "learning_rate": 1.823917895902056e-05, + "loss": 0.586, + "step": 3969 + }, + { + "epoch": 0.4877749109227178, + "grad_norm": 1.6163375612651039, + "learning_rate": 1.8232897156344574e-05, + "loss": 0.628, + "step": 3970 + }, + { + "epoch": 0.4878977761395749, + "grad_norm": 1.160085485938795, + "learning_rate": 1.822661475899812e-05, + "loss": 0.5845, + "step": 3971 + }, + { + "epoch": 0.488020641356432, + "grad_norm": 1.072315424761548, + "learning_rate": 1.8220331768136806e-05, + "loss": 0.5189, + "step": 3972 + }, + { + "epoch": 0.4881435065732891, + "grad_norm": 1.3395919184146052, + "learning_rate": 1.821404818491635e-05, + "loss": 0.5671, + "step": 3973 + }, + { + "epoch": 0.4882663717901462, + "grad_norm": 1.170367249248654, + "learning_rate": 1.820776401049257e-05, + "loss": 0.6395, + "step": 3974 + }, + { + "epoch": 0.4883892370070033, + "grad_norm": 1.1062076119527504, + "learning_rate": 1.8201479246021405e-05, + "loss": 0.6731, + "step": 3975 + }, + { + "epoch": 0.48851210222386043, + "grad_norm": 1.330336174215417, + "learning_rate": 1.81951938926589e-05, + "loss": 0.7377, + "step": 3976 + }, + { + "epoch": 0.48863496744071755, + "grad_norm": 1.1693819482673609, + "learning_rate": 1.8188907951561194e-05, + "loss": 0.7323, + "step": 3977 + }, + { + "epoch": 
0.48875783265757466, + "grad_norm": 1.3984572245048115, + "learning_rate": 1.8182621423884555e-05, + "loss": 0.7069, + "step": 3978 + }, + { + "epoch": 0.4888806978744317, + "grad_norm": 1.352842834859655, + "learning_rate": 1.8176334310785344e-05, + "loss": 0.5823, + "step": 3979 + }, + { + "epoch": 0.48900356309128884, + "grad_norm": 1.4084505361855988, + "learning_rate": 1.8170046613420037e-05, + "loss": 0.6394, + "step": 3980 + }, + { + "epoch": 0.48912642830814596, + "grad_norm": 1.2910782248246004, + "learning_rate": 1.8163758332945215e-05, + "loss": 0.595, + "step": 3981 + }, + { + "epoch": 0.48924929352500307, + "grad_norm": 1.2007454586100965, + "learning_rate": 1.815746947051756e-05, + "loss": 0.5586, + "step": 3982 + }, + { + "epoch": 0.4893721587418602, + "grad_norm": 1.1779389429707823, + "learning_rate": 1.8151180027293877e-05, + "loss": 0.5531, + "step": 3983 + }, + { + "epoch": 0.4894950239587173, + "grad_norm": 1.4185078098441628, + "learning_rate": 1.8144890004431066e-05, + "loss": 0.75, + "step": 3984 + }, + { + "epoch": 0.4896178891755744, + "grad_norm": 1.2985140851080599, + "learning_rate": 1.8138599403086127e-05, + "loss": 0.7059, + "step": 3985 + }, + { + "epoch": 0.4897407543924315, + "grad_norm": 1.2304544813724767, + "learning_rate": 1.8132308224416186e-05, + "loss": 0.6887, + "step": 3986 + }, + { + "epoch": 0.4898636196092886, + "grad_norm": 1.6281528188690844, + "learning_rate": 1.812601646957846e-05, + "loss": 0.675, + "step": 3987 + }, + { + "epoch": 0.4899864848261457, + "grad_norm": 1.3120905238535745, + "learning_rate": 1.811972413973028e-05, + "loss": 0.6878, + "step": 3988 + }, + { + "epoch": 0.4901093500430028, + "grad_norm": 1.3966419903641167, + "learning_rate": 1.8113431236029078e-05, + "loss": 0.4754, + "step": 3989 + }, + { + "epoch": 0.49023221525985994, + "grad_norm": 1.2110139366546557, + "learning_rate": 1.8107137759632387e-05, + "loss": 0.6193, + "step": 3990 + }, + { + "epoch": 0.49035508047671705, + "grad_norm": 1.1034041379579198, + "learning_rate": 1.8100843711697854e-05, + "loss": 0.5114, + "step": 3991 + }, + { + "epoch": 0.49047794569357417, + "grad_norm": 1.269176677360592, + "learning_rate": 1.8094549093383236e-05, + "loss": 0.6554, + "step": 3992 + }, + { + "epoch": 0.4906008109104313, + "grad_norm": 1.1745252298478908, + "learning_rate": 1.8088253905846377e-05, + "loss": 0.5559, + "step": 3993 + }, + { + "epoch": 0.49072367612728834, + "grad_norm": 1.2569470894206287, + "learning_rate": 1.8081958150245243e-05, + "loss": 0.6458, + "step": 3994 + }, + { + "epoch": 0.49084654134414546, + "grad_norm": 1.1071051545350132, + "learning_rate": 1.807566182773789e-05, + "loss": 0.6753, + "step": 3995 + }, + { + "epoch": 0.4909694065610026, + "grad_norm": 1.485515065482353, + "learning_rate": 1.8069364939482496e-05, + "loss": 0.8065, + "step": 3996 + }, + { + "epoch": 0.4910922717778597, + "grad_norm": 1.2719664974343001, + "learning_rate": 1.8063067486637324e-05, + "loss": 0.6759, + "step": 3997 + }, + { + "epoch": 0.4912151369947168, + "grad_norm": 1.1012949690966631, + "learning_rate": 1.8056769470360748e-05, + "loss": 0.6881, + "step": 3998 + }, + { + "epoch": 0.4913380022115739, + "grad_norm": 1.4116783784253788, + "learning_rate": 1.8050470891811257e-05, + "loss": 0.5498, + "step": 3999 + }, + { + "epoch": 0.49146086742843104, + "grad_norm": 1.1771107035530757, + "learning_rate": 1.804417175214743e-05, + "loss": 0.6001, + "step": 4000 + }, + { + "epoch": 0.4915837326452881, + "grad_norm": 1.323488423482268, + "learning_rate": 
1.8037872052527948e-05, + "loss": 0.5352, + "step": 4001 + }, + { + "epoch": 0.4917065978621452, + "grad_norm": 1.081515157418436, + "learning_rate": 1.8031571794111602e-05, + "loss": 0.6692, + "step": 4002 + }, + { + "epoch": 0.4918294630790023, + "grad_norm": 1.178207696312425, + "learning_rate": 1.8025270978057285e-05, + "loss": 0.5448, + "step": 4003 + }, + { + "epoch": 0.49195232829585944, + "grad_norm": 1.312095966470591, + "learning_rate": 1.8018969605523996e-05, + "loss": 0.6346, + "step": 4004 + }, + { + "epoch": 0.49207519351271656, + "grad_norm": 1.2656166605227273, + "learning_rate": 1.8012667677670825e-05, + "loss": 0.5655, + "step": 4005 + }, + { + "epoch": 0.4921980587295737, + "grad_norm": 1.4401440523876121, + "learning_rate": 1.8006365195656972e-05, + "loss": 0.5488, + "step": 4006 + }, + { + "epoch": 0.4923209239464308, + "grad_norm": 1.0550221865150673, + "learning_rate": 1.8000062160641737e-05, + "loss": 0.5652, + "step": 4007 + }, + { + "epoch": 0.49244378916328785, + "grad_norm": 1.1665649908073752, + "learning_rate": 1.7993758573784525e-05, + "loss": 0.61, + "step": 4008 + }, + { + "epoch": 0.49256665438014496, + "grad_norm": 1.3664517998404058, + "learning_rate": 1.798745443624484e-05, + "loss": 0.5461, + "step": 4009 + }, + { + "epoch": 0.4926895195970021, + "grad_norm": 1.2538349216063378, + "learning_rate": 1.798114974918228e-05, + "loss": 0.6501, + "step": 4010 + }, + { + "epoch": 0.4928123848138592, + "grad_norm": 1.3750056395116717, + "learning_rate": 1.797484451375656e-05, + "loss": 0.5696, + "step": 4011 + }, + { + "epoch": 0.4929352500307163, + "grad_norm": 1.183800803828898, + "learning_rate": 1.7968538731127486e-05, + "loss": 0.6432, + "step": 4012 + }, + { + "epoch": 0.4930581152475734, + "grad_norm": 1.0995790839787227, + "learning_rate": 1.7962232402454965e-05, + "loss": 0.7439, + "step": 4013 + }, + { + "epoch": 0.49318098046443054, + "grad_norm": 1.3459653532812588, + "learning_rate": 1.7955925528898997e-05, + "loss": 0.6072, + "step": 4014 + }, + { + "epoch": 0.4933038456812876, + "grad_norm": 1.2564983489589492, + "learning_rate": 1.7949618111619706e-05, + "loss": 0.6304, + "step": 4015 + }, + { + "epoch": 0.4934267108981447, + "grad_norm": 1.2370714810031957, + "learning_rate": 1.794331015177729e-05, + "loss": 0.5368, + "step": 4016 + }, + { + "epoch": 0.49354957611500183, + "grad_norm": 1.1620414890413449, + "learning_rate": 1.793700165053206e-05, + "loss": 0.7101, + "step": 4017 + }, + { + "epoch": 0.49367244133185895, + "grad_norm": 1.285144200121842, + "learning_rate": 1.793069260904442e-05, + "loss": 0.6257, + "step": 4018 + }, + { + "epoch": 0.49379530654871606, + "grad_norm": 1.284579315241585, + "learning_rate": 1.7924383028474884e-05, + "loss": 0.6212, + "step": 4019 + }, + { + "epoch": 0.4939181717655732, + "grad_norm": 1.2297687551014476, + "learning_rate": 1.7918072909984057e-05, + "loss": 0.6584, + "step": 4020 + }, + { + "epoch": 0.4940410369824303, + "grad_norm": 1.6336034810353335, + "learning_rate": 1.7911762254732636e-05, + "loss": 0.5272, + "step": 4021 + }, + { + "epoch": 0.4941639021992874, + "grad_norm": 1.1735820693197858, + "learning_rate": 1.7905451063881435e-05, + "loss": 0.5608, + "step": 4022 + }, + { + "epoch": 0.49428676741614447, + "grad_norm": 1.4571359777539556, + "learning_rate": 1.7899139338591354e-05, + "loss": 0.7525, + "step": 4023 + }, + { + "epoch": 0.4944096326330016, + "grad_norm": 1.1214372261890664, + "learning_rate": 1.7892827080023393e-05, + "loss": 0.6091, + "step": 4024 + }, + { + "epoch": 
0.4945324978498587, + "grad_norm": 1.3760779139906787, + "learning_rate": 1.7886514289338656e-05, + "loss": 0.5982, + "step": 4025 + }, + { + "epoch": 0.4946553630667158, + "grad_norm": 1.348188876159567, + "learning_rate": 1.7880200967698332e-05, + "loss": 0.7368, + "step": 4026 + }, + { + "epoch": 0.49477822828357293, + "grad_norm": 1.590861871268304, + "learning_rate": 1.7873887116263715e-05, + "loss": 0.7055, + "step": 4027 + }, + { + "epoch": 0.49490109350043004, + "grad_norm": 1.4430557883115331, + "learning_rate": 1.7867572736196204e-05, + "loss": 0.7765, + "step": 4028 + }, + { + "epoch": 0.49502395871728716, + "grad_norm": 1.369578528628881, + "learning_rate": 1.7861257828657283e-05, + "loss": 0.6915, + "step": 4029 + }, + { + "epoch": 0.4951468239341442, + "grad_norm": 1.1860294216006273, + "learning_rate": 1.785494239480854e-05, + "loss": 0.5485, + "step": 4030 + }, + { + "epoch": 0.49526968915100134, + "grad_norm": 1.6124576888605764, + "learning_rate": 1.784862643581166e-05, + "loss": 0.6868, + "step": 4031 + }, + { + "epoch": 0.49539255436785845, + "grad_norm": 1.3539297465639135, + "learning_rate": 1.7842309952828424e-05, + "loss": 0.6754, + "step": 4032 + }, + { + "epoch": 0.49551541958471557, + "grad_norm": 1.38008113344926, + "learning_rate": 1.7835992947020702e-05, + "loss": 0.6065, + "step": 4033 + }, + { + "epoch": 0.4956382848015727, + "grad_norm": 1.4468687450358388, + "learning_rate": 1.782967541955047e-05, + "loss": 0.7067, + "step": 4034 + }, + { + "epoch": 0.4957611500184298, + "grad_norm": 1.9350090409291731, + "learning_rate": 1.7823357371579797e-05, + "loss": 0.8362, + "step": 4035 + }, + { + "epoch": 0.4958840152352869, + "grad_norm": 1.239578856891161, + "learning_rate": 1.7817038804270848e-05, + "loss": 0.6929, + "step": 4036 + }, + { + "epoch": 0.496006880452144, + "grad_norm": 1.1144238610730497, + "learning_rate": 1.781071971878587e-05, + "loss": 0.6098, + "step": 4037 + }, + { + "epoch": 0.4961297456690011, + "grad_norm": 1.0390828803124725, + "learning_rate": 1.7804400116287238e-05, + "loss": 0.5875, + "step": 4038 + }, + { + "epoch": 0.4962526108858582, + "grad_norm": 1.1350964837653306, + "learning_rate": 1.7798079997937387e-05, + "loss": 0.5675, + "step": 4039 + }, + { + "epoch": 0.4963754761027153, + "grad_norm": 1.1912465878346779, + "learning_rate": 1.7791759364898865e-05, + "loss": 0.5667, + "step": 4040 + }, + { + "epoch": 0.49649834131957243, + "grad_norm": 1.1914052739671983, + "learning_rate": 1.7785438218334317e-05, + "loss": 0.6198, + "step": 4041 + }, + { + "epoch": 0.49662120653642955, + "grad_norm": 1.15865428350756, + "learning_rate": 1.777911655940647e-05, + "loss": 0.6325, + "step": 4042 + }, + { + "epoch": 0.49674407175328666, + "grad_norm": 1.1044735166108692, + "learning_rate": 1.7772794389278156e-05, + "loss": 0.643, + "step": 4043 + }, + { + "epoch": 0.4968669369701438, + "grad_norm": 1.1523941156599116, + "learning_rate": 1.77664717091123e-05, + "loss": 0.5509, + "step": 4044 + }, + { + "epoch": 0.49698980218700084, + "grad_norm": 1.249300695245001, + "learning_rate": 1.776014852007191e-05, + "loss": 0.4827, + "step": 4045 + }, + { + "epoch": 0.49711266740385796, + "grad_norm": 1.2449973605688345, + "learning_rate": 1.77538248233201e-05, + "loss": 0.5164, + "step": 4046 + }, + { + "epoch": 0.49723553262071507, + "grad_norm": 1.3311162842358943, + "learning_rate": 1.7747500620020076e-05, + "loss": 0.7772, + "step": 4047 + }, + { + "epoch": 0.4973583978375722, + "grad_norm": 1.3941870557070586, + "learning_rate": 
1.7741175911335125e-05, + "loss": 0.5517, + "step": 4048 + }, + { + "epoch": 0.4974812630544293, + "grad_norm": 1.5873531239285186, + "learning_rate": 1.773485069842865e-05, + "loss": 0.6909, + "step": 4049 + }, + { + "epoch": 0.4976041282712864, + "grad_norm": 1.3890647143033608, + "learning_rate": 1.772852498246412e-05, + "loss": 0.4811, + "step": 4050 + }, + { + "epoch": 0.49772699348814353, + "grad_norm": 1.2260703821099521, + "learning_rate": 1.7722198764605114e-05, + "loss": 0.6377, + "step": 4051 + }, + { + "epoch": 0.4978498587050006, + "grad_norm": 1.3626262795835533, + "learning_rate": 1.77158720460153e-05, + "loss": 0.4713, + "step": 4052 + }, + { + "epoch": 0.4979727239218577, + "grad_norm": 1.2280581700247875, + "learning_rate": 1.770954482785844e-05, + "loss": 0.563, + "step": 4053 + }, + { + "epoch": 0.4980955891387148, + "grad_norm": 1.0903358961968306, + "learning_rate": 1.770321711129838e-05, + "loss": 0.5832, + "step": 4054 + }, + { + "epoch": 0.49821845435557194, + "grad_norm": 1.0833627861444814, + "learning_rate": 1.7696888897499062e-05, + "loss": 0.6336, + "step": 4055 + }, + { + "epoch": 0.49834131957242905, + "grad_norm": 1.4167608285978608, + "learning_rate": 1.769056018762452e-05, + "loss": 0.622, + "step": 4056 + }, + { + "epoch": 0.49846418478928617, + "grad_norm": 1.5951464443564447, + "learning_rate": 1.7684230982838883e-05, + "loss": 0.6907, + "step": 4057 + }, + { + "epoch": 0.4985870500061433, + "grad_norm": 1.357866858480301, + "learning_rate": 1.7677901284306363e-05, + "loss": 0.6938, + "step": 4058 + }, + { + "epoch": 0.49870991522300034, + "grad_norm": 1.170498305686023, + "learning_rate": 1.767157109319127e-05, + "loss": 0.5761, + "step": 4059 + }, + { + "epoch": 0.49883278043985746, + "grad_norm": 1.2068934971010823, + "learning_rate": 1.7665240410657996e-05, + "loss": 0.6647, + "step": 4060 + }, + { + "epoch": 0.4989556456567146, + "grad_norm": 1.212683185234647, + "learning_rate": 1.7658909237871035e-05, + "loss": 0.5289, + "step": 4061 + }, + { + "epoch": 0.4990785108735717, + "grad_norm": 1.3949767302163711, + "learning_rate": 1.7652577575994965e-05, + "loss": 0.535, + "step": 4062 + }, + { + "epoch": 0.4992013760904288, + "grad_norm": 1.5652203080142404, + "learning_rate": 1.7646245426194453e-05, + "loss": 0.6328, + "step": 4063 + }, + { + "epoch": 0.4993242413072859, + "grad_norm": 1.2619051216504946, + "learning_rate": 1.7639912789634257e-05, + "loss": 0.6938, + "step": 4064 + }, + { + "epoch": 0.49944710652414304, + "grad_norm": 1.1884190470838978, + "learning_rate": 1.763357966747922e-05, + "loss": 0.5348, + "step": 4065 + }, + { + "epoch": 0.4995699717410001, + "grad_norm": 1.3055523886472606, + "learning_rate": 1.7627246060894285e-05, + "loss": 0.5388, + "step": 4066 + }, + { + "epoch": 0.4996928369578572, + "grad_norm": 1.410254371230192, + "learning_rate": 1.7620911971044472e-05, + "loss": 0.5372, + "step": 4067 + }, + { + "epoch": 0.4998157021747143, + "grad_norm": 1.2973704068330616, + "learning_rate": 1.7614577399094904e-05, + "loss": 0.7042, + "step": 4068 + }, + { + "epoch": 0.49993856739157144, + "grad_norm": 1.253438301267003, + "learning_rate": 1.7608242346210775e-05, + "loss": 0.6639, + "step": 4069 + }, + { + "epoch": 0.5000614326084285, + "grad_norm": 1.3781990407498623, + "learning_rate": 1.7601906813557383e-05, + "loss": 0.6479, + "step": 4070 + }, + { + "epoch": 0.5001842978252856, + "grad_norm": 1.1248389712934395, + "learning_rate": 1.7595570802300107e-05, + "loss": 0.6218, + "step": 4071 + }, + { + "epoch": 
0.5003071630421427, + "grad_norm": 1.275307165526083, + "learning_rate": 1.758923431360442e-05, + "loss": 0.5902, + "step": 4072 + }, + { + "epoch": 0.5004300282589998, + "grad_norm": 1.3864789441261725, + "learning_rate": 1.7582897348635867e-05, + "loss": 0.6913, + "step": 4073 + }, + { + "epoch": 0.500552893475857, + "grad_norm": 1.3196199385113057, + "learning_rate": 1.7576559908560104e-05, + "loss": 0.5124, + "step": 4074 + }, + { + "epoch": 0.5006757586927141, + "grad_norm": 1.2759800899381155, + "learning_rate": 1.7570221994542845e-05, + "loss": 0.6253, + "step": 4075 + }, + { + "epoch": 0.5007986239095712, + "grad_norm": 1.257043437090135, + "learning_rate": 1.7563883607749927e-05, + "loss": 0.613, + "step": 4076 + }, + { + "epoch": 0.5009214891264283, + "grad_norm": 1.0977965811094175, + "learning_rate": 1.755754474934724e-05, + "loss": 0.6688, + "step": 4077 + }, + { + "epoch": 0.5010443543432854, + "grad_norm": 1.074189633282041, + "learning_rate": 1.7551205420500785e-05, + "loss": 0.5806, + "step": 4078 + }, + { + "epoch": 0.5011672195601425, + "grad_norm": 1.2463539681180344, + "learning_rate": 1.7544865622376638e-05, + "loss": 0.5413, + "step": 4079 + }, + { + "epoch": 0.5012900847769997, + "grad_norm": 1.5869235147898462, + "learning_rate": 1.753852535614097e-05, + "loss": 0.6563, + "step": 4080 + }, + { + "epoch": 0.5014129499938568, + "grad_norm": 1.3332992105791501, + "learning_rate": 1.7532184622960014e-05, + "loss": 0.5557, + "step": 4081 + }, + { + "epoch": 0.5015358152107139, + "grad_norm": 1.4068258925727244, + "learning_rate": 1.7525843424000128e-05, + "loss": 0.5322, + "step": 4082 + }, + { + "epoch": 0.501658680427571, + "grad_norm": 1.263114118210626, + "learning_rate": 1.751950176042772e-05, + "loss": 0.559, + "step": 4083 + }, + { + "epoch": 0.5017815456444281, + "grad_norm": 1.1623772346584238, + "learning_rate": 1.7513159633409305e-05, + "loss": 0.6637, + "step": 4084 + }, + { + "epoch": 0.5019044108612851, + "grad_norm": 1.2067843085170566, + "learning_rate": 1.7506817044111477e-05, + "loss": 0.5546, + "step": 4085 + }, + { + "epoch": 0.5020272760781422, + "grad_norm": 1.034361708716484, + "learning_rate": 1.75004739937009e-05, + "loss": 0.5707, + "step": 4086 + }, + { + "epoch": 0.5021501412949994, + "grad_norm": 1.6132587902964262, + "learning_rate": 1.7494130483344357e-05, + "loss": 0.7053, + "step": 4087 + }, + { + "epoch": 0.5022730065118565, + "grad_norm": 1.1845983411063636, + "learning_rate": 1.7487786514208685e-05, + "loss": 0.5298, + "step": 4088 + }, + { + "epoch": 0.5023958717287136, + "grad_norm": 1.1301760183377083, + "learning_rate": 1.748144208746082e-05, + "loss": 0.585, + "step": 4089 + }, + { + "epoch": 0.5025187369455707, + "grad_norm": 1.2950692366443164, + "learning_rate": 1.747509720426777e-05, + "loss": 0.558, + "step": 4090 + }, + { + "epoch": 0.5026416021624278, + "grad_norm": 1.3279213847296365, + "learning_rate": 1.7468751865796645e-05, + "loss": 0.5539, + "step": 4091 + }, + { + "epoch": 0.5027644673792849, + "grad_norm": 1.0614348918976513, + "learning_rate": 1.746240607321462e-05, + "loss": 0.6641, + "step": 4092 + }, + { + "epoch": 0.502887332596142, + "grad_norm": 1.1477658997742797, + "learning_rate": 1.7456059827688976e-05, + "loss": 0.6219, + "step": 4093 + }, + { + "epoch": 0.5030101978129992, + "grad_norm": 1.1172307686395038, + "learning_rate": 1.744971313038705e-05, + "loss": 0.573, + "step": 4094 + }, + { + "epoch": 0.5031330630298563, + "grad_norm": 1.5268839997184922, + "learning_rate": 1.744336598247628e-05, + 
"loss": 0.6936, + "step": 4095 + }, + { + "epoch": 0.5032559282467134, + "grad_norm": 1.318093938235832, + "learning_rate": 1.7437018385124182e-05, + "loss": 0.6179, + "step": 4096 + }, + { + "epoch": 0.5033787934635705, + "grad_norm": 1.1529378199287859, + "learning_rate": 1.7430670339498358e-05, + "loss": 0.5979, + "step": 4097 + }, + { + "epoch": 0.5035016586804276, + "grad_norm": 1.1728566673096008, + "learning_rate": 1.7424321846766487e-05, + "loss": 0.546, + "step": 4098 + }, + { + "epoch": 0.5036245238972846, + "grad_norm": 1.4037085578872663, + "learning_rate": 1.7417972908096337e-05, + "loss": 0.6173, + "step": 4099 + }, + { + "epoch": 0.5037473891141417, + "grad_norm": 2.3344741726222478, + "learning_rate": 1.741162352465575e-05, + "loss": 0.6939, + "step": 4100 + }, + { + "epoch": 0.5038702543309989, + "grad_norm": 1.1434958286746266, + "learning_rate": 1.7405273697612656e-05, + "loss": 0.5828, + "step": 4101 + }, + { + "epoch": 0.503993119547856, + "grad_norm": 1.0802047293035066, + "learning_rate": 1.7398923428135066e-05, + "loss": 0.6274, + "step": 4102 + }, + { + "epoch": 0.5041159847647131, + "grad_norm": 1.153731063334193, + "learning_rate": 1.739257271739107e-05, + "loss": 0.581, + "step": 4103 + }, + { + "epoch": 0.5042388499815702, + "grad_norm": 1.1279532455273549, + "learning_rate": 1.7386221566548836e-05, + "loss": 0.6022, + "step": 4104 + }, + { + "epoch": 0.5043617151984273, + "grad_norm": 1.3047958472735839, + "learning_rate": 1.7379869976776617e-05, + "loss": 0.6781, + "step": 4105 + }, + { + "epoch": 0.5044845804152844, + "grad_norm": 1.2237535793439767, + "learning_rate": 1.7373517949242755e-05, + "loss": 0.7058, + "step": 4106 + }, + { + "epoch": 0.5046074456321415, + "grad_norm": 1.3385248173765294, + "learning_rate": 1.7367165485115657e-05, + "loss": 0.6024, + "step": 4107 + }, + { + "epoch": 0.5047303108489987, + "grad_norm": 1.281988045557728, + "learning_rate": 1.736081258556382e-05, + "loss": 0.61, + "step": 4108 + }, + { + "epoch": 0.5048531760658558, + "grad_norm": 1.2659191532774394, + "learning_rate": 1.7354459251755816e-05, + "loss": 0.5225, + "step": 4109 + }, + { + "epoch": 0.5049760412827129, + "grad_norm": 1.159112761484028, + "learning_rate": 1.7348105484860305e-05, + "loss": 0.5857, + "step": 4110 + }, + { + "epoch": 0.50509890649957, + "grad_norm": 1.4586572773343138, + "learning_rate": 1.7341751286046018e-05, + "loss": 0.5559, + "step": 4111 + }, + { + "epoch": 0.5052217717164271, + "grad_norm": 1.3943357280092812, + "learning_rate": 1.733539665648177e-05, + "loss": 0.7045, + "step": 4112 + }, + { + "epoch": 0.5053446369332842, + "grad_norm": 1.1533395152014136, + "learning_rate": 1.732904159733645e-05, + "loss": 0.5404, + "step": 4113 + }, + { + "epoch": 0.5054675021501412, + "grad_norm": 1.317841366098898, + "learning_rate": 1.7322686109779032e-05, + "loss": 0.6736, + "step": 4114 + }, + { + "epoch": 0.5055903673669984, + "grad_norm": 1.5601316078709793, + "learning_rate": 1.731633019497857e-05, + "loss": 0.5915, + "step": 4115 + }, + { + "epoch": 0.5057132325838555, + "grad_norm": 1.2119405160490337, + "learning_rate": 1.7309973854104186e-05, + "loss": 0.5593, + "step": 4116 + }, + { + "epoch": 0.5058360978007126, + "grad_norm": 1.2729887651554948, + "learning_rate": 1.7303617088325097e-05, + "loss": 0.6082, + "step": 4117 + }, + { + "epoch": 0.5059589630175697, + "grad_norm": 1.2979866858563829, + "learning_rate": 1.729725989881058e-05, + "loss": 0.5095, + "step": 4118 + }, + { + "epoch": 0.5060818282344268, + "grad_norm": 
1.1133417546869844, + "learning_rate": 1.7290902286730007e-05, + "loss": 0.6859, + "step": 4119 + }, + { + "epoch": 0.5062046934512839, + "grad_norm": 1.24349039006455, + "learning_rate": 1.7284544253252813e-05, + "loss": 0.5704, + "step": 4120 + }, + { + "epoch": 0.506327558668141, + "grad_norm": 1.0009344920391352, + "learning_rate": 1.727818579954852e-05, + "loss": 0.5908, + "step": 4121 + }, + { + "epoch": 0.5064504238849982, + "grad_norm": 1.5733966882018366, + "learning_rate": 1.7271826926786724e-05, + "loss": 0.6726, + "step": 4122 + }, + { + "epoch": 0.5065732891018553, + "grad_norm": 1.2704838305898034, + "learning_rate": 1.7265467636137097e-05, + "loss": 0.5759, + "step": 4123 + }, + { + "epoch": 0.5066961543187124, + "grad_norm": 1.4453877547742615, + "learning_rate": 1.7259107928769392e-05, + "loss": 0.7, + "step": 4124 + }, + { + "epoch": 0.5068190195355695, + "grad_norm": 1.3010405389497752, + "learning_rate": 1.725274780585343e-05, + "loss": 0.5916, + "step": 4125 + }, + { + "epoch": 0.5069418847524266, + "grad_norm": 1.33414495047856, + "learning_rate": 1.724638726855912e-05, + "loss": 0.6306, + "step": 4126 + }, + { + "epoch": 0.5070647499692837, + "grad_norm": 1.3576829218820794, + "learning_rate": 1.7240026318056446e-05, + "loss": 0.5969, + "step": 4127 + }, + { + "epoch": 0.5071876151861408, + "grad_norm": 1.2273646041169446, + "learning_rate": 1.7233664955515454e-05, + "loss": 0.5772, + "step": 4128 + }, + { + "epoch": 0.5073104804029979, + "grad_norm": 1.2189189092826462, + "learning_rate": 1.722730318210628e-05, + "loss": 0.5233, + "step": 4129 + }, + { + "epoch": 0.507433345619855, + "grad_norm": 1.1064246406620746, + "learning_rate": 1.722094099899913e-05, + "loss": 0.6584, + "step": 4130 + }, + { + "epoch": 0.5075562108367121, + "grad_norm": 1.100524040694271, + "learning_rate": 1.7214578407364286e-05, + "loss": 0.6458, + "step": 4131 + }, + { + "epoch": 0.5076790760535692, + "grad_norm": 1.414949426513427, + "learning_rate": 1.7208215408372107e-05, + "loss": 0.6001, + "step": 4132 + }, + { + "epoch": 0.5078019412704263, + "grad_norm": 1.3403963939400776, + "learning_rate": 1.720185200319302e-05, + "loss": 0.5232, + "step": 4133 + }, + { + "epoch": 0.5079248064872834, + "grad_norm": 1.4281987800201277, + "learning_rate": 1.7195488192997543e-05, + "loss": 0.6651, + "step": 4134 + }, + { + "epoch": 0.5080476717041406, + "grad_norm": 1.230898176973714, + "learning_rate": 1.7189123978956246e-05, + "loss": 0.6841, + "step": 4135 + }, + { + "epoch": 0.5081705369209977, + "grad_norm": 1.0237237719276406, + "learning_rate": 1.718275936223979e-05, + "loss": 0.5787, + "step": 4136 + }, + { + "epoch": 0.5082934021378548, + "grad_norm": 1.164475454376925, + "learning_rate": 1.7176394344018912e-05, + "loss": 0.5337, + "step": 4137 + }, + { + "epoch": 0.5084162673547119, + "grad_norm": 1.2885799895020256, + "learning_rate": 1.7170028925464403e-05, + "loss": 0.6488, + "step": 4138 + }, + { + "epoch": 0.508539132571569, + "grad_norm": 1.1119921320152204, + "learning_rate": 1.716366310774715e-05, + "loss": 0.6776, + "step": 4139 + }, + { + "epoch": 0.5086619977884261, + "grad_norm": 1.260066014774468, + "learning_rate": 1.7157296892038096e-05, + "loss": 0.5744, + "step": 4140 + }, + { + "epoch": 0.5087848630052832, + "grad_norm": 1.2265349404310013, + "learning_rate": 1.7150930279508273e-05, + "loss": 0.6722, + "step": 4141 + }, + { + "epoch": 0.5089077282221404, + "grad_norm": 1.1601032548669743, + "learning_rate": 1.714456327132877e-05, + "loss": 0.6354, + "step": 4142 + }, + { 
+ "epoch": 0.5090305934389974, + "grad_norm": 1.1140711721194152, + "learning_rate": 1.7138195868670764e-05, + "loss": 0.5157, + "step": 4143 + }, + { + "epoch": 0.5091534586558545, + "grad_norm": 1.2476388272361378, + "learning_rate": 1.7131828072705494e-05, + "loss": 0.6695, + "step": 4144 + }, + { + "epoch": 0.5092763238727116, + "grad_norm": 1.1398265546580781, + "learning_rate": 1.7125459884604278e-05, + "loss": 0.5443, + "step": 4145 + }, + { + "epoch": 0.5093991890895687, + "grad_norm": 1.2271153426683994, + "learning_rate": 1.7119091305538495e-05, + "loss": 0.706, + "step": 4146 + }, + { + "epoch": 0.5095220543064258, + "grad_norm": 1.3897713058448364, + "learning_rate": 1.711272233667961e-05, + "loss": 0.7794, + "step": 4147 + }, + { + "epoch": 0.509644919523283, + "grad_norm": 1.3632815675562604, + "learning_rate": 1.710635297919916e-05, + "loss": 0.665, + "step": 4148 + }, + { + "epoch": 0.5097677847401401, + "grad_norm": 1.4636988227024073, + "learning_rate": 1.7099983234268733e-05, + "loss": 0.6445, + "step": 4149 + }, + { + "epoch": 0.5098906499569972, + "grad_norm": 1.3594616797617187, + "learning_rate": 1.709361310306001e-05, + "loss": 0.5707, + "step": 4150 + }, + { + "epoch": 0.5100135151738543, + "grad_norm": 1.0839391831662053, + "learning_rate": 1.7087242586744733e-05, + "loss": 0.5236, + "step": 4151 + }, + { + "epoch": 0.5101363803907114, + "grad_norm": 1.2396075885187536, + "learning_rate": 1.708087168649472e-05, + "loss": 0.6279, + "step": 4152 + }, + { + "epoch": 0.5102592456075685, + "grad_norm": 1.2726100901347932, + "learning_rate": 1.7074500403481855e-05, + "loss": 0.6162, + "step": 4153 + }, + { + "epoch": 0.5103821108244256, + "grad_norm": 1.3519484155270383, + "learning_rate": 1.7068128738878095e-05, + "loss": 0.8016, + "step": 4154 + }, + { + "epoch": 0.5105049760412828, + "grad_norm": 1.2885760768321537, + "learning_rate": 1.706175669385546e-05, + "loss": 0.7005, + "step": 4155 + }, + { + "epoch": 0.5106278412581399, + "grad_norm": 1.351093471844497, + "learning_rate": 1.7055384269586063e-05, + "loss": 0.6009, + "step": 4156 + }, + { + "epoch": 0.5107507064749969, + "grad_norm": 1.4108676220448408, + "learning_rate": 1.7049011467242055e-05, + "loss": 0.524, + "step": 4157 + }, + { + "epoch": 0.510873571691854, + "grad_norm": 1.234532666295897, + "learning_rate": 1.7042638287995673e-05, + "loss": 0.6825, + "step": 4158 + }, + { + "epoch": 0.5109964369087111, + "grad_norm": 1.3503834106842694, + "learning_rate": 1.7036264733019226e-05, + "loss": 0.5705, + "step": 4159 + }, + { + "epoch": 0.5111193021255682, + "grad_norm": 1.6374761778382572, + "learning_rate": 1.702989080348509e-05, + "loss": 0.7748, + "step": 4160 + }, + { + "epoch": 0.5112421673424253, + "grad_norm": 1.772215824639301, + "learning_rate": 1.7023516500565702e-05, + "loss": 0.5845, + "step": 4161 + }, + { + "epoch": 0.5113650325592825, + "grad_norm": 1.2088284149459396, + "learning_rate": 1.7017141825433576e-05, + "loss": 0.6628, + "step": 4162 + }, + { + "epoch": 0.5114878977761396, + "grad_norm": 1.1357381958854131, + "learning_rate": 1.7010766779261292e-05, + "loss": 0.6064, + "step": 4163 + }, + { + "epoch": 0.5116107629929967, + "grad_norm": 1.2410418575049407, + "learning_rate": 1.7004391363221502e-05, + "loss": 0.567, + "step": 4164 + }, + { + "epoch": 0.5117336282098538, + "grad_norm": 1.2646041410878872, + "learning_rate": 1.6998015578486918e-05, + "loss": 0.5433, + "step": 4165 + }, + { + "epoch": 0.5118564934267109, + "grad_norm": 1.3195757369266008, + "learning_rate": 
1.699163942623033e-05, + "loss": 0.5645, + "step": 4166 + }, + { + "epoch": 0.511979358643568, + "grad_norm": 1.3059311238032743, + "learning_rate": 1.6985262907624583e-05, + "loss": 0.7651, + "step": 4167 + }, + { + "epoch": 0.5121022238604251, + "grad_norm": 1.1670871361774835, + "learning_rate": 1.6978886023842598e-05, + "loss": 0.6327, + "step": 4168 + }, + { + "epoch": 0.5122250890772823, + "grad_norm": 1.2080195404543168, + "learning_rate": 1.6972508776057362e-05, + "loss": 0.6837, + "step": 4169 + }, + { + "epoch": 0.5123479542941394, + "grad_norm": 1.0702341946954859, + "learning_rate": 1.6966131165441928e-05, + "loss": 0.6506, + "step": 4170 + }, + { + "epoch": 0.5124708195109965, + "grad_norm": 1.1732205377553286, + "learning_rate": 1.6959753193169422e-05, + "loss": 0.658, + "step": 4171 + }, + { + "epoch": 0.5125936847278535, + "grad_norm": 1.7414066796847802, + "learning_rate": 1.695337486041302e-05, + "loss": 0.6465, + "step": 4172 + }, + { + "epoch": 0.5127165499447106, + "grad_norm": 1.2722621694330176, + "learning_rate": 1.694699616834598e-05, + "loss": 0.5129, + "step": 4173 + }, + { + "epoch": 0.5128394151615677, + "grad_norm": 1.1952760059924372, + "learning_rate": 1.6940617118141626e-05, + "loss": 0.5487, + "step": 4174 + }, + { + "epoch": 0.5129622803784248, + "grad_norm": 1.2370908378212728, + "learning_rate": 1.693423771097334e-05, + "loss": 0.6055, + "step": 4175 + }, + { + "epoch": 0.513085145595282, + "grad_norm": 1.3658047838052374, + "learning_rate": 1.6927857948014565e-05, + "loss": 0.6713, + "step": 4176 + }, + { + "epoch": 0.5132080108121391, + "grad_norm": 1.0537336997375397, + "learning_rate": 1.6921477830438827e-05, + "loss": 0.6098, + "step": 4177 + }, + { + "epoch": 0.5133308760289962, + "grad_norm": 2.027039349092157, + "learning_rate": 1.6915097359419703e-05, + "loss": 0.6523, + "step": 4178 + }, + { + "epoch": 0.5134537412458533, + "grad_norm": 1.3199844304620736, + "learning_rate": 1.690871653613084e-05, + "loss": 0.6951, + "step": 4179 + }, + { + "epoch": 0.5135766064627104, + "grad_norm": 1.303947882354836, + "learning_rate": 1.6902335361745944e-05, + "loss": 0.6889, + "step": 4180 + }, + { + "epoch": 0.5136994716795675, + "grad_norm": 1.5858356225006345, + "learning_rate": 1.6895953837438802e-05, + "loss": 0.7444, + "step": 4181 + }, + { + "epoch": 0.5138223368964246, + "grad_norm": 1.4602729737795226, + "learning_rate": 1.6889571964383242e-05, + "loss": 0.5328, + "step": 4182 + }, + { + "epoch": 0.5139452021132818, + "grad_norm": 1.2845383323544521, + "learning_rate": 1.6883189743753174e-05, + "loss": 0.6098, + "step": 4183 + }, + { + "epoch": 0.5140680673301389, + "grad_norm": 1.230293509612916, + "learning_rate": 1.687680717672257e-05, + "loss": 0.6571, + "step": 4184 + }, + { + "epoch": 0.514190932546996, + "grad_norm": 1.3171010024603018, + "learning_rate": 1.6870424264465454e-05, + "loss": 0.6318, + "step": 4185 + }, + { + "epoch": 0.514313797763853, + "grad_norm": 1.3994129857337814, + "learning_rate": 1.6864041008155926e-05, + "loss": 0.6262, + "step": 4186 + }, + { + "epoch": 0.5144366629807101, + "grad_norm": 1.1630718011210157, + "learning_rate": 1.6857657408968146e-05, + "loss": 0.6474, + "step": 4187 + }, + { + "epoch": 0.5145595281975672, + "grad_norm": 1.3791567167848149, + "learning_rate": 1.6851273468076328e-05, + "loss": 0.6549, + "step": 4188 + }, + { + "epoch": 0.5146823934144243, + "grad_norm": 1.4607468530445382, + "learning_rate": 1.6844889186654757e-05, + "loss": 0.6337, + "step": 4189 + }, + { + "epoch": 
0.5148052586312815, + "grad_norm": 1.17377402988719, + "learning_rate": 1.6838504565877795e-05, + "loss": 0.607, + "step": 4190 + }, + { + "epoch": 0.5149281238481386, + "grad_norm": 1.3141352628802079, + "learning_rate": 1.6832119606919835e-05, + "loss": 0.6169, + "step": 4191 + }, + { + "epoch": 0.5150509890649957, + "grad_norm": 1.702820595098977, + "learning_rate": 1.6825734310955356e-05, + "loss": 0.6696, + "step": 4192 + }, + { + "epoch": 0.5151738542818528, + "grad_norm": 1.420768079606018, + "learning_rate": 1.681934867915889e-05, + "loss": 0.6014, + "step": 4193 + }, + { + "epoch": 0.5152967194987099, + "grad_norm": 1.3545565631776781, + "learning_rate": 1.6812962712705037e-05, + "loss": 0.5505, + "step": 4194 + }, + { + "epoch": 0.515419584715567, + "grad_norm": 1.5561849122572577, + "learning_rate": 1.6806576412768446e-05, + "loss": 0.6761, + "step": 4195 + }, + { + "epoch": 0.5155424499324242, + "grad_norm": 1.4311139511429454, + "learning_rate": 1.6800189780523844e-05, + "loss": 0.5588, + "step": 4196 + }, + { + "epoch": 0.5156653151492813, + "grad_norm": 1.3301938350512337, + "learning_rate": 1.6793802817146003e-05, + "loss": 0.4948, + "step": 4197 + }, + { + "epoch": 0.5157881803661384, + "grad_norm": 1.3333436747497975, + "learning_rate": 1.6787415523809775e-05, + "loss": 0.6123, + "step": 4198 + }, + { + "epoch": 0.5159110455829955, + "grad_norm": 1.2054281189318998, + "learning_rate": 1.6781027901690043e-05, + "loss": 0.7135, + "step": 4199 + }, + { + "epoch": 0.5160339107998526, + "grad_norm": 1.184937987692237, + "learning_rate": 1.6774639951961783e-05, + "loss": 0.6463, + "step": 4200 + }, + { + "epoch": 0.5161567760167096, + "grad_norm": 1.239996960534812, + "learning_rate": 1.6768251675800012e-05, + "loss": 0.565, + "step": 4201 + }, + { + "epoch": 0.5162796412335667, + "grad_norm": 1.2867754615546094, + "learning_rate": 1.6761863074379815e-05, + "loss": 0.5347, + "step": 4202 + }, + { + "epoch": 0.5164025064504238, + "grad_norm": 1.1974639057733678, + "learning_rate": 1.6755474148876328e-05, + "loss": 0.5869, + "step": 4203 + }, + { + "epoch": 0.516525371667281, + "grad_norm": 1.2763405444851272, + "learning_rate": 1.674908490046476e-05, + "loss": 0.5826, + "step": 4204 + }, + { + "epoch": 0.5166482368841381, + "grad_norm": 1.224450744448321, + "learning_rate": 1.6742695330320367e-05, + "loss": 0.5902, + "step": 4205 + }, + { + "epoch": 0.5167711021009952, + "grad_norm": 1.6384752329853172, + "learning_rate": 1.6736305439618466e-05, + "loss": 0.6321, + "step": 4206 + }, + { + "epoch": 0.5168939673178523, + "grad_norm": 1.245638682918695, + "learning_rate": 1.672991522953444e-05, + "loss": 0.6485, + "step": 4207 + }, + { + "epoch": 0.5170168325347094, + "grad_norm": 1.3713922888966426, + "learning_rate": 1.672352470124373e-05, + "loss": 0.5802, + "step": 4208 + }, + { + "epoch": 0.5171396977515665, + "grad_norm": 1.191380667012763, + "learning_rate": 1.671713385592183e-05, + "loss": 0.6404, + "step": 4209 + }, + { + "epoch": 0.5172625629684237, + "grad_norm": 1.199208593002673, + "learning_rate": 1.6710742694744288e-05, + "loss": 0.6505, + "step": 4210 + }, + { + "epoch": 0.5173854281852808, + "grad_norm": 1.0906193162500728, + "learning_rate": 1.6704351218886722e-05, + "loss": 0.7048, + "step": 4211 + }, + { + "epoch": 0.5175082934021379, + "grad_norm": 1.1237142300137535, + "learning_rate": 1.6697959429524803e-05, + "loss": 0.4851, + "step": 4212 + }, + { + "epoch": 0.517631158618995, + "grad_norm": 1.0980293518129507, + "learning_rate": 1.6691567327834264e-05, + 
"loss": 0.5834, + "step": 4213 + }, + { + "epoch": 0.5177540238358521, + "grad_norm": 1.248790045168421, + "learning_rate": 1.668517491499088e-05, + "loss": 0.6309, + "step": 4214 + }, + { + "epoch": 0.5178768890527092, + "grad_norm": 1.7008877651711838, + "learning_rate": 1.6678782192170503e-05, + "loss": 0.7584, + "step": 4215 + }, + { + "epoch": 0.5179997542695662, + "grad_norm": 1.3412685710041883, + "learning_rate": 1.6672389160549027e-05, + "loss": 0.6299, + "step": 4216 + }, + { + "epoch": 0.5181226194864234, + "grad_norm": 1.325990495813332, + "learning_rate": 1.6665995821302413e-05, + "loss": 0.6442, + "step": 4217 + }, + { + "epoch": 0.5182454847032805, + "grad_norm": 1.1588260062038955, + "learning_rate": 1.6659602175606665e-05, + "loss": 0.5337, + "step": 4218 + }, + { + "epoch": 0.5183683499201376, + "grad_norm": 1.1741284281965074, + "learning_rate": 1.6653208224637868e-05, + "loss": 0.6968, + "step": 4219 + }, + { + "epoch": 0.5184912151369947, + "grad_norm": 1.2818938413670031, + "learning_rate": 1.6646813969572133e-05, + "loss": 0.5866, + "step": 4220 + }, + { + "epoch": 0.5186140803538518, + "grad_norm": 1.17270943643584, + "learning_rate": 1.664041941158565e-05, + "loss": 0.6661, + "step": 4221 + }, + { + "epoch": 0.5187369455707089, + "grad_norm": 1.2755035117725944, + "learning_rate": 1.6634024551854656e-05, + "loss": 0.6087, + "step": 4222 + }, + { + "epoch": 0.518859810787566, + "grad_norm": 1.1229287358743867, + "learning_rate": 1.662762939155544e-05, + "loss": 0.5305, + "step": 4223 + }, + { + "epoch": 0.5189826760044232, + "grad_norm": 1.0196750648931596, + "learning_rate": 1.6621233931864357e-05, + "loss": 0.6042, + "step": 4224 + }, + { + "epoch": 0.5191055412212803, + "grad_norm": 1.3669485603117042, + "learning_rate": 1.661483817395781e-05, + "loss": 0.6463, + "step": 4225 + }, + { + "epoch": 0.5192284064381374, + "grad_norm": 1.1244019793287447, + "learning_rate": 1.6608442119012242e-05, + "loss": 0.6436, + "step": 4226 + }, + { + "epoch": 0.5193512716549945, + "grad_norm": 1.2137328208209355, + "learning_rate": 1.6602045768204186e-05, + "loss": 0.593, + "step": 4227 + }, + { + "epoch": 0.5194741368718516, + "grad_norm": 2.383449284933318, + "learning_rate": 1.6595649122710197e-05, + "loss": 0.7098, + "step": 4228 + }, + { + "epoch": 0.5195970020887087, + "grad_norm": 1.1986989797239005, + "learning_rate": 1.6589252183706904e-05, + "loss": 0.5912, + "step": 4229 + }, + { + "epoch": 0.5197198673055657, + "grad_norm": 1.1713263645811969, + "learning_rate": 1.6582854952370972e-05, + "loss": 0.67, + "step": 4230 + }, + { + "epoch": 0.5198427325224229, + "grad_norm": 1.2420122903387165, + "learning_rate": 1.657645742987914e-05, + "loss": 0.4883, + "step": 4231 + }, + { + "epoch": 0.51996559773928, + "grad_norm": 1.2984824821828782, + "learning_rate": 1.6570059617408187e-05, + "loss": 0.604, + "step": 4232 + }, + { + "epoch": 0.5200884629561371, + "grad_norm": 1.227260097553568, + "learning_rate": 1.656366151613495e-05, + "loss": 0.6249, + "step": 4233 + }, + { + "epoch": 0.5202113281729942, + "grad_norm": 1.6198075940274366, + "learning_rate": 1.6557263127236323e-05, + "loss": 0.6997, + "step": 4234 + }, + { + "epoch": 0.5203341933898513, + "grad_norm": 1.2063466094921436, + "learning_rate": 1.6550864451889234e-05, + "loss": 0.5956, + "step": 4235 + }, + { + "epoch": 0.5204570586067084, + "grad_norm": 1.2155964531899865, + "learning_rate": 1.654446549127069e-05, + "loss": 0.6371, + "step": 4236 + }, + { + "epoch": 0.5205799238235655, + "grad_norm": 
1.213939539574419, + "learning_rate": 1.6538066246557735e-05, + "loss": 0.6441, + "step": 4237 + }, + { + "epoch": 0.5207027890404227, + "grad_norm": 1.0432296659337787, + "learning_rate": 1.653166671892747e-05, + "loss": 0.5139, + "step": 4238 + }, + { + "epoch": 0.5208256542572798, + "grad_norm": 1.2795083806509446, + "learning_rate": 1.6525266909557046e-05, + "loss": 0.5605, + "step": 4239 + }, + { + "epoch": 0.5209485194741369, + "grad_norm": 1.1280372899233686, + "learning_rate": 1.6518866819623665e-05, + "loss": 0.5644, + "step": 4240 + }, + { + "epoch": 0.521071384690994, + "grad_norm": 1.0669101686390612, + "learning_rate": 1.6512466450304584e-05, + "loss": 0.6787, + "step": 4241 + }, + { + "epoch": 0.5211942499078511, + "grad_norm": 1.5096765739971247, + "learning_rate": 1.6506065802777107e-05, + "loss": 0.5195, + "step": 4242 + }, + { + "epoch": 0.5213171151247082, + "grad_norm": 1.5131996739756128, + "learning_rate": 1.6499664878218592e-05, + "loss": 0.5811, + "step": 4243 + }, + { + "epoch": 0.5214399803415654, + "grad_norm": 1.1787990236115518, + "learning_rate": 1.649326367780645e-05, + "loss": 0.5787, + "step": 4244 + }, + { + "epoch": 0.5215628455584224, + "grad_norm": 1.342246600835556, + "learning_rate": 1.6486862202718134e-05, + "loss": 0.6023, + "step": 4245 + }, + { + "epoch": 0.5216857107752795, + "grad_norm": 1.575249703996527, + "learning_rate": 1.6480460454131165e-05, + "loss": 0.5617, + "step": 4246 + }, + { + "epoch": 0.5218085759921366, + "grad_norm": 1.4669737609737286, + "learning_rate": 1.6474058433223092e-05, + "loss": 0.6462, + "step": 4247 + }, + { + "epoch": 0.5219314412089937, + "grad_norm": 1.3796153660905843, + "learning_rate": 1.646765614117153e-05, + "loss": 0.5778, + "step": 4248 + }, + { + "epoch": 0.5220543064258508, + "grad_norm": 1.498387530366278, + "learning_rate": 1.646125357915414e-05, + "loss": 0.6278, + "step": 4249 + }, + { + "epoch": 0.5221771716427079, + "grad_norm": 1.2931669592268369, + "learning_rate": 1.645485074834863e-05, + "loss": 0.6208, + "step": 4250 + }, + { + "epoch": 0.522300036859565, + "grad_norm": 1.1763646600087867, + "learning_rate": 1.6448447649932763e-05, + "loss": 0.6032, + "step": 4251 + }, + { + "epoch": 0.5224229020764222, + "grad_norm": 1.4834248991358954, + "learning_rate": 1.644204428508434e-05, + "loss": 0.5836, + "step": 4252 + }, + { + "epoch": 0.5225457672932793, + "grad_norm": 1.0673251508297166, + "learning_rate": 1.6435640654981225e-05, + "loss": 0.5975, + "step": 4253 + }, + { + "epoch": 0.5226686325101364, + "grad_norm": 1.256668719482166, + "learning_rate": 1.642923676080132e-05, + "loss": 0.663, + "step": 4254 + }, + { + "epoch": 0.5227914977269935, + "grad_norm": 1.0781031870464053, + "learning_rate": 1.6422832603722583e-05, + "loss": 0.4825, + "step": 4255 + }, + { + "epoch": 0.5229143629438506, + "grad_norm": 1.3859000640785157, + "learning_rate": 1.6416428184923014e-05, + "loss": 0.5234, + "step": 4256 + }, + { + "epoch": 0.5230372281607077, + "grad_norm": 1.1447023320084395, + "learning_rate": 1.641002350558067e-05, + "loss": 0.5935, + "step": 4257 + }, + { + "epoch": 0.5231600933775649, + "grad_norm": 1.2842165605393696, + "learning_rate": 1.6403618566873645e-05, + "loss": 0.5905, + "step": 4258 + }, + { + "epoch": 0.5232829585944219, + "grad_norm": 1.155499600531974, + "learning_rate": 1.6397213369980087e-05, + "loss": 0.5745, + "step": 4259 + }, + { + "epoch": 0.523405823811279, + "grad_norm": 1.4590141792671811, + "learning_rate": 1.6390807916078192e-05, + "loss": 0.5939, + "step": 4260 + 
}, + { + "epoch": 0.5235286890281361, + "grad_norm": 1.351962999085323, + "learning_rate": 1.6384402206346202e-05, + "loss": 0.6297, + "step": 4261 + }, + { + "epoch": 0.5236515542449932, + "grad_norm": 1.2913696767833085, + "learning_rate": 1.6377996241962402e-05, + "loss": 0.6249, + "step": 4262 + }, + { + "epoch": 0.5237744194618503, + "grad_norm": 1.1813700340380469, + "learning_rate": 1.6371590024105128e-05, + "loss": 0.7462, + "step": 4263 + }, + { + "epoch": 0.5238972846787074, + "grad_norm": 1.2933314599430885, + "learning_rate": 1.6365183553952765e-05, + "loss": 0.5564, + "step": 4264 + }, + { + "epoch": 0.5240201498955646, + "grad_norm": 1.2985453192855174, + "learning_rate": 1.6358776832683743e-05, + "loss": 0.6582, + "step": 4265 + }, + { + "epoch": 0.5241430151124217, + "grad_norm": 1.055326997580227, + "learning_rate": 1.635236986147653e-05, + "loss": 0.5277, + "step": 4266 + }, + { + "epoch": 0.5242658803292788, + "grad_norm": 1.4184858818130355, + "learning_rate": 1.6345962641509657e-05, + "loss": 0.5367, + "step": 4267 + }, + { + "epoch": 0.5243887455461359, + "grad_norm": 1.4417135296914994, + "learning_rate": 1.633955517396168e-05, + "loss": 0.6533, + "step": 4268 + }, + { + "epoch": 0.524511610762993, + "grad_norm": 1.3553278816627548, + "learning_rate": 1.6333147460011223e-05, + "loss": 0.6129, + "step": 4269 + }, + { + "epoch": 0.5246344759798501, + "grad_norm": 1.176862511854242, + "learning_rate": 1.6326739500836935e-05, + "loss": 0.5947, + "step": 4270 + }, + { + "epoch": 0.5247573411967072, + "grad_norm": 1.068924986550487, + "learning_rate": 1.6320331297617513e-05, + "loss": 0.6375, + "step": 4271 + }, + { + "epoch": 0.5248802064135644, + "grad_norm": 1.4298611252256577, + "learning_rate": 1.631392285153172e-05, + "loss": 0.6168, + "step": 4272 + }, + { + "epoch": 0.5250030716304215, + "grad_norm": 1.1612210603904038, + "learning_rate": 1.6307514163758334e-05, + "loss": 0.5731, + "step": 4273 + }, + { + "epoch": 0.5251259368472785, + "grad_norm": 1.466255992637623, + "learning_rate": 1.6301105235476195e-05, + "loss": 0.7601, + "step": 4274 + }, + { + "epoch": 0.5252488020641356, + "grad_norm": 1.1432669256501309, + "learning_rate": 1.629469606786419e-05, + "loss": 0.6119, + "step": 4275 + }, + { + "epoch": 0.5253716672809927, + "grad_norm": 1.6834018882813744, + "learning_rate": 1.628828666210124e-05, + "loss": 0.7085, + "step": 4276 + }, + { + "epoch": 0.5254945324978498, + "grad_norm": 1.4112966611587572, + "learning_rate": 1.628187701936631e-05, + "loss": 0.644, + "step": 4277 + }, + { + "epoch": 0.525617397714707, + "grad_norm": 1.4292318835795002, + "learning_rate": 1.6275467140838418e-05, + "loss": 0.6481, + "step": 4278 + }, + { + "epoch": 0.5257402629315641, + "grad_norm": 1.2833051161987439, + "learning_rate": 1.6269057027696618e-05, + "loss": 0.4778, + "step": 4279 + }, + { + "epoch": 0.5258631281484212, + "grad_norm": 1.329655049175335, + "learning_rate": 1.626264668112001e-05, + "loss": 0.5754, + "step": 4280 + }, + { + "epoch": 0.5259859933652783, + "grad_norm": 1.3076962587601542, + "learning_rate": 1.625623610228773e-05, + "loss": 0.7077, + "step": 4281 + }, + { + "epoch": 0.5261088585821354, + "grad_norm": 1.4322184479883449, + "learning_rate": 1.6249825292378965e-05, + "loss": 0.6474, + "step": 4282 + }, + { + "epoch": 0.5262317237989925, + "grad_norm": 1.4114726945898115, + "learning_rate": 1.6243414252572946e-05, + "loss": 0.518, + "step": 4283 + }, + { + "epoch": 0.5263545890158496, + "grad_norm": 1.4628982849173522, + "learning_rate": 
1.6237002984048935e-05, + "loss": 0.6823, + "step": 4284 + }, + { + "epoch": 0.5264774542327068, + "grad_norm": 1.2356942896591545, + "learning_rate": 1.6230591487986247e-05, + "loss": 0.6515, + "step": 4285 + }, + { + "epoch": 0.5266003194495639, + "grad_norm": 1.2756910353108089, + "learning_rate": 1.6224179765564243e-05, + "loss": 0.6163, + "step": 4286 + }, + { + "epoch": 0.526723184666421, + "grad_norm": 1.2890631989360206, + "learning_rate": 1.6217767817962304e-05, + "loss": 0.5493, + "step": 4287 + }, + { + "epoch": 0.526846049883278, + "grad_norm": 1.5569202448667867, + "learning_rate": 1.6211355646359877e-05, + "loss": 0.687, + "step": 4288 + }, + { + "epoch": 0.5269689151001351, + "grad_norm": 1.1947104949601746, + "learning_rate": 1.620494325193643e-05, + "loss": 0.644, + "step": 4289 + }, + { + "epoch": 0.5270917803169922, + "grad_norm": 1.3637536499078662, + "learning_rate": 1.619853063587149e-05, + "loss": 0.6873, + "step": 4290 + }, + { + "epoch": 0.5272146455338493, + "grad_norm": 1.2490680105206218, + "learning_rate": 1.6192117799344606e-05, + "loss": 0.6694, + "step": 4291 + }, + { + "epoch": 0.5273375107507065, + "grad_norm": 1.1396378647238006, + "learning_rate": 1.6185704743535388e-05, + "loss": 0.5226, + "step": 4292 + }, + { + "epoch": 0.5274603759675636, + "grad_norm": 1.3315612668868808, + "learning_rate": 1.6179291469623474e-05, + "loss": 0.6204, + "step": 4293 + }, + { + "epoch": 0.5275832411844207, + "grad_norm": 1.2855158741909696, + "learning_rate": 1.617287797878854e-05, + "loss": 0.5753, + "step": 4294 + }, + { + "epoch": 0.5277061064012778, + "grad_norm": 1.2695168861682755, + "learning_rate": 1.6166464272210304e-05, + "loss": 0.5813, + "step": 4295 + }, + { + "epoch": 0.5278289716181349, + "grad_norm": 1.2075578749079223, + "learning_rate": 1.6160050351068534e-05, + "loss": 0.6223, + "step": 4296 + }, + { + "epoch": 0.527951836834992, + "grad_norm": 1.27065786878236, + "learning_rate": 1.6153636216543027e-05, + "loss": 0.6216, + "step": 4297 + }, + { + "epoch": 0.5280747020518491, + "grad_norm": 1.1258688812265873, + "learning_rate": 1.6147221869813618e-05, + "loss": 0.588, + "step": 4298 + }, + { + "epoch": 0.5281975672687063, + "grad_norm": 1.2375947424094456, + "learning_rate": 1.6140807312060188e-05, + "loss": 0.6862, + "step": 4299 + }, + { + "epoch": 0.5283204324855634, + "grad_norm": 1.1986147765878126, + "learning_rate": 1.613439254446265e-05, + "loss": 0.5202, + "step": 4300 + }, + { + "epoch": 0.5284432977024205, + "grad_norm": 1.224801926170722, + "learning_rate": 1.612797756820096e-05, + "loss": 0.6245, + "step": 4301 + }, + { + "epoch": 0.5285661629192776, + "grad_norm": 1.0716920135295458, + "learning_rate": 1.612156238445511e-05, + "loss": 0.6112, + "step": 4302 + }, + { + "epoch": 0.5286890281361346, + "grad_norm": 1.351474257631185, + "learning_rate": 1.6115146994405133e-05, + "loss": 0.6472, + "step": 4303 + }, + { + "epoch": 0.5288118933529917, + "grad_norm": 1.2684969224289784, + "learning_rate": 1.61087313992311e-05, + "loss": 0.6082, + "step": 4304 + }, + { + "epoch": 0.5289347585698488, + "grad_norm": 1.4350278840200403, + "learning_rate": 1.6102315600113117e-05, + "loss": 0.5351, + "step": 4305 + }, + { + "epoch": 0.529057623786706, + "grad_norm": 1.141318855720664, + "learning_rate": 1.6095899598231324e-05, + "loss": 0.5918, + "step": 4306 + }, + { + "epoch": 0.5291804890035631, + "grad_norm": 1.2230732927058143, + "learning_rate": 1.6089483394765908e-05, + "loss": 0.6722, + "step": 4307 + }, + { + "epoch": 0.5293033542204202, + 
"grad_norm": 1.1797759370218, + "learning_rate": 1.6083066990897094e-05, + "loss": 0.6672, + "step": 4308 + }, + { + "epoch": 0.5294262194372773, + "grad_norm": 1.1961960972027517, + "learning_rate": 1.607665038780513e-05, + "loss": 0.5841, + "step": 4309 + }, + { + "epoch": 0.5295490846541344, + "grad_norm": 1.5341117046590953, + "learning_rate": 1.6070233586670297e-05, + "loss": 0.6685, + "step": 4310 + }, + { + "epoch": 0.5296719498709915, + "grad_norm": 1.7576311580951482, + "learning_rate": 1.606381658867295e-05, + "loss": 0.602, + "step": 4311 + }, + { + "epoch": 0.5297948150878486, + "grad_norm": 1.2082150247145997, + "learning_rate": 1.6057399394993432e-05, + "loss": 0.7483, + "step": 4312 + }, + { + "epoch": 0.5299176803047058, + "grad_norm": 1.252194916994575, + "learning_rate": 1.6050982006812158e-05, + "loss": 0.5279, + "step": 4313 + }, + { + "epoch": 0.5300405455215629, + "grad_norm": 1.3998754622468084, + "learning_rate": 1.6044564425309555e-05, + "loss": 0.5624, + "step": 4314 + }, + { + "epoch": 0.53016341073842, + "grad_norm": 1.2428258220410766, + "learning_rate": 1.6038146651666106e-05, + "loss": 0.621, + "step": 4315 + }, + { + "epoch": 0.5302862759552771, + "grad_norm": 1.3155025574688937, + "learning_rate": 1.603172868706231e-05, + "loss": 0.5579, + "step": 4316 + }, + { + "epoch": 0.5304091411721342, + "grad_norm": 1.2085466825335522, + "learning_rate": 1.6025310532678713e-05, + "loss": 0.5657, + "step": 4317 + }, + { + "epoch": 0.5305320063889912, + "grad_norm": 1.2657685552319962, + "learning_rate": 1.6018892189695893e-05, + "loss": 0.4639, + "step": 4318 + }, + { + "epoch": 0.5306548716058483, + "grad_norm": 1.2467808105787705, + "learning_rate": 1.6012473659294463e-05, + "loss": 0.6422, + "step": 4319 + }, + { + "epoch": 0.5307777368227055, + "grad_norm": 1.2249434080568264, + "learning_rate": 1.6006054942655073e-05, + "loss": 0.6772, + "step": 4320 + }, + { + "epoch": 0.5309006020395626, + "grad_norm": 1.196509489029234, + "learning_rate": 1.5999636040958394e-05, + "loss": 0.609, + "step": 4321 + }, + { + "epoch": 0.5310234672564197, + "grad_norm": 1.161090363154261, + "learning_rate": 1.5993216955385153e-05, + "loss": 0.5201, + "step": 4322 + }, + { + "epoch": 0.5311463324732768, + "grad_norm": 1.3718375753627412, + "learning_rate": 1.598679768711609e-05, + "loss": 0.6162, + "step": 4323 + }, + { + "epoch": 0.5312691976901339, + "grad_norm": 1.2166884511491731, + "learning_rate": 1.5980378237331995e-05, + "loss": 0.6327, + "step": 4324 + }, + { + "epoch": 0.531392062906991, + "grad_norm": 1.2884525282198391, + "learning_rate": 1.597395860721368e-05, + "loss": 0.5387, + "step": 4325 + }, + { + "epoch": 0.5315149281238482, + "grad_norm": 1.1255720210846591, + "learning_rate": 1.5967538797941997e-05, + "loss": 0.6304, + "step": 4326 + }, + { + "epoch": 0.5316377933407053, + "grad_norm": 1.1088334468965502, + "learning_rate": 1.5961118810697824e-05, + "loss": 0.6072, + "step": 4327 + }, + { + "epoch": 0.5317606585575624, + "grad_norm": 1.2610127562711524, + "learning_rate": 1.5954698646662085e-05, + "loss": 0.6328, + "step": 4328 + }, + { + "epoch": 0.5318835237744195, + "grad_norm": 1.1464982787798386, + "learning_rate": 1.5948278307015715e-05, + "loss": 0.557, + "step": 4329 + }, + { + "epoch": 0.5320063889912766, + "grad_norm": 1.394239203512859, + "learning_rate": 1.5941857792939702e-05, + "loss": 0.5645, + "step": 4330 + }, + { + "epoch": 0.5321292542081337, + "grad_norm": 1.5555437794321274, + "learning_rate": 1.593543710561506e-05, + "loss": 0.5723, + 
"step": 4331 + }, + { + "epoch": 0.5322521194249907, + "grad_norm": 1.2095400604865274, + "learning_rate": 1.592901624622282e-05, + "loss": 0.5765, + "step": 4332 + }, + { + "epoch": 0.5323749846418478, + "grad_norm": 1.259032413497801, + "learning_rate": 1.5922595215944072e-05, + "loss": 0.6059, + "step": 4333 + }, + { + "epoch": 0.532497849858705, + "grad_norm": 1.137659986259396, + "learning_rate": 1.591617401595992e-05, + "loss": 0.6136, + "step": 4334 + }, + { + "epoch": 0.5326207150755621, + "grad_norm": 1.3613924049481243, + "learning_rate": 1.5909752647451494e-05, + "loss": 0.5951, + "step": 4335 + }, + { + "epoch": 0.5327435802924192, + "grad_norm": 1.4451758947436337, + "learning_rate": 1.590333111159997e-05, + "loss": 0.5608, + "step": 4336 + }, + { + "epoch": 0.5328664455092763, + "grad_norm": 1.1953242239830473, + "learning_rate": 1.589690940958655e-05, + "loss": 0.5513, + "step": 4337 + }, + { + "epoch": 0.5329893107261334, + "grad_norm": 0.99944216292318, + "learning_rate": 1.5890487542592458e-05, + "loss": 0.4824, + "step": 4338 + }, + { + "epoch": 0.5331121759429905, + "grad_norm": 1.1152496714432332, + "learning_rate": 1.5884065511798957e-05, + "loss": 0.6199, + "step": 4339 + }, + { + "epoch": 0.5332350411598477, + "grad_norm": 1.364669105914537, + "learning_rate": 1.5877643318387338e-05, + "loss": 0.6496, + "step": 4340 + }, + { + "epoch": 0.5333579063767048, + "grad_norm": 1.1414381348561864, + "learning_rate": 1.5871220963538927e-05, + "loss": 0.6158, + "step": 4341 + }, + { + "epoch": 0.5334807715935619, + "grad_norm": 1.205476098846901, + "learning_rate": 1.5864798448435064e-05, + "loss": 0.6449, + "step": 4342 + }, + { + "epoch": 0.533603636810419, + "grad_norm": 1.1515790441114955, + "learning_rate": 1.5858375774257136e-05, + "loss": 0.6841, + "step": 4343 + }, + { + "epoch": 0.5337265020272761, + "grad_norm": 1.1716905336103194, + "learning_rate": 1.585195294218655e-05, + "loss": 0.6434, + "step": 4344 + }, + { + "epoch": 0.5338493672441332, + "grad_norm": 1.1931006330780958, + "learning_rate": 1.584552995340475e-05, + "loss": 0.6173, + "step": 4345 + }, + { + "epoch": 0.5339722324609903, + "grad_norm": 1.373425514395551, + "learning_rate": 1.58391068090932e-05, + "loss": 0.6638, + "step": 4346 + }, + { + "epoch": 0.5340950976778474, + "grad_norm": 1.3432509199282612, + "learning_rate": 1.5832683510433393e-05, + "loss": 0.5642, + "step": 4347 + }, + { + "epoch": 0.5342179628947045, + "grad_norm": 1.3532324317366726, + "learning_rate": 1.582626005860685e-05, + "loss": 0.54, + "step": 4348 + }, + { + "epoch": 0.5343408281115616, + "grad_norm": 1.0839589523668423, + "learning_rate": 1.581983645479513e-05, + "loss": 0.5708, + "step": 4349 + }, + { + "epoch": 0.5344636933284187, + "grad_norm": 1.3819817353416641, + "learning_rate": 1.581341270017981e-05, + "loss": 0.5615, + "step": 4350 + }, + { + "epoch": 0.5345865585452758, + "grad_norm": 1.2314795583657772, + "learning_rate": 1.5806988795942495e-05, + "loss": 0.6574, + "step": 4351 + }, + { + "epoch": 0.5347094237621329, + "grad_norm": 1.1931726245829304, + "learning_rate": 1.580056474326483e-05, + "loss": 0.5724, + "step": 4352 + }, + { + "epoch": 0.53483228897899, + "grad_norm": 1.2948324571557073, + "learning_rate": 1.5794140543328472e-05, + "loss": 0.5544, + "step": 4353 + }, + { + "epoch": 0.5349551541958472, + "grad_norm": 1.3274639917989848, + "learning_rate": 1.5787716197315107e-05, + "loss": 0.5947, + "step": 4354 + }, + { + "epoch": 0.5350780194127043, + "grad_norm": 1.2310755888579925, + "learning_rate": 
1.578129170640646e-05, + "loss": 0.6114, + "step": 4355 + }, + { + "epoch": 0.5352008846295614, + "grad_norm": 1.1032310024156444, + "learning_rate": 1.5774867071784274e-05, + "loss": 0.6043, + "step": 4356 + }, + { + "epoch": 0.5353237498464185, + "grad_norm": 1.2465317001150256, + "learning_rate": 1.5768442294630312e-05, + "loss": 0.5047, + "step": 4357 + }, + { + "epoch": 0.5354466150632756, + "grad_norm": 1.1824330344197964, + "learning_rate": 1.5762017376126372e-05, + "loss": 0.6233, + "step": 4358 + }, + { + "epoch": 0.5355694802801327, + "grad_norm": 1.2558565493164255, + "learning_rate": 1.5755592317454278e-05, + "loss": 0.5619, + "step": 4359 + }, + { + "epoch": 0.5356923454969899, + "grad_norm": 1.230563680110278, + "learning_rate": 1.5749167119795878e-05, + "loss": 0.5895, + "step": 4360 + }, + { + "epoch": 0.5358152107138469, + "grad_norm": 1.6454217636588497, + "learning_rate": 1.574274178433304e-05, + "loss": 0.6129, + "step": 4361 + }, + { + "epoch": 0.535938075930704, + "grad_norm": 1.3735871897081506, + "learning_rate": 1.5736316312247675e-05, + "loss": 0.6368, + "step": 4362 + }, + { + "epoch": 0.5360609411475611, + "grad_norm": 1.1150419574792605, + "learning_rate": 1.5729890704721698e-05, + "loss": 0.5402, + "step": 4363 + }, + { + "epoch": 0.5361838063644182, + "grad_norm": 1.3591394221412354, + "learning_rate": 1.572346496293706e-05, + "loss": 0.5478, + "step": 4364 + }, + { + "epoch": 0.5363066715812753, + "grad_norm": 1.1773946048451065, + "learning_rate": 1.5717039088075728e-05, + "loss": 0.5578, + "step": 4365 + }, + { + "epoch": 0.5364295367981324, + "grad_norm": 1.7098353969847262, + "learning_rate": 1.5710613081319714e-05, + "loss": 0.6479, + "step": 4366 + }, + { + "epoch": 0.5365524020149895, + "grad_norm": 1.3012360290408098, + "learning_rate": 1.5704186943851025e-05, + "loss": 0.6386, + "step": 4367 + }, + { + "epoch": 0.5366752672318467, + "grad_norm": 1.2170361840451445, + "learning_rate": 1.5697760676851717e-05, + "loss": 0.5465, + "step": 4368 + }, + { + "epoch": 0.5367981324487038, + "grad_norm": 1.31871056996393, + "learning_rate": 1.5691334281503858e-05, + "loss": 0.578, + "step": 4369 + }, + { + "epoch": 0.5369209976655609, + "grad_norm": 1.360153677834514, + "learning_rate": 1.5684907758989543e-05, + "loss": 0.5408, + "step": 4370 + }, + { + "epoch": 0.537043862882418, + "grad_norm": 1.3243282796840476, + "learning_rate": 1.567848111049088e-05, + "loss": 0.5268, + "step": 4371 + }, + { + "epoch": 0.5371667280992751, + "grad_norm": 1.2515125885865221, + "learning_rate": 1.5672054337190026e-05, + "loss": 0.6206, + "step": 4372 + }, + { + "epoch": 0.5372895933161322, + "grad_norm": 1.314035487074974, + "learning_rate": 1.5665627440269134e-05, + "loss": 0.5428, + "step": 4373 + }, + { + "epoch": 0.5374124585329894, + "grad_norm": 1.2094874281039014, + "learning_rate": 1.565920042091039e-05, + "loss": 0.5459, + "step": 4374 + }, + { + "epoch": 0.5375353237498465, + "grad_norm": 1.245842235251364, + "learning_rate": 1.5652773280296002e-05, + "loss": 0.5478, + "step": 4375 + }, + { + "epoch": 0.5376581889667035, + "grad_norm": 1.512852440001919, + "learning_rate": 1.5646346019608205e-05, + "loss": 0.587, + "step": 4376 + }, + { + "epoch": 0.5377810541835606, + "grad_norm": 1.130211641181014, + "learning_rate": 1.5639918640029247e-05, + "loss": 0.6153, + "step": 4377 + }, + { + "epoch": 0.5379039194004177, + "grad_norm": 1.3430359473258147, + "learning_rate": 1.5633491142741403e-05, + "loss": 0.6509, + "step": 4378 + }, + { + "epoch": 0.5380267846172748, + 
"grad_norm": 1.2978172385890951, + "learning_rate": 1.5627063528926973e-05, + "loss": 0.6726, + "step": 4379 + }, + { + "epoch": 0.5381496498341319, + "grad_norm": 1.2246496647364546, + "learning_rate": 1.562063579976828e-05, + "loss": 0.5857, + "step": 4380 + }, + { + "epoch": 0.538272515050989, + "grad_norm": 1.2106986189311912, + "learning_rate": 1.561420795644765e-05, + "loss": 0.6691, + "step": 4381 + }, + { + "epoch": 0.5383953802678462, + "grad_norm": 1.5774516780781163, + "learning_rate": 1.560778000014745e-05, + "loss": 0.593, + "step": 4382 + }, + { + "epoch": 0.5385182454847033, + "grad_norm": 1.1914284958071424, + "learning_rate": 1.5601351932050063e-05, + "loss": 0.5734, + "step": 4383 + }, + { + "epoch": 0.5386411107015604, + "grad_norm": 1.3404874320057154, + "learning_rate": 1.5594923753337884e-05, + "loss": 0.6596, + "step": 4384 + }, + { + "epoch": 0.5387639759184175, + "grad_norm": 1.2906440277751678, + "learning_rate": 1.5588495465193345e-05, + "loss": 0.637, + "step": 4385 + }, + { + "epoch": 0.5388868411352746, + "grad_norm": 1.6627706207104231, + "learning_rate": 1.5582067068798873e-05, + "loss": 0.6444, + "step": 4386 + }, + { + "epoch": 0.5390097063521317, + "grad_norm": 1.1652940561602751, + "learning_rate": 1.557563856533695e-05, + "loss": 0.5761, + "step": 4387 + }, + { + "epoch": 0.5391325715689889, + "grad_norm": 1.1149310500983625, + "learning_rate": 1.5569209955990036e-05, + "loss": 0.6227, + "step": 4388 + }, + { + "epoch": 0.539255436785846, + "grad_norm": 1.1684253809898575, + "learning_rate": 1.5562781241940647e-05, + "loss": 0.5255, + "step": 4389 + }, + { + "epoch": 0.539378302002703, + "grad_norm": 1.1017776817192952, + "learning_rate": 1.5556352424371294e-05, + "loss": 0.6245, + "step": 4390 + }, + { + "epoch": 0.5395011672195601, + "grad_norm": 1.2426521268298811, + "learning_rate": 1.5549923504464527e-05, + "loss": 0.6153, + "step": 4391 + }, + { + "epoch": 0.5396240324364172, + "grad_norm": 0.9999074357185077, + "learning_rate": 1.5543494483402894e-05, + "loss": 0.5779, + "step": 4392 + }, + { + "epoch": 0.5397468976532743, + "grad_norm": 1.2701069339566733, + "learning_rate": 1.5537065362368977e-05, + "loss": 0.6407, + "step": 4393 + }, + { + "epoch": 0.5398697628701314, + "grad_norm": 1.5718431857727055, + "learning_rate": 1.553063614254537e-05, + "loss": 0.6242, + "step": 4394 + }, + { + "epoch": 0.5399926280869886, + "grad_norm": 1.3440920024964829, + "learning_rate": 1.5524206825114685e-05, + "loss": 0.5724, + "step": 4395 + }, + { + "epoch": 0.5401154933038457, + "grad_norm": 1.7663912998093592, + "learning_rate": 1.551777741125955e-05, + "loss": 0.5845, + "step": 4396 + }, + { + "epoch": 0.5402383585207028, + "grad_norm": 1.399582253014718, + "learning_rate": 1.5511347902162622e-05, + "loss": 0.56, + "step": 4397 + }, + { + "epoch": 0.5403612237375599, + "grad_norm": 1.4214701523303708, + "learning_rate": 1.5504918299006564e-05, + "loss": 0.6186, + "step": 4398 + }, + { + "epoch": 0.540484088954417, + "grad_norm": 1.3954991674110218, + "learning_rate": 1.549848860297406e-05, + "loss": 0.546, + "step": 4399 + }, + { + "epoch": 0.5406069541712741, + "grad_norm": 1.1911242466758956, + "learning_rate": 1.5492058815247804e-05, + "loss": 0.7685, + "step": 4400 + }, + { + "epoch": 0.5407298193881312, + "grad_norm": 1.4780228560256317, + "learning_rate": 1.548562893701053e-05, + "loss": 0.6962, + "step": 4401 + }, + { + "epoch": 0.5408526846049884, + "grad_norm": 1.4142146552519128, + "learning_rate": 1.5479198969444956e-05, + "loss": 0.6742, + 
"step": 4402 + }, + { + "epoch": 0.5409755498218455, + "grad_norm": 1.3956439994195429, + "learning_rate": 1.547276891373384e-05, + "loss": 0.7088, + "step": 4403 + }, + { + "epoch": 0.5410984150387026, + "grad_norm": 1.2921985467779908, + "learning_rate": 1.546633877105995e-05, + "loss": 0.5059, + "step": 4404 + }, + { + "epoch": 0.5412212802555596, + "grad_norm": 1.1689434958723093, + "learning_rate": 1.5459908542606066e-05, + "loss": 0.5953, + "step": 4405 + }, + { + "epoch": 0.5413441454724167, + "grad_norm": 1.3575765468504395, + "learning_rate": 1.545347822955499e-05, + "loss": 0.6937, + "step": 4406 + }, + { + "epoch": 0.5414670106892738, + "grad_norm": 1.4207492356528686, + "learning_rate": 1.544704783308953e-05, + "loss": 0.6077, + "step": 4407 + }, + { + "epoch": 0.541589875906131, + "grad_norm": 1.3673254576347835, + "learning_rate": 1.5440617354392526e-05, + "loss": 0.5699, + "step": 4408 + }, + { + "epoch": 0.5417127411229881, + "grad_norm": 1.4061306435221461, + "learning_rate": 1.5434186794646813e-05, + "loss": 0.6409, + "step": 4409 + }, + { + "epoch": 0.5418356063398452, + "grad_norm": 1.23927203093729, + "learning_rate": 1.5427756155035257e-05, + "loss": 0.562, + "step": 4410 + }, + { + "epoch": 0.5419584715567023, + "grad_norm": 1.1083953025537885, + "learning_rate": 1.5421325436740734e-05, + "loss": 0.6036, + "step": 4411 + }, + { + "epoch": 0.5420813367735594, + "grad_norm": 1.125628381479355, + "learning_rate": 1.5414894640946122e-05, + "loss": 0.6061, + "step": 4412 + }, + { + "epoch": 0.5422042019904165, + "grad_norm": 1.2102061137452074, + "learning_rate": 1.5408463768834336e-05, + "loss": 0.6162, + "step": 4413 + }, + { + "epoch": 0.5423270672072736, + "grad_norm": 1.0953672841397657, + "learning_rate": 1.5402032821588288e-05, + "loss": 0.5151, + "step": 4414 + }, + { + "epoch": 0.5424499324241308, + "grad_norm": 1.170885039744495, + "learning_rate": 1.5395601800390907e-05, + "loss": 0.6533, + "step": 4415 + }, + { + "epoch": 0.5425727976409879, + "grad_norm": 1.1843601345447476, + "learning_rate": 1.5389170706425142e-05, + "loss": 0.5613, + "step": 4416 + }, + { + "epoch": 0.542695662857845, + "grad_norm": 0.9852256196135046, + "learning_rate": 1.538273954087395e-05, + "loss": 0.5814, + "step": 4417 + }, + { + "epoch": 0.5428185280747021, + "grad_norm": 1.2138245101233558, + "learning_rate": 1.5376308304920303e-05, + "loss": 0.4784, + "step": 4418 + }, + { + "epoch": 0.5429413932915591, + "grad_norm": 1.2675575703041655, + "learning_rate": 1.536987699974718e-05, + "loss": 0.5928, + "step": 4419 + }, + { + "epoch": 0.5430642585084162, + "grad_norm": 1.4904067597884545, + "learning_rate": 1.536344562653759e-05, + "loss": 0.5469, + "step": 4420 + }, + { + "epoch": 0.5431871237252733, + "grad_norm": 1.2706073890078255, + "learning_rate": 1.5357014186474527e-05, + "loss": 0.4528, + "step": 4421 + }, + { + "epoch": 0.5433099889421305, + "grad_norm": 1.3660182021668983, + "learning_rate": 1.5350582680741022e-05, + "loss": 0.6297, + "step": 4422 + }, + { + "epoch": 0.5434328541589876, + "grad_norm": 1.3060323564396317, + "learning_rate": 1.5344151110520104e-05, + "loss": 0.6393, + "step": 4423 + }, + { + "epoch": 0.5435557193758447, + "grad_norm": 1.4459814644388398, + "learning_rate": 1.533771947699482e-05, + "loss": 0.6569, + "step": 4424 + }, + { + "epoch": 0.5436785845927018, + "grad_norm": 1.7803595888923265, + "learning_rate": 1.5331287781348234e-05, + "loss": 0.5763, + "step": 4425 + }, + { + "epoch": 0.5438014498095589, + "grad_norm": 1.0900153088744446, + 
"learning_rate": 1.53248560247634e-05, + "loss": 0.6526, + "step": 4426 + }, + { + "epoch": 0.543924315026416, + "grad_norm": 1.3829222922096918, + "learning_rate": 1.5318424208423415e-05, + "loss": 0.5478, + "step": 4427 + }, + { + "epoch": 0.5440471802432731, + "grad_norm": 1.1468478361051782, + "learning_rate": 1.531199233351136e-05, + "loss": 0.682, + "step": 4428 + }, + { + "epoch": 0.5441700454601303, + "grad_norm": 1.215359473410985, + "learning_rate": 1.5305560401210337e-05, + "loss": 0.5529, + "step": 4429 + }, + { + "epoch": 0.5442929106769874, + "grad_norm": 1.14155507189865, + "learning_rate": 1.5299128412703465e-05, + "loss": 0.4978, + "step": 4430 + }, + { + "epoch": 0.5444157758938445, + "grad_norm": 1.206657659405935, + "learning_rate": 1.5292696369173858e-05, + "loss": 0.716, + "step": 4431 + }, + { + "epoch": 0.5445386411107016, + "grad_norm": 1.2537533077007825, + "learning_rate": 1.5286264271804648e-05, + "loss": 0.5981, + "step": 4432 + }, + { + "epoch": 0.5446615063275587, + "grad_norm": 1.2922995084052475, + "learning_rate": 1.5279832121778987e-05, + "loss": 0.573, + "step": 4433 + }, + { + "epoch": 0.5447843715444157, + "grad_norm": 1.2324543411221631, + "learning_rate": 1.527339992028002e-05, + "loss": 0.6387, + "step": 4434 + }, + { + "epoch": 0.5449072367612728, + "grad_norm": 1.1123539861488962, + "learning_rate": 1.5266967668490912e-05, + "loss": 0.5635, + "step": 4435 + }, + { + "epoch": 0.54503010197813, + "grad_norm": 1.4280701949006696, + "learning_rate": 1.526053536759483e-05, + "loss": 0.6037, + "step": 4436 + }, + { + "epoch": 0.5451529671949871, + "grad_norm": 1.3604831566128586, + "learning_rate": 1.525410301877496e-05, + "loss": 0.5584, + "step": 4437 + }, + { + "epoch": 0.5452758324118442, + "grad_norm": 1.1648713619844802, + "learning_rate": 1.5247670623214484e-05, + "loss": 0.5319, + "step": 4438 + }, + { + "epoch": 0.5453986976287013, + "grad_norm": 1.175405421343064, + "learning_rate": 1.5241238182096606e-05, + "loss": 0.4967, + "step": 4439 + }, + { + "epoch": 0.5455215628455584, + "grad_norm": 1.1581257186237395, + "learning_rate": 1.5234805696604531e-05, + "loss": 0.6363, + "step": 4440 + }, + { + "epoch": 0.5456444280624155, + "grad_norm": 1.5628755360021496, + "learning_rate": 1.5228373167921469e-05, + "loss": 0.5712, + "step": 4441 + }, + { + "epoch": 0.5457672932792726, + "grad_norm": 1.517255865804262, + "learning_rate": 1.5221940597230639e-05, + "loss": 0.7081, + "step": 4442 + }, + { + "epoch": 0.5458901584961298, + "grad_norm": 1.1065633894569462, + "learning_rate": 1.5215507985715283e-05, + "loss": 0.5971, + "step": 4443 + }, + { + "epoch": 0.5460130237129869, + "grad_norm": 1.2370471596993637, + "learning_rate": 1.5209075334558625e-05, + "loss": 0.5096, + "step": 4444 + }, + { + "epoch": 0.546135888929844, + "grad_norm": 1.2961573522054295, + "learning_rate": 1.5202642644943914e-05, + "loss": 0.5963, + "step": 4445 + }, + { + "epoch": 0.5462587541467011, + "grad_norm": 1.1793622871558542, + "learning_rate": 1.5196209918054408e-05, + "loss": 0.6929, + "step": 4446 + }, + { + "epoch": 0.5463816193635582, + "grad_norm": 1.1690105854183774, + "learning_rate": 1.5189777155073354e-05, + "loss": 0.5404, + "step": 4447 + }, + { + "epoch": 0.5465044845804153, + "grad_norm": 1.2119655850553628, + "learning_rate": 1.5183344357184032e-05, + "loss": 0.6045, + "step": 4448 + }, + { + "epoch": 0.5466273497972723, + "grad_norm": 1.3773291861084984, + "learning_rate": 1.5176911525569699e-05, + "loss": 0.6, + "step": 4449 + }, + { + "epoch": 
0.5467502150141295, + "grad_norm": 1.162922522027371, + "learning_rate": 1.517047866141364e-05, + "loss": 0.5384, + "step": 4450 + }, + { + "epoch": 0.5468730802309866, + "grad_norm": 1.183119167612791, + "learning_rate": 1.5164045765899133e-05, + "loss": 0.5471, + "step": 4451 + }, + { + "epoch": 0.5469959454478437, + "grad_norm": 1.1165821533359752, + "learning_rate": 1.5157612840209477e-05, + "loss": 0.5699, + "step": 4452 + }, + { + "epoch": 0.5471188106647008, + "grad_norm": 1.3353573976513766, + "learning_rate": 1.5151179885527954e-05, + "loss": 0.6006, + "step": 4453 + }, + { + "epoch": 0.5472416758815579, + "grad_norm": 1.0527357481281037, + "learning_rate": 1.5144746903037876e-05, + "loss": 0.5808, + "step": 4454 + }, + { + "epoch": 0.547364541098415, + "grad_norm": 1.14211966998034, + "learning_rate": 1.5138313893922542e-05, + "loss": 0.6287, + "step": 4455 + }, + { + "epoch": 0.5474874063152722, + "grad_norm": 1.194663339750787, + "learning_rate": 1.5131880859365268e-05, + "loss": 0.6798, + "step": 4456 + }, + { + "epoch": 0.5476102715321293, + "grad_norm": 1.3511152213859463, + "learning_rate": 1.5125447800549357e-05, + "loss": 0.589, + "step": 4457 + }, + { + "epoch": 0.5477331367489864, + "grad_norm": 1.1438649063757471, + "learning_rate": 1.5119014718658147e-05, + "loss": 0.5705, + "step": 4458 + }, + { + "epoch": 0.5478560019658435, + "grad_norm": 1.1416498969775781, + "learning_rate": 1.5112581614874946e-05, + "loss": 0.5874, + "step": 4459 + }, + { + "epoch": 0.5479788671827006, + "grad_norm": 1.2984677487773286, + "learning_rate": 1.5106148490383091e-05, + "loss": 0.6716, + "step": 4460 + }, + { + "epoch": 0.5481017323995577, + "grad_norm": 1.428721142160359, + "learning_rate": 1.5099715346365902e-05, + "loss": 0.7229, + "step": 4461 + }, + { + "epoch": 0.5482245976164148, + "grad_norm": 1.401384278623002, + "learning_rate": 1.5093282184006728e-05, + "loss": 0.6353, + "step": 4462 + }, + { + "epoch": 0.5483474628332718, + "grad_norm": 1.2188452910503562, + "learning_rate": 1.5086849004488897e-05, + "loss": 0.6321, + "step": 4463 + }, + { + "epoch": 0.548470328050129, + "grad_norm": 1.371623706752202, + "learning_rate": 1.508041580899576e-05, + "loss": 0.6799, + "step": 4464 + }, + { + "epoch": 0.5485931932669861, + "grad_norm": 1.1664618542606606, + "learning_rate": 1.507398259871065e-05, + "loss": 0.7031, + "step": 4465 + }, + { + "epoch": 0.5487160584838432, + "grad_norm": 1.1228984135439792, + "learning_rate": 1.5067549374816924e-05, + "loss": 0.7082, + "step": 4466 + }, + { + "epoch": 0.5488389237007003, + "grad_norm": 1.3549127989268441, + "learning_rate": 1.506111613849793e-05, + "loss": 0.5885, + "step": 4467 + }, + { + "epoch": 0.5489617889175574, + "grad_norm": 1.0935857124271429, + "learning_rate": 1.5054682890937019e-05, + "loss": 0.5847, + "step": 4468 + }, + { + "epoch": 0.5490846541344145, + "grad_norm": 1.2498660503901702, + "learning_rate": 1.5048249633317546e-05, + "loss": 0.5746, + "step": 4469 + }, + { + "epoch": 0.5492075193512717, + "grad_norm": 1.545086196716166, + "learning_rate": 1.5041816366822859e-05, + "loss": 0.5565, + "step": 4470 + }, + { + "epoch": 0.5493303845681288, + "grad_norm": 1.4590704634778577, + "learning_rate": 1.503538309263633e-05, + "loss": 0.6359, + "step": 4471 + }, + { + "epoch": 0.5494532497849859, + "grad_norm": 1.162559011426625, + "learning_rate": 1.5028949811941304e-05, + "loss": 0.6319, + "step": 4472 + }, + { + "epoch": 0.549576115001843, + "grad_norm": 1.1757854505504235, + "learning_rate": 1.5022516525921152e-05, + 
"loss": 0.5699, + "step": 4473 + }, + { + "epoch": 0.5496989802187001, + "grad_norm": 1.232704257692557, + "learning_rate": 1.5016083235759227e-05, + "loss": 0.6501, + "step": 4474 + }, + { + "epoch": 0.5498218454355572, + "grad_norm": 1.1945508405090481, + "learning_rate": 1.5009649942638901e-05, + "loss": 0.595, + "step": 4475 + }, + { + "epoch": 0.5499447106524143, + "grad_norm": 1.4177657056089086, + "learning_rate": 1.5003216647743528e-05, + "loss": 0.5617, + "step": 4476 + }, + { + "epoch": 0.5500675758692715, + "grad_norm": 1.0908269820597043, + "learning_rate": 1.4996783352256473e-05, + "loss": 0.547, + "step": 4477 + }, + { + "epoch": 0.5501904410861285, + "grad_norm": 1.2935883696934694, + "learning_rate": 1.4990350057361101e-05, + "loss": 0.5732, + "step": 4478 + }, + { + "epoch": 0.5503133063029856, + "grad_norm": 1.325777519539606, + "learning_rate": 1.4983916764240773e-05, + "loss": 0.6185, + "step": 4479 + }, + { + "epoch": 0.5504361715198427, + "grad_norm": 1.5744817184129773, + "learning_rate": 1.4977483474078852e-05, + "loss": 0.5838, + "step": 4480 + }, + { + "epoch": 0.5505590367366998, + "grad_norm": 1.1944680118221014, + "learning_rate": 1.4971050188058697e-05, + "loss": 0.5889, + "step": 4481 + }, + { + "epoch": 0.5506819019535569, + "grad_norm": 1.3516922445194715, + "learning_rate": 1.4964616907363675e-05, + "loss": 0.5654, + "step": 4482 + }, + { + "epoch": 0.550804767170414, + "grad_norm": 1.286874988876658, + "learning_rate": 1.4958183633177142e-05, + "loss": 0.5751, + "step": 4483 + }, + { + "epoch": 0.5509276323872712, + "grad_norm": 1.26623507775261, + "learning_rate": 1.4951750366682462e-05, + "loss": 0.5395, + "step": 4484 + }, + { + "epoch": 0.5510504976041283, + "grad_norm": 1.3045348779048043, + "learning_rate": 1.4945317109062985e-05, + "loss": 0.5404, + "step": 4485 + }, + { + "epoch": 0.5511733628209854, + "grad_norm": 1.1814815414830238, + "learning_rate": 1.4938883861502073e-05, + "loss": 0.6088, + "step": 4486 + }, + { + "epoch": 0.5512962280378425, + "grad_norm": 1.4403489615965988, + "learning_rate": 1.493245062518308e-05, + "loss": 0.7054, + "step": 4487 + }, + { + "epoch": 0.5514190932546996, + "grad_norm": 1.2478655198219715, + "learning_rate": 1.4926017401289349e-05, + "loss": 0.6761, + "step": 4488 + }, + { + "epoch": 0.5515419584715567, + "grad_norm": 1.119638596721304, + "learning_rate": 1.4919584191004244e-05, + "loss": 0.6522, + "step": 4489 + }, + { + "epoch": 0.5516648236884139, + "grad_norm": 1.268366322793554, + "learning_rate": 1.4913150995511104e-05, + "loss": 0.6541, + "step": 4490 + }, + { + "epoch": 0.551787688905271, + "grad_norm": 1.0876933348766653, + "learning_rate": 1.4906717815993278e-05, + "loss": 0.6508, + "step": 4491 + }, + { + "epoch": 0.551910554122128, + "grad_norm": 1.1923919935040954, + "learning_rate": 1.4900284653634095e-05, + "loss": 0.629, + "step": 4492 + }, + { + "epoch": 0.5520334193389851, + "grad_norm": 1.2021297162767905, + "learning_rate": 1.4893851509616913e-05, + "loss": 0.598, + "step": 4493 + }, + { + "epoch": 0.5521562845558422, + "grad_norm": 1.3208219585946581, + "learning_rate": 1.4887418385125056e-05, + "loss": 0.6449, + "step": 4494 + }, + { + "epoch": 0.5522791497726993, + "grad_norm": 1.174668382137461, + "learning_rate": 1.4880985281341855e-05, + "loss": 0.6805, + "step": 4495 + }, + { + "epoch": 0.5524020149895564, + "grad_norm": 1.265904037801625, + "learning_rate": 1.487455219945064e-05, + "loss": 0.5479, + "step": 4496 + }, + { + "epoch": 0.5525248802064135, + "grad_norm": 
1.2566706155081635, + "learning_rate": 1.4868119140634736e-05, + "loss": 0.6049, + "step": 4497 + }, + { + "epoch": 0.5526477454232707, + "grad_norm": 1.0764738805760177, + "learning_rate": 1.4861686106077462e-05, + "loss": 0.5802, + "step": 4498 + }, + { + "epoch": 0.5527706106401278, + "grad_norm": 1.5083802842500211, + "learning_rate": 1.485525309696213e-05, + "loss": 0.6465, + "step": 4499 + }, + { + "epoch": 0.5528934758569849, + "grad_norm": 1.5891397820528708, + "learning_rate": 1.4848820114472045e-05, + "loss": 0.6226, + "step": 4500 + }, + { + "epoch": 0.553016341073842, + "grad_norm": 1.6451756666838886, + "learning_rate": 1.4842387159790527e-05, + "loss": 0.7007, + "step": 4501 + }, + { + "epoch": 0.5531392062906991, + "grad_norm": 1.2864200030059876, + "learning_rate": 1.483595423410087e-05, + "loss": 0.5832, + "step": 4502 + }, + { + "epoch": 0.5532620715075562, + "grad_norm": 1.3253782991107685, + "learning_rate": 1.4829521338586367e-05, + "loss": 0.5811, + "step": 4503 + }, + { + "epoch": 0.5533849367244134, + "grad_norm": 1.2203832090377358, + "learning_rate": 1.4823088474430304e-05, + "loss": 0.5657, + "step": 4504 + }, + { + "epoch": 0.5535078019412705, + "grad_norm": 1.384674791685188, + "learning_rate": 1.4816655642815972e-05, + "loss": 0.6722, + "step": 4505 + }, + { + "epoch": 0.5536306671581276, + "grad_norm": 1.2188641886672031, + "learning_rate": 1.4810222844926647e-05, + "loss": 0.5869, + "step": 4506 + }, + { + "epoch": 0.5537535323749846, + "grad_norm": 1.2759091005276342, + "learning_rate": 1.4803790081945597e-05, + "loss": 0.6597, + "step": 4507 + }, + { + "epoch": 0.5538763975918417, + "grad_norm": 1.1807328860881836, + "learning_rate": 1.4797357355056085e-05, + "loss": 0.5196, + "step": 4508 + }, + { + "epoch": 0.5539992628086988, + "grad_norm": 1.4550087236675344, + "learning_rate": 1.4790924665441379e-05, + "loss": 0.7073, + "step": 4509 + }, + { + "epoch": 0.5541221280255559, + "grad_norm": 1.20617484013702, + "learning_rate": 1.4784492014284723e-05, + "loss": 0.5589, + "step": 4510 + }, + { + "epoch": 0.554244993242413, + "grad_norm": 1.1850610009161269, + "learning_rate": 1.4778059402769358e-05, + "loss": 0.6433, + "step": 4511 + }, + { + "epoch": 0.5543678584592702, + "grad_norm": 1.3111248328173564, + "learning_rate": 1.4771626832078534e-05, + "loss": 0.5832, + "step": 4512 + }, + { + "epoch": 0.5544907236761273, + "grad_norm": 1.232259243918243, + "learning_rate": 1.4765194303395473e-05, + "loss": 0.6259, + "step": 4513 + }, + { + "epoch": 0.5546135888929844, + "grad_norm": 1.3562104534888184, + "learning_rate": 1.4758761817903396e-05, + "loss": 0.6329, + "step": 4514 + }, + { + "epoch": 0.5547364541098415, + "grad_norm": 1.1149645080830672, + "learning_rate": 1.4752329376785516e-05, + "loss": 0.6298, + "step": 4515 + }, + { + "epoch": 0.5548593193266986, + "grad_norm": 1.3777037703290833, + "learning_rate": 1.4745896981225043e-05, + "loss": 0.666, + "step": 4516 + }, + { + "epoch": 0.5549821845435557, + "grad_norm": 1.2708174266762104, + "learning_rate": 1.4739464632405173e-05, + "loss": 0.5696, + "step": 4517 + }, + { + "epoch": 0.5551050497604129, + "grad_norm": 1.2855325492775243, + "learning_rate": 1.4733032331509094e-05, + "loss": 0.5785, + "step": 4518 + }, + { + "epoch": 0.55522791497727, + "grad_norm": 1.162759920514632, + "learning_rate": 1.472660007971998e-05, + "loss": 0.5917, + "step": 4519 + }, + { + "epoch": 0.5553507801941271, + "grad_norm": 1.1289066739806524, + "learning_rate": 1.4720167878221014e-05, + "loss": 0.5899, + "step": 
4520 + }, + { + "epoch": 0.5554736454109841, + "grad_norm": 1.1573277873786398, + "learning_rate": 1.4713735728195353e-05, + "loss": 0.5714, + "step": 4521 + }, + { + "epoch": 0.5555965106278412, + "grad_norm": 1.2739878873869095, + "learning_rate": 1.4707303630826148e-05, + "loss": 0.5278, + "step": 4522 + }, + { + "epoch": 0.5557193758446983, + "grad_norm": 1.123043148241051, + "learning_rate": 1.4700871587296539e-05, + "loss": 0.7693, + "step": 4523 + }, + { + "epoch": 0.5558422410615554, + "grad_norm": 1.1383851225677568, + "learning_rate": 1.4694439598789664e-05, + "loss": 0.5888, + "step": 4524 + }, + { + "epoch": 0.5559651062784126, + "grad_norm": 1.4281677771852896, + "learning_rate": 1.4688007666488645e-05, + "loss": 0.6148, + "step": 4525 + }, + { + "epoch": 0.5560879714952697, + "grad_norm": 1.2098656192638517, + "learning_rate": 1.468157579157659e-05, + "loss": 0.6231, + "step": 4526 + }, + { + "epoch": 0.5562108367121268, + "grad_norm": 1.2356727598940476, + "learning_rate": 1.4675143975236599e-05, + "loss": 0.5427, + "step": 4527 + }, + { + "epoch": 0.5563337019289839, + "grad_norm": 1.3057850100461088, + "learning_rate": 1.4668712218651772e-05, + "loss": 0.6102, + "step": 4528 + }, + { + "epoch": 0.556456567145841, + "grad_norm": 1.364942198914135, + "learning_rate": 1.4662280523005185e-05, + "loss": 0.6551, + "step": 4529 + }, + { + "epoch": 0.5565794323626981, + "grad_norm": 1.1478400103045578, + "learning_rate": 1.4655848889479897e-05, + "loss": 0.54, + "step": 4530 + }, + { + "epoch": 0.5567022975795552, + "grad_norm": 1.155880125066274, + "learning_rate": 1.4649417319258982e-05, + "loss": 0.5119, + "step": 4531 + }, + { + "epoch": 0.5568251627964124, + "grad_norm": 1.4265257834225555, + "learning_rate": 1.4642985813525477e-05, + "loss": 0.7146, + "step": 4532 + }, + { + "epoch": 0.5569480280132695, + "grad_norm": 1.3358597868199606, + "learning_rate": 1.4636554373462416e-05, + "loss": 0.7056, + "step": 4533 + }, + { + "epoch": 0.5570708932301266, + "grad_norm": 1.374944699290895, + "learning_rate": 1.463012300025282e-05, + "loss": 0.7309, + "step": 4534 + }, + { + "epoch": 0.5571937584469837, + "grad_norm": 1.6185247202588084, + "learning_rate": 1.4623691695079698e-05, + "loss": 0.685, + "step": 4535 + }, + { + "epoch": 0.5573166236638407, + "grad_norm": 1.3257000258769485, + "learning_rate": 1.4617260459126053e-05, + "loss": 0.7105, + "step": 4536 + }, + { + "epoch": 0.5574394888806978, + "grad_norm": 1.0837295298939489, + "learning_rate": 1.461082929357486e-05, + "loss": 0.5391, + "step": 4537 + }, + { + "epoch": 0.557562354097555, + "grad_norm": 1.1480056822113902, + "learning_rate": 1.4604398199609092e-05, + "loss": 0.5685, + "step": 4538 + }, + { + "epoch": 0.5576852193144121, + "grad_norm": 1.1417248832046307, + "learning_rate": 1.4597967178411715e-05, + "loss": 0.6008, + "step": 4539 + }, + { + "epoch": 0.5578080845312692, + "grad_norm": 1.2106735754832245, + "learning_rate": 1.4591536231165668e-05, + "loss": 0.6936, + "step": 4540 + }, + { + "epoch": 0.5579309497481263, + "grad_norm": 1.272929733840975, + "learning_rate": 1.4585105359053882e-05, + "loss": 0.5872, + "step": 4541 + }, + { + "epoch": 0.5580538149649834, + "grad_norm": 0.9631214973392644, + "learning_rate": 1.4578674563259272e-05, + "loss": 0.6436, + "step": 4542 + }, + { + "epoch": 0.5581766801818405, + "grad_norm": 1.2266359810138767, + "learning_rate": 1.4572243844964745e-05, + "loss": 0.6264, + "step": 4543 + }, + { + "epoch": 0.5582995453986976, + "grad_norm": 1.275499027441949, + 
"learning_rate": 1.4565813205353191e-05, + "loss": 0.6734, + "step": 4544 + }, + { + "epoch": 0.5584224106155548, + "grad_norm": 0.9970198864079293, + "learning_rate": 1.455938264560748e-05, + "loss": 0.6064, + "step": 4545 + }, + { + "epoch": 0.5585452758324119, + "grad_norm": 1.5746069220121484, + "learning_rate": 1.455295216691047e-05, + "loss": 0.6782, + "step": 4546 + }, + { + "epoch": 0.558668141049269, + "grad_norm": 1.3137406249270493, + "learning_rate": 1.4546521770445014e-05, + "loss": 0.5421, + "step": 4547 + }, + { + "epoch": 0.5587910062661261, + "grad_norm": 1.4233865667752914, + "learning_rate": 1.4540091457393938e-05, + "loss": 0.5484, + "step": 4548 + }, + { + "epoch": 0.5589138714829832, + "grad_norm": 1.2965086226494713, + "learning_rate": 1.4533661228940056e-05, + "loss": 0.5922, + "step": 4549 + }, + { + "epoch": 0.5590367366998402, + "grad_norm": 1.337520201899617, + "learning_rate": 1.452723108626616e-05, + "loss": 0.5756, + "step": 4550 + }, + { + "epoch": 0.5591596019166973, + "grad_norm": 1.1384019680587942, + "learning_rate": 1.4520801030555044e-05, + "loss": 0.6657, + "step": 4551 + }, + { + "epoch": 0.5592824671335545, + "grad_norm": 1.2969664976338569, + "learning_rate": 1.4514371062989473e-05, + "loss": 0.6495, + "step": 4552 + }, + { + "epoch": 0.5594053323504116, + "grad_norm": 1.2762865956281078, + "learning_rate": 1.4507941184752195e-05, + "loss": 0.6123, + "step": 4553 + }, + { + "epoch": 0.5595281975672687, + "grad_norm": 1.2175136283511327, + "learning_rate": 1.4501511397025943e-05, + "loss": 0.5882, + "step": 4554 + }, + { + "epoch": 0.5596510627841258, + "grad_norm": 1.2609843182602287, + "learning_rate": 1.449508170099344e-05, + "loss": 0.4765, + "step": 4555 + }, + { + "epoch": 0.5597739280009829, + "grad_norm": 1.3208467243896405, + "learning_rate": 1.4488652097837384e-05, + "loss": 0.6253, + "step": 4556 + }, + { + "epoch": 0.55989679321784, + "grad_norm": 1.2000709539019305, + "learning_rate": 1.4482222588740448e-05, + "loss": 0.6746, + "step": 4557 + }, + { + "epoch": 0.5600196584346971, + "grad_norm": 1.3349090887767852, + "learning_rate": 1.447579317488532e-05, + "loss": 0.605, + "step": 4558 + }, + { + "epoch": 0.5601425236515543, + "grad_norm": 1.4191723639137879, + "learning_rate": 1.4469363857454635e-05, + "loss": 0.6902, + "step": 4559 + }, + { + "epoch": 0.5602653888684114, + "grad_norm": 1.2947562089516875, + "learning_rate": 1.4462934637631027e-05, + "loss": 0.584, + "step": 4560 + }, + { + "epoch": 0.5603882540852685, + "grad_norm": 1.3725611456393727, + "learning_rate": 1.4456505516597107e-05, + "loss": 0.5444, + "step": 4561 + }, + { + "epoch": 0.5605111193021256, + "grad_norm": 1.2165606450914632, + "learning_rate": 1.4450076495535477e-05, + "loss": 0.6111, + "step": 4562 + }, + { + "epoch": 0.5606339845189827, + "grad_norm": 1.173057682450614, + "learning_rate": 1.4443647575628707e-05, + "loss": 0.5474, + "step": 4563 + }, + { + "epoch": 0.5607568497358398, + "grad_norm": 1.7025442401552386, + "learning_rate": 1.443721875805936e-05, + "loss": 0.5414, + "step": 4564 + }, + { + "epoch": 0.5608797149526968, + "grad_norm": 1.1854205065105832, + "learning_rate": 1.4430790044009965e-05, + "loss": 0.6657, + "step": 4565 + }, + { + "epoch": 0.561002580169554, + "grad_norm": 1.2488439286655864, + "learning_rate": 1.4424361434663057e-05, + "loss": 0.5772, + "step": 4566 + }, + { + "epoch": 0.5611254453864111, + "grad_norm": 1.3154482485603118, + "learning_rate": 1.4417932931201126e-05, + "loss": 0.6389, + "step": 4567 + }, + { + "epoch": 
0.5612483106032682, + "grad_norm": 1.5692632570005356, + "learning_rate": 1.4411504534806662e-05, + "loss": 0.5968, + "step": 4568 + }, + { + "epoch": 0.5613711758201253, + "grad_norm": 1.5573175195292288, + "learning_rate": 1.4405076246662113e-05, + "loss": 0.6872, + "step": 4569 + }, + { + "epoch": 0.5614940410369824, + "grad_norm": 1.2836386837143907, + "learning_rate": 1.439864806794994e-05, + "loss": 0.6585, + "step": 4570 + }, + { + "epoch": 0.5616169062538395, + "grad_norm": 1.3688507275269965, + "learning_rate": 1.4392219999852552e-05, + "loss": 0.4901, + "step": 4571 + }, + { + "epoch": 0.5617397714706966, + "grad_norm": 1.19968766618538, + "learning_rate": 1.4385792043552354e-05, + "loss": 0.5083, + "step": 4572 + }, + { + "epoch": 0.5618626366875538, + "grad_norm": 1.1202824634002855, + "learning_rate": 1.4379364200231724e-05, + "loss": 0.6051, + "step": 4573 + }, + { + "epoch": 0.5619855019044109, + "grad_norm": 1.40346823691081, + "learning_rate": 1.4372936471073028e-05, + "loss": 0.5536, + "step": 4574 + }, + { + "epoch": 0.562108367121268, + "grad_norm": 1.1171922075885992, + "learning_rate": 1.43665088572586e-05, + "loss": 0.6133, + "step": 4575 + }, + { + "epoch": 0.5622312323381251, + "grad_norm": 1.289845147256014, + "learning_rate": 1.4360081359970755e-05, + "loss": 0.5273, + "step": 4576 + }, + { + "epoch": 0.5623540975549822, + "grad_norm": 1.1829108444469658, + "learning_rate": 1.4353653980391799e-05, + "loss": 0.5498, + "step": 4577 + }, + { + "epoch": 0.5624769627718393, + "grad_norm": 1.219701943325564, + "learning_rate": 1.4347226719704e-05, + "loss": 0.5624, + "step": 4578 + }, + { + "epoch": 0.5625998279886965, + "grad_norm": 1.6466855801721787, + "learning_rate": 1.4340799579089615e-05, + "loss": 0.6446, + "step": 4579 + }, + { + "epoch": 0.5627226932055535, + "grad_norm": 1.2752422903096396, + "learning_rate": 1.4334372559730867e-05, + "loss": 0.577, + "step": 4580 + }, + { + "epoch": 0.5628455584224106, + "grad_norm": 1.2948668744389487, + "learning_rate": 1.4327945662809975e-05, + "loss": 0.5539, + "step": 4581 + }, + { + "epoch": 0.5629684236392677, + "grad_norm": 1.220509798863145, + "learning_rate": 1.4321518889509118e-05, + "loss": 0.6998, + "step": 4582 + }, + { + "epoch": 0.5630912888561248, + "grad_norm": 1.3211875168554499, + "learning_rate": 1.4315092241010465e-05, + "loss": 0.5931, + "step": 4583 + }, + { + "epoch": 0.5632141540729819, + "grad_norm": 1.2020959526567563, + "learning_rate": 1.4308665718496143e-05, + "loss": 0.6616, + "step": 4584 + }, + { + "epoch": 0.563337019289839, + "grad_norm": 1.1419602095310597, + "learning_rate": 1.4302239323148284e-05, + "loss": 0.4885, + "step": 4585 + }, + { + "epoch": 0.5634598845066962, + "grad_norm": 1.150258580649529, + "learning_rate": 1.4295813056148979e-05, + "loss": 0.4698, + "step": 4586 + }, + { + "epoch": 0.5635827497235533, + "grad_norm": 1.442118918339612, + "learning_rate": 1.4289386918680294e-05, + "loss": 0.7413, + "step": 4587 + }, + { + "epoch": 0.5637056149404104, + "grad_norm": 1.1024912245550025, + "learning_rate": 1.428296091192427e-05, + "loss": 0.5782, + "step": 4588 + }, + { + "epoch": 0.5638284801572675, + "grad_norm": 1.360787001640595, + "learning_rate": 1.4276535037062943e-05, + "loss": 0.6903, + "step": 4589 + }, + { + "epoch": 0.5639513453741246, + "grad_norm": 1.2409838415233667, + "learning_rate": 1.4270109295278305e-05, + "loss": 0.5944, + "step": 4590 + }, + { + "epoch": 0.5640742105909817, + "grad_norm": 1.2644270449134676, + "learning_rate": 1.4263683687752329e-05, + 
"loss": 0.6029, + "step": 4591 + }, + { + "epoch": 0.5641970758078388, + "grad_norm": 1.1922045277870976, + "learning_rate": 1.4257258215666957e-05, + "loss": 0.711, + "step": 4592 + }, + { + "epoch": 0.564319941024696, + "grad_norm": 1.2544398943731, + "learning_rate": 1.4250832880204126e-05, + "loss": 0.7277, + "step": 4593 + }, + { + "epoch": 0.564442806241553, + "grad_norm": 1.3220740885583142, + "learning_rate": 1.4244407682545728e-05, + "loss": 0.6256, + "step": 4594 + }, + { + "epoch": 0.5645656714584101, + "grad_norm": 1.5112286502961825, + "learning_rate": 1.4237982623873629e-05, + "loss": 0.7181, + "step": 4595 + }, + { + "epoch": 0.5646885366752672, + "grad_norm": 1.1297177874099888, + "learning_rate": 1.4231557705369689e-05, + "loss": 0.678, + "step": 4596 + }, + { + "epoch": 0.5648114018921243, + "grad_norm": 1.217289148495727, + "learning_rate": 1.4225132928215729e-05, + "loss": 0.6702, + "step": 4597 + }, + { + "epoch": 0.5649342671089814, + "grad_norm": 1.468289391058185, + "learning_rate": 1.4218708293593539e-05, + "loss": 0.581, + "step": 4598 + }, + { + "epoch": 0.5650571323258385, + "grad_norm": 1.109761940241006, + "learning_rate": 1.421228380268489e-05, + "loss": 0.5745, + "step": 4599 + }, + { + "epoch": 0.5651799975426957, + "grad_norm": 0.988035017065615, + "learning_rate": 1.420585945667153e-05, + "loss": 0.5874, + "step": 4600 + }, + { + "epoch": 0.5653028627595528, + "grad_norm": 1.3472977073672892, + "learning_rate": 1.4199435256735172e-05, + "loss": 0.5876, + "step": 4601 + }, + { + "epoch": 0.5654257279764099, + "grad_norm": 1.3264984701934266, + "learning_rate": 1.4193011204057507e-05, + "loss": 0.5415, + "step": 4602 + }, + { + "epoch": 0.565548593193267, + "grad_norm": 1.1268641081728386, + "learning_rate": 1.4186587299820193e-05, + "loss": 0.647, + "step": 4603 + }, + { + "epoch": 0.5656714584101241, + "grad_norm": 1.1927765742830152, + "learning_rate": 1.4180163545204875e-05, + "loss": 0.6063, + "step": 4604 + }, + { + "epoch": 0.5657943236269812, + "grad_norm": 1.2084156004917113, + "learning_rate": 1.4173739941393156e-05, + "loss": 0.6132, + "step": 4605 + }, + { + "epoch": 0.5659171888438383, + "grad_norm": 1.1734994946985944, + "learning_rate": 1.4167316489566617e-05, + "loss": 0.542, + "step": 4606 + }, + { + "epoch": 0.5660400540606955, + "grad_norm": 1.2683835398106218, + "learning_rate": 1.4160893190906804e-05, + "loss": 0.4758, + "step": 4607 + }, + { + "epoch": 0.5661629192775526, + "grad_norm": 1.1110681824804856, + "learning_rate": 1.4154470046595251e-05, + "loss": 0.496, + "step": 4608 + }, + { + "epoch": 0.5662857844944096, + "grad_norm": 1.242314687308722, + "learning_rate": 1.414804705781345e-05, + "loss": 0.6035, + "step": 4609 + }, + { + "epoch": 0.5664086497112667, + "grad_norm": 1.194385152304613, + "learning_rate": 1.4141624225742867e-05, + "loss": 0.6421, + "step": 4610 + }, + { + "epoch": 0.5665315149281238, + "grad_norm": 1.3112145086228058, + "learning_rate": 1.4135201551564937e-05, + "loss": 0.7328, + "step": 4611 + }, + { + "epoch": 0.5666543801449809, + "grad_norm": 1.4662565351862022, + "learning_rate": 1.4128779036461077e-05, + "loss": 0.5955, + "step": 4612 + }, + { + "epoch": 0.566777245361838, + "grad_norm": 1.272298035908665, + "learning_rate": 1.4122356681612664e-05, + "loss": 0.6296, + "step": 4613 + }, + { + "epoch": 0.5669001105786952, + "grad_norm": 1.3363067122762489, + "learning_rate": 1.4115934488201047e-05, + "loss": 0.6874, + "step": 4614 + }, + { + "epoch": 0.5670229757955523, + "grad_norm": 
0.9626580778673175, + "learning_rate": 1.4109512457407543e-05, + "loss": 0.526, + "step": 4615 + }, + { + "epoch": 0.5671458410124094, + "grad_norm": 1.1346194392700522, + "learning_rate": 1.4103090590413452e-05, + "loss": 0.512, + "step": 4616 + }, + { + "epoch": 0.5672687062292665, + "grad_norm": 1.244233754594268, + "learning_rate": 1.409666888840003e-05, + "loss": 0.6733, + "step": 4617 + }, + { + "epoch": 0.5673915714461236, + "grad_norm": 1.3256181257090929, + "learning_rate": 1.4090247352548504e-05, + "loss": 0.5006, + "step": 4618 + }, + { + "epoch": 0.5675144366629807, + "grad_norm": 1.047957550009395, + "learning_rate": 1.4083825984040083e-05, + "loss": 0.5525, + "step": 4619 + }, + { + "epoch": 0.5676373018798379, + "grad_norm": 1.238754108344294, + "learning_rate": 1.407740478405593e-05, + "loss": 0.5731, + "step": 4620 + }, + { + "epoch": 0.567760167096695, + "grad_norm": 1.2432835982571118, + "learning_rate": 1.4070983753777183e-05, + "loss": 0.672, + "step": 4621 + }, + { + "epoch": 0.5678830323135521, + "grad_norm": 1.3686640376188857, + "learning_rate": 1.4064562894384944e-05, + "loss": 0.7681, + "step": 4622 + }, + { + "epoch": 0.5680058975304091, + "grad_norm": 1.107506593510886, + "learning_rate": 1.40581422070603e-05, + "loss": 0.6099, + "step": 4623 + }, + { + "epoch": 0.5681287627472662, + "grad_norm": 1.090701703080941, + "learning_rate": 1.4051721692984289e-05, + "loss": 0.6478, + "step": 4624 + }, + { + "epoch": 0.5682516279641233, + "grad_norm": 0.94938856747882, + "learning_rate": 1.4045301353337922e-05, + "loss": 0.5474, + "step": 4625 + }, + { + "epoch": 0.5683744931809804, + "grad_norm": 1.22475836212397, + "learning_rate": 1.4038881189302175e-05, + "loss": 0.6494, + "step": 4626 + }, + { + "epoch": 0.5684973583978375, + "grad_norm": 1.151417800091835, + "learning_rate": 1.4032461202058009e-05, + "loss": 0.6643, + "step": 4627 + }, + { + "epoch": 0.5686202236146947, + "grad_norm": 1.230753789284278, + "learning_rate": 1.4026041392786325e-05, + "loss": 0.6121, + "step": 4628 + }, + { + "epoch": 0.5687430888315518, + "grad_norm": 1.3666543057223202, + "learning_rate": 1.4019621762668011e-05, + "loss": 0.6018, + "step": 4629 + }, + { + "epoch": 0.5688659540484089, + "grad_norm": 1.1921416588098885, + "learning_rate": 1.4013202312883912e-05, + "loss": 0.6543, + "step": 4630 + }, + { + "epoch": 0.568988819265266, + "grad_norm": 1.0791112656788846, + "learning_rate": 1.4006783044614853e-05, + "loss": 0.6327, + "step": 4631 + }, + { + "epoch": 0.5691116844821231, + "grad_norm": 1.437141862980313, + "learning_rate": 1.400036395904161e-05, + "loss": 0.558, + "step": 4632 + }, + { + "epoch": 0.5692345496989802, + "grad_norm": 1.2936297136878028, + "learning_rate": 1.3993945057344935e-05, + "loss": 0.4617, + "step": 4633 + }, + { + "epoch": 0.5693574149158374, + "grad_norm": 1.4283639367044074, + "learning_rate": 1.3987526340705538e-05, + "loss": 0.6019, + "step": 4634 + }, + { + "epoch": 0.5694802801326945, + "grad_norm": 1.298815340359428, + "learning_rate": 1.3981107810304106e-05, + "loss": 0.5801, + "step": 4635 + }, + { + "epoch": 0.5696031453495516, + "grad_norm": 1.2189758695914747, + "learning_rate": 1.3974689467321289e-05, + "loss": 0.6846, + "step": 4636 + }, + { + "epoch": 0.5697260105664087, + "grad_norm": 1.1753239740774404, + "learning_rate": 1.396827131293769e-05, + "loss": 0.5301, + "step": 4637 + }, + { + "epoch": 0.5698488757832657, + "grad_norm": 1.1573930973084545, + "learning_rate": 1.3961853348333896e-05, + "loss": 0.6064, + "step": 4638 + }, + { + 
"epoch": 0.5699717410001228, + "grad_norm": 1.1370335477569917, + "learning_rate": 1.3955435574690444e-05, + "loss": 0.7141, + "step": 4639 + }, + { + "epoch": 0.5700946062169799, + "grad_norm": 1.0669427106799774, + "learning_rate": 1.3949017993187848e-05, + "loss": 0.6909, + "step": 4640 + }, + { + "epoch": 0.570217471433837, + "grad_norm": 1.7497966111211383, + "learning_rate": 1.3942600605006565e-05, + "loss": 0.7399, + "step": 4641 + }, + { + "epoch": 0.5703403366506942, + "grad_norm": 1.5001268111813744, + "learning_rate": 1.3936183411327054e-05, + "loss": 0.7252, + "step": 4642 + }, + { + "epoch": 0.5704632018675513, + "grad_norm": 1.124612606117686, + "learning_rate": 1.3929766413329702e-05, + "loss": 0.551, + "step": 4643 + }, + { + "epoch": 0.5705860670844084, + "grad_norm": 1.1391824173781804, + "learning_rate": 1.392334961219488e-05, + "loss": 0.5046, + "step": 4644 + }, + { + "epoch": 0.5707089323012655, + "grad_norm": 1.3003500781860189, + "learning_rate": 1.391693300910291e-05, + "loss": 0.5499, + "step": 4645 + }, + { + "epoch": 0.5708317975181226, + "grad_norm": 1.2654142531604196, + "learning_rate": 1.3910516605234091e-05, + "loss": 0.5656, + "step": 4646 + }, + { + "epoch": 0.5709546627349797, + "grad_norm": 1.3240350119884037, + "learning_rate": 1.390410040176868e-05, + "loss": 0.6028, + "step": 4647 + }, + { + "epoch": 0.5710775279518369, + "grad_norm": 1.3059867700283034, + "learning_rate": 1.3897684399886892e-05, + "loss": 0.6807, + "step": 4648 + }, + { + "epoch": 0.571200393168694, + "grad_norm": 1.1359036264797264, + "learning_rate": 1.3891268600768902e-05, + "loss": 0.6316, + "step": 4649 + }, + { + "epoch": 0.5713232583855511, + "grad_norm": 1.0957942467403845, + "learning_rate": 1.3884853005594869e-05, + "loss": 0.5022, + "step": 4650 + }, + { + "epoch": 0.5714461236024082, + "grad_norm": 1.0150443542639316, + "learning_rate": 1.3878437615544896e-05, + "loss": 0.5098, + "step": 4651 + }, + { + "epoch": 0.5715689888192652, + "grad_norm": 1.0680992480990628, + "learning_rate": 1.3872022431799047e-05, + "loss": 0.6321, + "step": 4652 + }, + { + "epoch": 0.5716918540361223, + "grad_norm": 1.1350814182422964, + "learning_rate": 1.3865607455537352e-05, + "loss": 0.5925, + "step": 4653 + }, + { + "epoch": 0.5718147192529794, + "grad_norm": 1.2438834469977347, + "learning_rate": 1.3859192687939813e-05, + "loss": 0.6333, + "step": 4654 + }, + { + "epoch": 0.5719375844698366, + "grad_norm": 1.1901743961483797, + "learning_rate": 1.3852778130186384e-05, + "loss": 0.5696, + "step": 4655 + }, + { + "epoch": 0.5720604496866937, + "grad_norm": 1.0820138659121796, + "learning_rate": 1.3846363783456976e-05, + "loss": 0.6428, + "step": 4656 + }, + { + "epoch": 0.5721833149035508, + "grad_norm": 1.441493083872987, + "learning_rate": 1.3839949648931465e-05, + "loss": 0.6376, + "step": 4657 + }, + { + "epoch": 0.5723061801204079, + "grad_norm": 1.3795898472162686, + "learning_rate": 1.3833535727789695e-05, + "loss": 0.5276, + "step": 4658 + }, + { + "epoch": 0.572429045337265, + "grad_norm": 1.328664216342804, + "learning_rate": 1.3827122021211465e-05, + "loss": 0.6075, + "step": 4659 + }, + { + "epoch": 0.5725519105541221, + "grad_norm": 1.2380007633028083, + "learning_rate": 1.3820708530376527e-05, + "loss": 0.5762, + "step": 4660 + }, + { + "epoch": 0.5726747757709792, + "grad_norm": 1.908195598480364, + "learning_rate": 1.3814295256464613e-05, + "loss": 0.6063, + "step": 4661 + }, + { + "epoch": 0.5727976409878364, + "grad_norm": 1.5400004698279506, + "learning_rate": 
1.3807882200655396e-05, + "loss": 0.7839, + "step": 4662 + }, + { + "epoch": 0.5729205062046935, + "grad_norm": 1.3893973015583125, + "learning_rate": 1.3801469364128515e-05, + "loss": 0.6279, + "step": 4663 + }, + { + "epoch": 0.5730433714215506, + "grad_norm": 1.2160340950503707, + "learning_rate": 1.3795056748063574e-05, + "loss": 0.527, + "step": 4664 + }, + { + "epoch": 0.5731662366384077, + "grad_norm": 1.0857077554342722, + "learning_rate": 1.3788644353640129e-05, + "loss": 0.5862, + "step": 4665 + }, + { + "epoch": 0.5732891018552648, + "grad_norm": 1.4127508041929673, + "learning_rate": 1.3782232182037701e-05, + "loss": 0.6052, + "step": 4666 + }, + { + "epoch": 0.5734119670721218, + "grad_norm": 1.0915987723783944, + "learning_rate": 1.3775820234435764e-05, + "loss": 0.7842, + "step": 4667 + }, + { + "epoch": 0.573534832288979, + "grad_norm": 1.1955289992924432, + "learning_rate": 1.3769408512013748e-05, + "loss": 0.5705, + "step": 4668 + }, + { + "epoch": 0.5736576975058361, + "grad_norm": 1.314951863602414, + "learning_rate": 1.3762997015951066e-05, + "loss": 0.4392, + "step": 4669 + }, + { + "epoch": 0.5737805627226932, + "grad_norm": 1.9016346044266332, + "learning_rate": 1.375658574742706e-05, + "loss": 0.727, + "step": 4670 + }, + { + "epoch": 0.5739034279395503, + "grad_norm": 1.3271506563632611, + "learning_rate": 1.375017470762104e-05, + "loss": 0.5098, + "step": 4671 + }, + { + "epoch": 0.5740262931564074, + "grad_norm": 1.3167894232527018, + "learning_rate": 1.3743763897712271e-05, + "loss": 0.5633, + "step": 4672 + }, + { + "epoch": 0.5741491583732645, + "grad_norm": 1.1828546641879487, + "learning_rate": 1.3737353318879993e-05, + "loss": 0.6483, + "step": 4673 + }, + { + "epoch": 0.5742720235901216, + "grad_norm": 1.7527861230980746, + "learning_rate": 1.3730942972303383e-05, + "loss": 0.6888, + "step": 4674 + }, + { + "epoch": 0.5743948888069788, + "grad_norm": 1.3044584635467598, + "learning_rate": 1.3724532859161583e-05, + "loss": 0.5995, + "step": 4675 + }, + { + "epoch": 0.5745177540238359, + "grad_norm": 1.1786523892552525, + "learning_rate": 1.371812298063369e-05, + "loss": 0.6391, + "step": 4676 + }, + { + "epoch": 0.574640619240693, + "grad_norm": 1.4449846420392145, + "learning_rate": 1.3711713337898763e-05, + "loss": 0.6763, + "step": 4677 + }, + { + "epoch": 0.5747634844575501, + "grad_norm": 1.248455781107704, + "learning_rate": 1.3705303932135813e-05, + "loss": 0.5858, + "step": 4678 + }, + { + "epoch": 0.5748863496744072, + "grad_norm": 1.2839497717154595, + "learning_rate": 1.3698894764523809e-05, + "loss": 0.6154, + "step": 4679 + }, + { + "epoch": 0.5750092148912643, + "grad_norm": 1.0770344012497395, + "learning_rate": 1.3692485836241668e-05, + "loss": 0.5944, + "step": 4680 + }, + { + "epoch": 0.5751320801081214, + "grad_norm": 1.3299273624612464, + "learning_rate": 1.3686077148468285e-05, + "loss": 0.6246, + "step": 4681 + }, + { + "epoch": 0.5752549453249785, + "grad_norm": 1.1876719249770373, + "learning_rate": 1.367966870238249e-05, + "loss": 0.6207, + "step": 4682 + }, + { + "epoch": 0.5753778105418356, + "grad_norm": 1.3869607995934792, + "learning_rate": 1.367326049916307e-05, + "loss": 0.6182, + "step": 4683 + }, + { + "epoch": 0.5755006757586927, + "grad_norm": 1.2175270664807105, + "learning_rate": 1.366685253998878e-05, + "loss": 0.6075, + "step": 4684 + }, + { + "epoch": 0.5756235409755498, + "grad_norm": 1.4783397033004377, + "learning_rate": 1.3660444826038322e-05, + "loss": 0.689, + "step": 4685 + }, + { + "epoch": 
0.5757464061924069, + "grad_norm": 1.2507638376778987, + "learning_rate": 1.3654037358490348e-05, + "loss": 0.5906, + "step": 4686 + }, + { + "epoch": 0.575869271409264, + "grad_norm": 1.2363131951592086, + "learning_rate": 1.3647630138523467e-05, + "loss": 0.6053, + "step": 4687 + }, + { + "epoch": 0.5759921366261211, + "grad_norm": 1.0709760636105599, + "learning_rate": 1.364122316731626e-05, + "loss": 0.5617, + "step": 4688 + }, + { + "epoch": 0.5761150018429783, + "grad_norm": 1.1677463959230503, + "learning_rate": 1.3634816446047237e-05, + "loss": 0.5871, + "step": 4689 + }, + { + "epoch": 0.5762378670598354, + "grad_norm": 1.193976709788362, + "learning_rate": 1.3628409975894878e-05, + "loss": 0.6475, + "step": 4690 + }, + { + "epoch": 0.5763607322766925, + "grad_norm": 1.1846136820877098, + "learning_rate": 1.36220037580376e-05, + "loss": 0.5511, + "step": 4691 + }, + { + "epoch": 0.5764835974935496, + "grad_norm": 1.1879943990486173, + "learning_rate": 1.36155977936538e-05, + "loss": 0.5689, + "step": 4692 + }, + { + "epoch": 0.5766064627104067, + "grad_norm": 1.3216866690054578, + "learning_rate": 1.360919208392181e-05, + "loss": 0.5397, + "step": 4693 + }, + { + "epoch": 0.5767293279272638, + "grad_norm": 1.1435032402110077, + "learning_rate": 1.3602786630019914e-05, + "loss": 0.513, + "step": 4694 + }, + { + "epoch": 0.576852193144121, + "grad_norm": 1.3581717084023102, + "learning_rate": 1.3596381433126356e-05, + "loss": 0.5327, + "step": 4695 + }, + { + "epoch": 0.576975058360978, + "grad_norm": 1.622056807820146, + "learning_rate": 1.3589976494419333e-05, + "loss": 0.5851, + "step": 4696 + }, + { + "epoch": 0.5770979235778351, + "grad_norm": 1.5129053080232266, + "learning_rate": 1.3583571815076988e-05, + "loss": 0.7189, + "step": 4697 + }, + { + "epoch": 0.5772207887946922, + "grad_norm": 1.3015678216787943, + "learning_rate": 1.3577167396277421e-05, + "loss": 0.6313, + "step": 4698 + }, + { + "epoch": 0.5773436540115493, + "grad_norm": 1.2027382425846882, + "learning_rate": 1.357076323919868e-05, + "loss": 0.5116, + "step": 4699 + }, + { + "epoch": 0.5774665192284064, + "grad_norm": 0.9738339267332328, + "learning_rate": 1.3564359345018777e-05, + "loss": 0.5918, + "step": 4700 + }, + { + "epoch": 0.5775893844452635, + "grad_norm": 1.2305719528411898, + "learning_rate": 1.3557955714915665e-05, + "loss": 0.6179, + "step": 4701 + }, + { + "epoch": 0.5777122496621206, + "grad_norm": 1.2247303245431662, + "learning_rate": 1.3551552350067241e-05, + "loss": 0.7235, + "step": 4702 + }, + { + "epoch": 0.5778351148789778, + "grad_norm": 1.1323079516762584, + "learning_rate": 1.3545149251651372e-05, + "loss": 0.6214, + "step": 4703 + }, + { + "epoch": 0.5779579800958349, + "grad_norm": 1.1675600540781508, + "learning_rate": 1.3538746420845866e-05, + "loss": 0.5422, + "step": 4704 + }, + { + "epoch": 0.578080845312692, + "grad_norm": 1.2784403532281126, + "learning_rate": 1.3532343858828476e-05, + "loss": 0.6994, + "step": 4705 + }, + { + "epoch": 0.5782037105295491, + "grad_norm": 1.1334379542920079, + "learning_rate": 1.3525941566776909e-05, + "loss": 0.585, + "step": 4706 + }, + { + "epoch": 0.5783265757464062, + "grad_norm": 1.2412249430991897, + "learning_rate": 1.351953954586884e-05, + "loss": 0.5757, + "step": 4707 + }, + { + "epoch": 0.5784494409632633, + "grad_norm": 1.4189239160935743, + "learning_rate": 1.3513137797281868e-05, + "loss": 0.576, + "step": 4708 + }, + { + "epoch": 0.5785723061801205, + "grad_norm": 1.2708661109839232, + "learning_rate": 1.3506736322193556e-05, 
+ "loss": 0.5546, + "step": 4709 + }, + { + "epoch": 0.5786951713969776, + "grad_norm": 1.3132950563602657, + "learning_rate": 1.350033512178141e-05, + "loss": 0.6612, + "step": 4710 + }, + { + "epoch": 0.5788180366138346, + "grad_norm": 1.0581355638592556, + "learning_rate": 1.3493934197222893e-05, + "loss": 0.7165, + "step": 4711 + }, + { + "epoch": 0.5789409018306917, + "grad_norm": 1.2037813410799456, + "learning_rate": 1.3487533549695417e-05, + "loss": 0.5792, + "step": 4712 + }, + { + "epoch": 0.5790637670475488, + "grad_norm": 1.2666107770925612, + "learning_rate": 1.3481133180376336e-05, + "loss": 0.5931, + "step": 4713 + }, + { + "epoch": 0.5791866322644059, + "grad_norm": 1.2068204300513385, + "learning_rate": 1.3474733090442953e-05, + "loss": 0.5803, + "step": 4714 + }, + { + "epoch": 0.579309497481263, + "grad_norm": 1.828160299107149, + "learning_rate": 1.3468333281072528e-05, + "loss": 0.6167, + "step": 4715 + }, + { + "epoch": 0.5794323626981202, + "grad_norm": 1.190841224745319, + "learning_rate": 1.3461933753442265e-05, + "loss": 0.6536, + "step": 4716 + }, + { + "epoch": 0.5795552279149773, + "grad_norm": 1.3173896247991663, + "learning_rate": 1.3455534508729313e-05, + "loss": 0.432, + "step": 4717 + }, + { + "epoch": 0.5796780931318344, + "grad_norm": 1.1422982287952756, + "learning_rate": 1.3449135548110763e-05, + "loss": 0.7335, + "step": 4718 + }, + { + "epoch": 0.5798009583486915, + "grad_norm": 1.295421409308299, + "learning_rate": 1.3442736872763681e-05, + "loss": 0.7278, + "step": 4719 + }, + { + "epoch": 0.5799238235655486, + "grad_norm": 1.3965633938794029, + "learning_rate": 1.343633848386505e-05, + "loss": 0.5649, + "step": 4720 + }, + { + "epoch": 0.5800466887824057, + "grad_norm": 1.5111856755797919, + "learning_rate": 1.3429940382591815e-05, + "loss": 0.5347, + "step": 4721 + }, + { + "epoch": 0.5801695539992628, + "grad_norm": 1.3778098757961499, + "learning_rate": 1.3423542570120861e-05, + "loss": 0.5869, + "step": 4722 + }, + { + "epoch": 0.58029241921612, + "grad_norm": 1.1572015745740143, + "learning_rate": 1.3417145047629029e-05, + "loss": 0.5051, + "step": 4723 + }, + { + "epoch": 0.5804152844329771, + "grad_norm": 1.3607987498624037, + "learning_rate": 1.3410747816293102e-05, + "loss": 0.5751, + "step": 4724 + }, + { + "epoch": 0.5805381496498341, + "grad_norm": 1.2721181547353824, + "learning_rate": 1.34043508772898e-05, + "loss": 0.7007, + "step": 4725 + }, + { + "epoch": 0.5806610148666912, + "grad_norm": 1.2914329981624004, + "learning_rate": 1.3397954231795815e-05, + "loss": 0.5261, + "step": 4726 + }, + { + "epoch": 0.5807838800835483, + "grad_norm": 1.1515765768756328, + "learning_rate": 1.3391557880987757e-05, + "loss": 0.5584, + "step": 4727 + }, + { + "epoch": 0.5809067453004054, + "grad_norm": 1.2090253542285954, + "learning_rate": 1.3385161826042199e-05, + "loss": 0.6045, + "step": 4728 + }, + { + "epoch": 0.5810296105172625, + "grad_norm": 1.1544956544069105, + "learning_rate": 1.3378766068135642e-05, + "loss": 0.5118, + "step": 4729 + }, + { + "epoch": 0.5811524757341197, + "grad_norm": 1.4221746160698074, + "learning_rate": 1.337237060844456e-05, + "loss": 0.5437, + "step": 4730 + }, + { + "epoch": 0.5812753409509768, + "grad_norm": 1.2649176881041557, + "learning_rate": 1.3365975448145348e-05, + "loss": 0.6236, + "step": 4731 + }, + { + "epoch": 0.5813982061678339, + "grad_norm": 1.1685002279400998, + "learning_rate": 1.3359580588414354e-05, + "loss": 0.5258, + "step": 4732 + }, + { + "epoch": 0.581521071384691, + "grad_norm": 
1.3672555132718396, + "learning_rate": 1.3353186030427868e-05, + "loss": 0.6591, + "step": 4733 + }, + { + "epoch": 0.5816439366015481, + "grad_norm": 1.0218274673289387, + "learning_rate": 1.3346791775362136e-05, + "loss": 0.6355, + "step": 4734 + }, + { + "epoch": 0.5817668018184052, + "grad_norm": 1.0883376096535766, + "learning_rate": 1.3340397824393337e-05, + "loss": 0.607, + "step": 4735 + }, + { + "epoch": 0.5818896670352623, + "grad_norm": 1.14436077182396, + "learning_rate": 1.3334004178697595e-05, + "loss": 0.5286, + "step": 4736 + }, + { + "epoch": 0.5820125322521195, + "grad_norm": 1.1893561985899257, + "learning_rate": 1.3327610839450972e-05, + "loss": 0.6014, + "step": 4737 + }, + { + "epoch": 0.5821353974689766, + "grad_norm": 1.2681493034733324, + "learning_rate": 1.3321217807829498e-05, + "loss": 0.4932, + "step": 4738 + }, + { + "epoch": 0.5822582626858337, + "grad_norm": 1.4090156822712316, + "learning_rate": 1.331482508500912e-05, + "loss": 0.6203, + "step": 4739 + }, + { + "epoch": 0.5823811279026907, + "grad_norm": 1.2476505238007927, + "learning_rate": 1.3308432672165738e-05, + "loss": 0.4972, + "step": 4740 + }, + { + "epoch": 0.5825039931195478, + "grad_norm": 1.1260282259659349, + "learning_rate": 1.3302040570475194e-05, + "loss": 0.537, + "step": 4741 + }, + { + "epoch": 0.5826268583364049, + "grad_norm": 1.2608006024289669, + "learning_rate": 1.3295648781113277e-05, + "loss": 0.6856, + "step": 4742 + }, + { + "epoch": 0.582749723553262, + "grad_norm": 0.9675729905197541, + "learning_rate": 1.3289257305255716e-05, + "loss": 0.64, + "step": 4743 + }, + { + "epoch": 0.5828725887701192, + "grad_norm": 1.3336813494288424, + "learning_rate": 1.3282866144078171e-05, + "loss": 0.5348, + "step": 4744 + }, + { + "epoch": 0.5829954539869763, + "grad_norm": 1.1288799652160622, + "learning_rate": 1.327647529875627e-05, + "loss": 0.6855, + "step": 4745 + }, + { + "epoch": 0.5831183192038334, + "grad_norm": 1.3806414911633011, + "learning_rate": 1.327008477046556e-05, + "loss": 0.6846, + "step": 4746 + }, + { + "epoch": 0.5832411844206905, + "grad_norm": 1.2060952727683958, + "learning_rate": 1.3263694560381538e-05, + "loss": 0.5075, + "step": 4747 + }, + { + "epoch": 0.5833640496375476, + "grad_norm": 1.1844189244045897, + "learning_rate": 1.3257304669679637e-05, + "loss": 0.4641, + "step": 4748 + }, + { + "epoch": 0.5834869148544047, + "grad_norm": 1.1010302238292413, + "learning_rate": 1.3250915099535245e-05, + "loss": 0.6643, + "step": 4749 + }, + { + "epoch": 0.5836097800712619, + "grad_norm": 1.2478689783081902, + "learning_rate": 1.3244525851123676e-05, + "loss": 0.5027, + "step": 4750 + }, + { + "epoch": 0.583732645288119, + "grad_norm": 1.1969305377065733, + "learning_rate": 1.3238136925620191e-05, + "loss": 0.6165, + "step": 4751 + }, + { + "epoch": 0.5838555105049761, + "grad_norm": 1.1566111426988128, + "learning_rate": 1.3231748324199989e-05, + "loss": 0.6752, + "step": 4752 + }, + { + "epoch": 0.5839783757218332, + "grad_norm": 1.1413120738940352, + "learning_rate": 1.322536004803822e-05, + "loss": 0.5268, + "step": 4753 + }, + { + "epoch": 0.5841012409386902, + "grad_norm": 1.3231134882784314, + "learning_rate": 1.321897209830996e-05, + "loss": 0.4905, + "step": 4754 + }, + { + "epoch": 0.5842241061555473, + "grad_norm": 1.1480707596095834, + "learning_rate": 1.3212584476190233e-05, + "loss": 0.6679, + "step": 4755 + }, + { + "epoch": 0.5843469713724044, + "grad_norm": 1.4604682245978404, + "learning_rate": 1.3206197182853994e-05, + "loss": 0.6337, + "step": 
4756 + }, + { + "epoch": 0.5844698365892615, + "grad_norm": 1.3861831717109385, + "learning_rate": 1.3199810219476156e-05, + "loss": 0.5797, + "step": 4757 + }, + { + "epoch": 0.5845927018061187, + "grad_norm": 1.108738212207439, + "learning_rate": 1.3193423587231553e-05, + "loss": 0.627, + "step": 4758 + }, + { + "epoch": 0.5847155670229758, + "grad_norm": 1.349710873398861, + "learning_rate": 1.3187037287294967e-05, + "loss": 0.6939, + "step": 4759 + }, + { + "epoch": 0.5848384322398329, + "grad_norm": 1.1245056924925245, + "learning_rate": 1.318065132084111e-05, + "loss": 0.6879, + "step": 4760 + }, + { + "epoch": 0.58496129745669, + "grad_norm": 1.1526134027481056, + "learning_rate": 1.3174265689044646e-05, + "loss": 0.6107, + "step": 4761 + }, + { + "epoch": 0.5850841626735471, + "grad_norm": 1.154424451456958, + "learning_rate": 1.3167880393080171e-05, + "loss": 0.5586, + "step": 4762 + }, + { + "epoch": 0.5852070278904042, + "grad_norm": 1.3870435097609697, + "learning_rate": 1.3161495434122213e-05, + "loss": 0.6892, + "step": 4763 + }, + { + "epoch": 0.5853298931072614, + "grad_norm": 1.0780004877696172, + "learning_rate": 1.315511081334524e-05, + "loss": 0.538, + "step": 4764 + }, + { + "epoch": 0.5854527583241185, + "grad_norm": 1.3585592271916285, + "learning_rate": 1.3148726531923677e-05, + "loss": 0.5703, + "step": 4765 + }, + { + "epoch": 0.5855756235409756, + "grad_norm": 1.3583348019660262, + "learning_rate": 1.3142342591031862e-05, + "loss": 0.6103, + "step": 4766 + }, + { + "epoch": 0.5856984887578327, + "grad_norm": 1.0611416507764426, + "learning_rate": 1.3135958991844076e-05, + "loss": 0.5449, + "step": 4767 + }, + { + "epoch": 0.5858213539746898, + "grad_norm": 1.3195678082329894, + "learning_rate": 1.3129575735534548e-05, + "loss": 0.5756, + "step": 4768 + }, + { + "epoch": 0.5859442191915468, + "grad_norm": 1.2877785542777946, + "learning_rate": 1.3123192823277435e-05, + "loss": 0.4999, + "step": 4769 + }, + { + "epoch": 0.5860670844084039, + "grad_norm": 1.274787274177473, + "learning_rate": 1.3116810256246828e-05, + "loss": 0.5283, + "step": 4770 + }, + { + "epoch": 0.586189949625261, + "grad_norm": 1.184599648226264, + "learning_rate": 1.3110428035616757e-05, + "loss": 0.657, + "step": 4771 + }, + { + "epoch": 0.5863128148421182, + "grad_norm": 1.254192462193568, + "learning_rate": 1.31040461625612e-05, + "loss": 0.5663, + "step": 4772 + }, + { + "epoch": 0.5864356800589753, + "grad_norm": 1.122428967180021, + "learning_rate": 1.3097664638254057e-05, + "loss": 0.6561, + "step": 4773 + }, + { + "epoch": 0.5865585452758324, + "grad_norm": 1.2381252582002227, + "learning_rate": 1.3091283463869167e-05, + "loss": 0.5978, + "step": 4774 + }, + { + "epoch": 0.5866814104926895, + "grad_norm": 1.3275324252011076, + "learning_rate": 1.3084902640580297e-05, + "loss": 0.6438, + "step": 4775 + }, + { + "epoch": 0.5868042757095466, + "grad_norm": 1.160003532751031, + "learning_rate": 1.3078522169561172e-05, + "loss": 0.494, + "step": 4776 + }, + { + "epoch": 0.5869271409264037, + "grad_norm": 1.58758247411693, + "learning_rate": 1.3072142051985436e-05, + "loss": 0.6355, + "step": 4777 + }, + { + "epoch": 0.5870500061432609, + "grad_norm": 1.3216801222022145, + "learning_rate": 1.3065762289026665e-05, + "loss": 0.6262, + "step": 4778 + }, + { + "epoch": 0.587172871360118, + "grad_norm": 1.1209079951999203, + "learning_rate": 1.3059382881858375e-05, + "loss": 0.5189, + "step": 4779 + }, + { + "epoch": 0.5872957365769751, + "grad_norm": 1.1914566510143834, + "learning_rate": 
1.3053003831654019e-05, + "loss": 0.6782, + "step": 4780 + }, + { + "epoch": 0.5874186017938322, + "grad_norm": 1.2274261008151561, + "learning_rate": 1.3046625139586984e-05, + "loss": 0.6529, + "step": 4781 + }, + { + "epoch": 0.5875414670106893, + "grad_norm": 1.1240193322232122, + "learning_rate": 1.3040246806830585e-05, + "loss": 0.5907, + "step": 4782 + }, + { + "epoch": 0.5876643322275463, + "grad_norm": 1.155807076083266, + "learning_rate": 1.3033868834558071e-05, + "loss": 0.5646, + "step": 4783 + }, + { + "epoch": 0.5877871974444034, + "grad_norm": 1.1305659694162093, + "learning_rate": 1.302749122394264e-05, + "loss": 0.6125, + "step": 4784 + }, + { + "epoch": 0.5879100626612606, + "grad_norm": 1.1817154785622284, + "learning_rate": 1.3021113976157408e-05, + "loss": 0.5436, + "step": 4785 + }, + { + "epoch": 0.5880329278781177, + "grad_norm": 1.2395799443236521, + "learning_rate": 1.3014737092375423e-05, + "loss": 0.6125, + "step": 4786 + }, + { + "epoch": 0.5881557930949748, + "grad_norm": 1.198145334364304, + "learning_rate": 1.3008360573769676e-05, + "loss": 0.6238, + "step": 4787 + }, + { + "epoch": 0.5882786583118319, + "grad_norm": 1.2393234124904315, + "learning_rate": 1.3001984421513085e-05, + "loss": 0.5765, + "step": 4788 + }, + { + "epoch": 0.588401523528689, + "grad_norm": 1.1551714742138843, + "learning_rate": 1.2995608636778502e-05, + "loss": 0.6347, + "step": 4789 + }, + { + "epoch": 0.5885243887455461, + "grad_norm": 1.28334018888213, + "learning_rate": 1.2989233220738707e-05, + "loss": 0.6108, + "step": 4790 + }, + { + "epoch": 0.5886472539624032, + "grad_norm": 1.3181069585124405, + "learning_rate": 1.2982858174566425e-05, + "loss": 0.5672, + "step": 4791 + }, + { + "epoch": 0.5887701191792604, + "grad_norm": 1.3953138385491877, + "learning_rate": 1.2976483499434302e-05, + "loss": 0.6027, + "step": 4792 + }, + { + "epoch": 0.5888929843961175, + "grad_norm": 1.3446139059369324, + "learning_rate": 1.2970109196514918e-05, + "loss": 0.5546, + "step": 4793 + }, + { + "epoch": 0.5890158496129746, + "grad_norm": 1.2499292381851896, + "learning_rate": 1.2963735266980773e-05, + "loss": 0.5505, + "step": 4794 + }, + { + "epoch": 0.5891387148298317, + "grad_norm": 1.0950797272594022, + "learning_rate": 1.2957361712004327e-05, + "loss": 0.6828, + "step": 4795 + }, + { + "epoch": 0.5892615800466888, + "grad_norm": 1.4897092712378632, + "learning_rate": 1.295098853275795e-05, + "loss": 0.5507, + "step": 4796 + }, + { + "epoch": 0.5893844452635459, + "grad_norm": 1.0292409213129154, + "learning_rate": 1.2944615730413941e-05, + "loss": 0.6007, + "step": 4797 + }, + { + "epoch": 0.589507310480403, + "grad_norm": 1.3243859727372664, + "learning_rate": 1.2938243306144536e-05, + "loss": 0.5905, + "step": 4798 + }, + { + "epoch": 0.5896301756972601, + "grad_norm": 1.2894661514399857, + "learning_rate": 1.2931871261121907e-05, + "loss": 0.6269, + "step": 4799 + }, + { + "epoch": 0.5897530409141172, + "grad_norm": 1.4242311238497813, + "learning_rate": 1.292549959651815e-05, + "loss": 0.6179, + "step": 4800 + }, + { + "epoch": 0.5898759061309743, + "grad_norm": 1.4773708541886612, + "learning_rate": 1.2919128313505286e-05, + "loss": 0.5944, + "step": 4801 + }, + { + "epoch": 0.5899987713478314, + "grad_norm": 1.3004427810834438, + "learning_rate": 1.2912757413255266e-05, + "loss": 0.6149, + "step": 4802 + }, + { + "epoch": 0.5901216365646885, + "grad_norm": 1.12447595039409, + "learning_rate": 1.2906386896939994e-05, + "loss": 0.5003, + "step": 4803 + }, + { + "epoch": 
0.5902445017815456, + "grad_norm": 1.998177280351999, + "learning_rate": 1.2900016765731271e-05, + "loss": 0.7327, + "step": 4804 + }, + { + "epoch": 0.5903673669984028, + "grad_norm": 1.3520734940280055, + "learning_rate": 1.2893647020800847e-05, + "loss": 0.6392, + "step": 4805 + }, + { + "epoch": 0.5904902322152599, + "grad_norm": 1.5741071131797018, + "learning_rate": 1.288727766332039e-05, + "loss": 0.552, + "step": 4806 + }, + { + "epoch": 0.590613097432117, + "grad_norm": 1.2817782291479891, + "learning_rate": 1.288090869446151e-05, + "loss": 0.4934, + "step": 4807 + }, + { + "epoch": 0.5907359626489741, + "grad_norm": 1.477162876994285, + "learning_rate": 1.287454011539573e-05, + "loss": 0.6877, + "step": 4808 + }, + { + "epoch": 0.5908588278658312, + "grad_norm": 1.1266656807071846, + "learning_rate": 1.2868171927294507e-05, + "loss": 0.5782, + "step": 4809 + }, + { + "epoch": 0.5909816930826883, + "grad_norm": 1.2607425917522699, + "learning_rate": 1.2861804131329237e-05, + "loss": 0.6053, + "step": 4810 + }, + { + "epoch": 0.5911045582995454, + "grad_norm": 1.2000060934158963, + "learning_rate": 1.2855436728671232e-05, + "loss": 0.5758, + "step": 4811 + }, + { + "epoch": 0.5912274235164026, + "grad_norm": 1.1365567254400024, + "learning_rate": 1.2849069720491735e-05, + "loss": 0.5659, + "step": 4812 + }, + { + "epoch": 0.5913502887332596, + "grad_norm": 1.4935692333962751, + "learning_rate": 1.2842703107961903e-05, + "loss": 0.6836, + "step": 4813 + }, + { + "epoch": 0.5914731539501167, + "grad_norm": 1.2397329991539605, + "learning_rate": 1.2836336892252851e-05, + "loss": 0.6293, + "step": 4814 + }, + { + "epoch": 0.5915960191669738, + "grad_norm": 1.3535896781055545, + "learning_rate": 1.2829971074535597e-05, + "loss": 0.5964, + "step": 4815 + }, + { + "epoch": 0.5917188843838309, + "grad_norm": 1.2027105713073871, + "learning_rate": 1.282360565598109e-05, + "loss": 0.6555, + "step": 4816 + }, + { + "epoch": 0.591841749600688, + "grad_norm": 1.4148955591025714, + "learning_rate": 1.2817240637760206e-05, + "loss": 0.6487, + "step": 4817 + }, + { + "epoch": 0.5919646148175451, + "grad_norm": 1.6005349470844865, + "learning_rate": 1.2810876021043753e-05, + "loss": 0.553, + "step": 4818 + }, + { + "epoch": 0.5920874800344023, + "grad_norm": 1.1773767887711961, + "learning_rate": 1.280451180700246e-05, + "loss": 0.5487, + "step": 4819 + }, + { + "epoch": 0.5922103452512594, + "grad_norm": 1.319234456739277, + "learning_rate": 1.2798147996806982e-05, + "loss": 0.5933, + "step": 4820 + }, + { + "epoch": 0.5923332104681165, + "grad_norm": 0.9936583556503611, + "learning_rate": 1.2791784591627893e-05, + "loss": 0.5684, + "step": 4821 + }, + { + "epoch": 0.5924560756849736, + "grad_norm": 1.2263127626732733, + "learning_rate": 1.2785421592635716e-05, + "loss": 0.5053, + "step": 4822 + }, + { + "epoch": 0.5925789409018307, + "grad_norm": 1.1616092175704456, + "learning_rate": 1.2779059001000873e-05, + "loss": 0.5031, + "step": 4823 + }, + { + "epoch": 0.5927018061186878, + "grad_norm": 1.3339950719096958, + "learning_rate": 1.2772696817893726e-05, + "loss": 0.5661, + "step": 4824 + }, + { + "epoch": 0.592824671335545, + "grad_norm": 1.2159259287034658, + "learning_rate": 1.2766335044484548e-05, + "loss": 0.5847, + "step": 4825 + }, + { + "epoch": 0.5929475365524021, + "grad_norm": 1.201969288328668, + "learning_rate": 1.2759973681943559e-05, + "loss": 0.6373, + "step": 4826 + }, + { + "epoch": 0.5930704017692591, + "grad_norm": 1.2285562007140498, + "learning_rate": 
1.2753612731440882e-05, + "loss": 0.6017, + "step": 4827 + }, + { + "epoch": 0.5931932669861162, + "grad_norm": 1.1885248346464463, + "learning_rate": 1.2747252194146575e-05, + "loss": 0.7167, + "step": 4828 + }, + { + "epoch": 0.5933161322029733, + "grad_norm": 1.1431667976182276, + "learning_rate": 1.274089207123061e-05, + "loss": 0.5683, + "step": 4829 + }, + { + "epoch": 0.5934389974198304, + "grad_norm": 1.1869006772036486, + "learning_rate": 1.2734532363862907e-05, + "loss": 0.5408, + "step": 4830 + }, + { + "epoch": 0.5935618626366875, + "grad_norm": 1.4446974172740192, + "learning_rate": 1.2728173073213282e-05, + "loss": 0.6376, + "step": 4831 + }, + { + "epoch": 0.5936847278535446, + "grad_norm": 1.4949186114741688, + "learning_rate": 1.2721814200451483e-05, + "loss": 0.6619, + "step": 4832 + }, + { + "epoch": 0.5938075930704018, + "grad_norm": 1.2037328916191774, + "learning_rate": 1.2715455746747188e-05, + "loss": 0.5519, + "step": 4833 + }, + { + "epoch": 0.5939304582872589, + "grad_norm": 1.3397550319598888, + "learning_rate": 1.2709097713269996e-05, + "loss": 0.5092, + "step": 4834 + }, + { + "epoch": 0.594053323504116, + "grad_norm": 1.515492603728574, + "learning_rate": 1.2702740101189423e-05, + "loss": 0.5563, + "step": 4835 + }, + { + "epoch": 0.5941761887209731, + "grad_norm": 1.2169627622971348, + "learning_rate": 1.2696382911674905e-05, + "loss": 0.6838, + "step": 4836 + }, + { + "epoch": 0.5942990539378302, + "grad_norm": 1.2923215997503803, + "learning_rate": 1.2690026145895814e-05, + "loss": 0.696, + "step": 4837 + }, + { + "epoch": 0.5944219191546873, + "grad_norm": 1.1764857305244325, + "learning_rate": 1.2683669805021437e-05, + "loss": 0.5812, + "step": 4838 + }, + { + "epoch": 0.5945447843715445, + "grad_norm": 1.5672854299117656, + "learning_rate": 1.2677313890220974e-05, + "loss": 0.6358, + "step": 4839 + }, + { + "epoch": 0.5946676495884016, + "grad_norm": 1.2074023796105426, + "learning_rate": 1.2670958402663552e-05, + "loss": 0.6192, + "step": 4840 + }, + { + "epoch": 0.5947905148052587, + "grad_norm": 1.2702007445595194, + "learning_rate": 1.2664603343518232e-05, + "loss": 0.7071, + "step": 4841 + }, + { + "epoch": 0.5949133800221157, + "grad_norm": 1.0760966716774454, + "learning_rate": 1.2658248713953983e-05, + "loss": 0.5848, + "step": 4842 + }, + { + "epoch": 0.5950362452389728, + "grad_norm": 1.327004424531281, + "learning_rate": 1.2651894515139697e-05, + "loss": 0.6108, + "step": 4843 + }, + { + "epoch": 0.5951591104558299, + "grad_norm": 1.361984147887081, + "learning_rate": 1.2645540748244183e-05, + "loss": 0.4864, + "step": 4844 + }, + { + "epoch": 0.595281975672687, + "grad_norm": 1.0703773839441337, + "learning_rate": 1.2639187414436182e-05, + "loss": 0.5447, + "step": 4845 + }, + { + "epoch": 0.5954048408895442, + "grad_norm": 1.1590123091743507, + "learning_rate": 1.2632834514884347e-05, + "loss": 0.4985, + "step": 4846 + }, + { + "epoch": 0.5955277061064013, + "grad_norm": 1.3528554707462161, + "learning_rate": 1.2626482050757251e-05, + "loss": 0.5613, + "step": 4847 + }, + { + "epoch": 0.5956505713232584, + "grad_norm": 1.3400061041220181, + "learning_rate": 1.2620130023223382e-05, + "loss": 0.6477, + "step": 4848 + }, + { + "epoch": 0.5957734365401155, + "grad_norm": 1.1684579483248707, + "learning_rate": 1.2613778433451168e-05, + "loss": 0.6421, + "step": 4849 + }, + { + "epoch": 0.5958963017569726, + "grad_norm": 1.1785025483176674, + "learning_rate": 1.2607427282608936e-05, + "loss": 0.5401, + "step": 4850 + }, + { + "epoch": 
0.5960191669738297, + "grad_norm": 0.9605819670485719, + "learning_rate": 1.2601076571864934e-05, + "loss": 0.6536, + "step": 4851 + }, + { + "epoch": 0.5961420321906868, + "grad_norm": 1.1449303422108332, + "learning_rate": 1.2594726302387345e-05, + "loss": 0.5416, + "step": 4852 + }, + { + "epoch": 0.596264897407544, + "grad_norm": 1.3328394056062203, + "learning_rate": 1.2588376475344252e-05, + "loss": 0.6088, + "step": 4853 + }, + { + "epoch": 0.5963877626244011, + "grad_norm": 1.1027248935165148, + "learning_rate": 1.2582027091903667e-05, + "loss": 0.5301, + "step": 4854 + }, + { + "epoch": 0.5965106278412582, + "grad_norm": 1.1211285290323374, + "learning_rate": 1.2575678153233512e-05, + "loss": 0.5144, + "step": 4855 + }, + { + "epoch": 0.5966334930581152, + "grad_norm": 1.2334055576672946, + "learning_rate": 1.2569329660501643e-05, + "loss": 0.5691, + "step": 4856 + }, + { + "epoch": 0.5967563582749723, + "grad_norm": 1.2009376896814892, + "learning_rate": 1.256298161487582e-05, + "loss": 0.7159, + "step": 4857 + }, + { + "epoch": 0.5968792234918294, + "grad_norm": 1.280866602894649, + "learning_rate": 1.2556634017523727e-05, + "loss": 0.6747, + "step": 4858 + }, + { + "epoch": 0.5970020887086865, + "grad_norm": 1.5209138279896175, + "learning_rate": 1.255028686961295e-05, + "loss": 0.7119, + "step": 4859 + }, + { + "epoch": 0.5971249539255437, + "grad_norm": 1.1219549446774342, + "learning_rate": 1.2543940172311026e-05, + "loss": 0.5808, + "step": 4860 + }, + { + "epoch": 0.5972478191424008, + "grad_norm": 1.2382493379561585, + "learning_rate": 1.2537593926785378e-05, + "loss": 0.6008, + "step": 4861 + }, + { + "epoch": 0.5973706843592579, + "grad_norm": 1.0847206076951112, + "learning_rate": 1.2531248134203357e-05, + "loss": 0.702, + "step": 4862 + }, + { + "epoch": 0.597493549576115, + "grad_norm": 1.4534792653646045, + "learning_rate": 1.252490279573223e-05, + "loss": 0.6392, + "step": 4863 + }, + { + "epoch": 0.5976164147929721, + "grad_norm": 1.2637082974212228, + "learning_rate": 1.2518557912539185e-05, + "loss": 0.5007, + "step": 4864 + }, + { + "epoch": 0.5977392800098292, + "grad_norm": 1.2054140159825808, + "learning_rate": 1.2512213485791318e-05, + "loss": 0.5696, + "step": 4865 + }, + { + "epoch": 0.5978621452266863, + "grad_norm": 1.3225733191985167, + "learning_rate": 1.2505869516655647e-05, + "loss": 0.6419, + "step": 4866 + }, + { + "epoch": 0.5979850104435435, + "grad_norm": 1.3689263337516, + "learning_rate": 1.2499526006299097e-05, + "loss": 0.6087, + "step": 4867 + }, + { + "epoch": 0.5981078756604006, + "grad_norm": 1.3656226391001414, + "learning_rate": 1.249318295588853e-05, + "loss": 0.6009, + "step": 4868 + }, + { + "epoch": 0.5982307408772577, + "grad_norm": 1.040325726010827, + "learning_rate": 1.2486840366590698e-05, + "loss": 0.5843, + "step": 4869 + }, + { + "epoch": 0.5983536060941148, + "grad_norm": 1.227231239425388, + "learning_rate": 1.2480498239572285e-05, + "loss": 0.6575, + "step": 4870 + }, + { + "epoch": 0.5984764713109718, + "grad_norm": 1.8493681737990717, + "learning_rate": 1.2474156575999875e-05, + "loss": 0.6487, + "step": 4871 + }, + { + "epoch": 0.5985993365278289, + "grad_norm": 1.1043787330259531, + "learning_rate": 1.2467815377039988e-05, + "loss": 0.5104, + "step": 4872 + }, + { + "epoch": 0.598722201744686, + "grad_norm": 1.2532530362264491, + "learning_rate": 1.246147464385904e-05, + "loss": 0.6873, + "step": 4873 + }, + { + "epoch": 0.5988450669615432, + "grad_norm": 1.1953829189554201, + "learning_rate": 
1.2455134377623361e-05, + "loss": 0.6648, + "step": 4874 + }, + { + "epoch": 0.5989679321784003, + "grad_norm": 1.641077732030845, + "learning_rate": 1.2448794579499216e-05, + "loss": 0.7248, + "step": 4875 + }, + { + "epoch": 0.5990907973952574, + "grad_norm": 1.2194693739547757, + "learning_rate": 1.2442455250652763e-05, + "loss": 0.5957, + "step": 4876 + }, + { + "epoch": 0.5992136626121145, + "grad_norm": 1.2183860075642599, + "learning_rate": 1.243611639225008e-05, + "loss": 0.6886, + "step": 4877 + }, + { + "epoch": 0.5993365278289716, + "grad_norm": 1.1254032476300109, + "learning_rate": 1.2429778005457154e-05, + "loss": 0.568, + "step": 4878 + }, + { + "epoch": 0.5994593930458287, + "grad_norm": 1.8159739879896684, + "learning_rate": 1.2423440091439902e-05, + "loss": 0.6408, + "step": 4879 + }, + { + "epoch": 0.5995822582626859, + "grad_norm": 1.129345362037194, + "learning_rate": 1.2417102651364134e-05, + "loss": 0.5925, + "step": 4880 + }, + { + "epoch": 0.599705123479543, + "grad_norm": 1.4314893327996057, + "learning_rate": 1.2410765686395584e-05, + "loss": 0.6655, + "step": 4881 + }, + { + "epoch": 0.5998279886964001, + "grad_norm": 1.411291387756137, + "learning_rate": 1.240442919769989e-05, + "loss": 0.6334, + "step": 4882 + }, + { + "epoch": 0.5999508539132572, + "grad_norm": 1.3974885985544316, + "learning_rate": 1.2398093186442616e-05, + "loss": 0.6161, + "step": 4883 + }, + { + "epoch": 0.6000737191301143, + "grad_norm": 1.1731329272908382, + "learning_rate": 1.2391757653789227e-05, + "loss": 0.5356, + "step": 4884 + }, + { + "epoch": 0.6001965843469713, + "grad_norm": 1.0569051170079709, + "learning_rate": 1.2385422600905102e-05, + "loss": 0.5688, + "step": 4885 + }, + { + "epoch": 0.6003194495638284, + "grad_norm": 1.5519262717822182, + "learning_rate": 1.2379088028955525e-05, + "loss": 0.6725, + "step": 4886 + }, + { + "epoch": 0.6004423147806855, + "grad_norm": 1.1288185665164088, + "learning_rate": 1.2372753939105716e-05, + "loss": 0.5079, + "step": 4887 + }, + { + "epoch": 0.6005651799975427, + "grad_norm": 1.0859275077184412, + "learning_rate": 1.2366420332520783e-05, + "loss": 0.6504, + "step": 4888 + }, + { + "epoch": 0.6006880452143998, + "grad_norm": 1.6743377499276053, + "learning_rate": 1.236008721036575e-05, + "loss": 0.6892, + "step": 4889 + }, + { + "epoch": 0.6008109104312569, + "grad_norm": 1.3204148696154865, + "learning_rate": 1.2353754573805549e-05, + "loss": 0.6118, + "step": 4890 + }, + { + "epoch": 0.600933775648114, + "grad_norm": 1.3944927715182258, + "learning_rate": 1.2347422424005039e-05, + "loss": 0.5413, + "step": 4891 + }, + { + "epoch": 0.6010566408649711, + "grad_norm": 1.098662012132866, + "learning_rate": 1.2341090762128969e-05, + "loss": 0.6613, + "step": 4892 + }, + { + "epoch": 0.6011795060818282, + "grad_norm": 1.1763143472540212, + "learning_rate": 1.2334759589342003e-05, + "loss": 0.5721, + "step": 4893 + }, + { + "epoch": 0.6013023712986854, + "grad_norm": 1.1982806180466452, + "learning_rate": 1.2328428906808734e-05, + "loss": 0.5891, + "step": 4894 + }, + { + "epoch": 0.6014252365155425, + "grad_norm": 1.235532010627716, + "learning_rate": 1.232209871569364e-05, + "loss": 0.523, + "step": 4895 + }, + { + "epoch": 0.6015481017323996, + "grad_norm": 1.0959241562593927, + "learning_rate": 1.2315769017161121e-05, + "loss": 0.5817, + "step": 4896 + }, + { + "epoch": 0.6016709669492567, + "grad_norm": 1.0104412107629688, + "learning_rate": 1.2309439812375479e-05, + "loss": 0.4791, + "step": 4897 + }, + { + "epoch": 
0.6017938321661138, + "grad_norm": 1.1837798538565758, + "learning_rate": 1.2303111102500938e-05, + "loss": 0.6, + "step": 4898 + }, + { + "epoch": 0.6019166973829709, + "grad_norm": 1.0595716655297487, + "learning_rate": 1.2296782888701621e-05, + "loss": 0.5246, + "step": 4899 + }, + { + "epoch": 0.6020395625998279, + "grad_norm": 1.2051442409636888, + "learning_rate": 1.2290455172141563e-05, + "loss": 0.6095, + "step": 4900 + }, + { + "epoch": 0.602162427816685, + "grad_norm": 1.4535344034141597, + "learning_rate": 1.2284127953984698e-05, + "loss": 0.5928, + "step": 4901 + }, + { + "epoch": 0.6022852930335422, + "grad_norm": 1.1604259866337132, + "learning_rate": 1.2277801235394885e-05, + "loss": 0.5774, + "step": 4902 + }, + { + "epoch": 0.6024081582503993, + "grad_norm": 1.2786690022957672, + "learning_rate": 1.2271475017535884e-05, + "loss": 0.5739, + "step": 4903 + }, + { + "epoch": 0.6025310234672564, + "grad_norm": 1.3822056765715338, + "learning_rate": 1.2265149301571357e-05, + "loss": 0.5792, + "step": 4904 + }, + { + "epoch": 0.6026538886841135, + "grad_norm": 1.2998574839381445, + "learning_rate": 1.2258824088664874e-05, + "loss": 0.6369, + "step": 4905 + }, + { + "epoch": 0.6027767539009706, + "grad_norm": 1.2585693332848045, + "learning_rate": 1.2252499379979928e-05, + "loss": 0.7371, + "step": 4906 + }, + { + "epoch": 0.6028996191178277, + "grad_norm": 1.2152615332442036, + "learning_rate": 1.2246175176679902e-05, + "loss": 0.6168, + "step": 4907 + }, + { + "epoch": 0.6030224843346849, + "grad_norm": 1.1606144879843505, + "learning_rate": 1.2239851479928096e-05, + "loss": 0.6176, + "step": 4908 + }, + { + "epoch": 0.603145349551542, + "grad_norm": 1.2352622053677993, + "learning_rate": 1.2233528290887705e-05, + "loss": 0.5689, + "step": 4909 + }, + { + "epoch": 0.6032682147683991, + "grad_norm": 1.324380877520496, + "learning_rate": 1.2227205610721848e-05, + "loss": 0.544, + "step": 4910 + }, + { + "epoch": 0.6033910799852562, + "grad_norm": 1.1119098064619926, + "learning_rate": 1.2220883440593536e-05, + "loss": 0.5852, + "step": 4911 + }, + { + "epoch": 0.6035139452021133, + "grad_norm": 1.3971188433549913, + "learning_rate": 1.221456178166569e-05, + "loss": 0.5977, + "step": 4912 + }, + { + "epoch": 0.6036368104189704, + "grad_norm": 1.2973159114595534, + "learning_rate": 1.2208240635101137e-05, + "loss": 0.6845, + "step": 4913 + }, + { + "epoch": 0.6037596756358274, + "grad_norm": 1.1920944355821705, + "learning_rate": 1.2201920002062617e-05, + "loss": 0.5798, + "step": 4914 + }, + { + "epoch": 0.6038825408526846, + "grad_norm": 1.2962479179328106, + "learning_rate": 1.2195599883712768e-05, + "loss": 0.6799, + "step": 4915 + }, + { + "epoch": 0.6040054060695417, + "grad_norm": 1.333066464614596, + "learning_rate": 1.2189280281214128e-05, + "loss": 0.6923, + "step": 4916 + }, + { + "epoch": 0.6041282712863988, + "grad_norm": 1.254072586074098, + "learning_rate": 1.2182961195729158e-05, + "loss": 0.605, + "step": 4917 + }, + { + "epoch": 0.6042511365032559, + "grad_norm": 1.240150673531269, + "learning_rate": 1.2176642628420206e-05, + "loss": 0.6651, + "step": 4918 + }, + { + "epoch": 0.604374001720113, + "grad_norm": 0.9944720707339351, + "learning_rate": 1.2170324580449534e-05, + "loss": 0.617, + "step": 4919 + }, + { + "epoch": 0.6044968669369701, + "grad_norm": 1.1417273954752818, + "learning_rate": 1.2164007052979299e-05, + "loss": 0.6007, + "step": 4920 + }, + { + "epoch": 0.6046197321538272, + "grad_norm": 1.2712312400413885, + "learning_rate": 
1.2157690047171578e-05, + "loss": 0.5025, + "step": 4921 + }, + { + "epoch": 0.6047425973706844, + "grad_norm": 1.2823737227667862, + "learning_rate": 1.215137356418834e-05, + "loss": 0.6601, + "step": 4922 + }, + { + "epoch": 0.6048654625875415, + "grad_norm": 1.3220042469754072, + "learning_rate": 1.2145057605191462e-05, + "loss": 0.7328, + "step": 4923 + }, + { + "epoch": 0.6049883278043986, + "grad_norm": 1.3277688302899817, + "learning_rate": 1.2138742171342716e-05, + "loss": 0.5146, + "step": 4924 + }, + { + "epoch": 0.6051111930212557, + "grad_norm": 1.088024567623259, + "learning_rate": 1.2132427263803797e-05, + "loss": 0.6229, + "step": 4925 + }, + { + "epoch": 0.6052340582381128, + "grad_norm": 1.1293168998084013, + "learning_rate": 1.2126112883736288e-05, + "loss": 0.6083, + "step": 4926 + }, + { + "epoch": 0.6053569234549699, + "grad_norm": 1.0530981111514812, + "learning_rate": 1.2119799032301675e-05, + "loss": 0.494, + "step": 4927 + }, + { + "epoch": 0.6054797886718271, + "grad_norm": 1.4077884266853518, + "learning_rate": 1.2113485710661348e-05, + "loss": 0.6472, + "step": 4928 + }, + { + "epoch": 0.6056026538886841, + "grad_norm": 1.2391031124306708, + "learning_rate": 1.2107172919976607e-05, + "loss": 0.601, + "step": 4929 + }, + { + "epoch": 0.6057255191055412, + "grad_norm": 1.0467099183558541, + "learning_rate": 1.2100860661408648e-05, + "loss": 0.6282, + "step": 4930 + }, + { + "epoch": 0.6058483843223983, + "grad_norm": 1.1259482883797272, + "learning_rate": 1.2094548936118567e-05, + "loss": 0.5636, + "step": 4931 + }, + { + "epoch": 0.6059712495392554, + "grad_norm": 1.3139570152436675, + "learning_rate": 1.2088237745267363e-05, + "loss": 0.6908, + "step": 4932 + }, + { + "epoch": 0.6060941147561125, + "grad_norm": 1.5085176690956654, + "learning_rate": 1.2081927090015949e-05, + "loss": 0.6066, + "step": 4933 + }, + { + "epoch": 0.6062169799729696, + "grad_norm": 1.1100883956105243, + "learning_rate": 1.2075616971525119e-05, + "loss": 0.6135, + "step": 4934 + }, + { + "epoch": 0.6063398451898268, + "grad_norm": 1.1764617969665039, + "learning_rate": 1.2069307390955584e-05, + "loss": 0.6124, + "step": 4935 + }, + { + "epoch": 0.6064627104066839, + "grad_norm": 1.0613135026947689, + "learning_rate": 1.2062998349467941e-05, + "loss": 0.5337, + "step": 4936 + }, + { + "epoch": 0.606585575623541, + "grad_norm": 1.0461872326419004, + "learning_rate": 1.2056689848222713e-05, + "loss": 0.6129, + "step": 4937 + }, + { + "epoch": 0.6067084408403981, + "grad_norm": 1.1068289457663292, + "learning_rate": 1.2050381888380297e-05, + "loss": 0.7301, + "step": 4938 + }, + { + "epoch": 0.6068313060572552, + "grad_norm": 1.3023916246740155, + "learning_rate": 1.2044074471101e-05, + "loss": 0.698, + "step": 4939 + }, + { + "epoch": 0.6069541712741123, + "grad_norm": 1.0539568898407412, + "learning_rate": 1.2037767597545039e-05, + "loss": 0.4786, + "step": 4940 + }, + { + "epoch": 0.6070770364909694, + "grad_norm": 1.2996579452778554, + "learning_rate": 1.2031461268872518e-05, + "loss": 0.5475, + "step": 4941 + }, + { + "epoch": 0.6071999017078266, + "grad_norm": 1.187115000764816, + "learning_rate": 1.2025155486243444e-05, + "loss": 0.5855, + "step": 4942 + }, + { + "epoch": 0.6073227669246837, + "grad_norm": 1.4616641988656198, + "learning_rate": 1.2018850250817719e-05, + "loss": 0.7058, + "step": 4943 + }, + { + "epoch": 0.6074456321415407, + "grad_norm": 1.498723211383975, + "learning_rate": 1.2012545563755165e-05, + "loss": 0.612, + "step": 4944 + }, + { + "epoch": 
0.6075684973583978, + "grad_norm": 1.2118738359321068, + "learning_rate": 1.2006241426215479e-05, + "loss": 0.5907, + "step": 4945 + }, + { + "epoch": 0.6076913625752549, + "grad_norm": 1.2612516112786734, + "learning_rate": 1.1999937839358268e-05, + "loss": 0.6787, + "step": 4946 + }, + { + "epoch": 0.607814227792112, + "grad_norm": 1.4463986676983365, + "learning_rate": 1.1993634804343032e-05, + "loss": 0.5803, + "step": 4947 + }, + { + "epoch": 0.6079370930089691, + "grad_norm": 1.2639282148441897, + "learning_rate": 1.198733232232918e-05, + "loss": 0.6267, + "step": 4948 + }, + { + "epoch": 0.6080599582258263, + "grad_norm": 1.1465854917810474, + "learning_rate": 1.198103039447601e-05, + "loss": 0.6214, + "step": 4949 + }, + { + "epoch": 0.6081828234426834, + "grad_norm": 1.1159249188071896, + "learning_rate": 1.1974729021942717e-05, + "loss": 0.5205, + "step": 4950 + }, + { + "epoch": 0.6083056886595405, + "grad_norm": 7.067580458796607, + "learning_rate": 1.1968428205888397e-05, + "loss": 0.6946, + "step": 4951 + }, + { + "epoch": 0.6084285538763976, + "grad_norm": 1.1530405335155964, + "learning_rate": 1.1962127947472055e-05, + "loss": 0.5355, + "step": 4952 + }, + { + "epoch": 0.6085514190932547, + "grad_norm": 1.5752879758820033, + "learning_rate": 1.1955828247852576e-05, + "loss": 0.6583, + "step": 4953 + }, + { + "epoch": 0.6086742843101118, + "grad_norm": 1.2202970017070367, + "learning_rate": 1.1949529108188746e-05, + "loss": 0.6047, + "step": 4954 + }, + { + "epoch": 0.608797149526969, + "grad_norm": 1.0759875957240013, + "learning_rate": 1.1943230529639251e-05, + "loss": 0.5621, + "step": 4955 + }, + { + "epoch": 0.6089200147438261, + "grad_norm": 1.2695798898974724, + "learning_rate": 1.193693251336268e-05, + "loss": 0.4978, + "step": 4956 + }, + { + "epoch": 0.6090428799606832, + "grad_norm": 1.308476326049241, + "learning_rate": 1.1930635060517509e-05, + "loss": 0.6101, + "step": 4957 + }, + { + "epoch": 0.6091657451775402, + "grad_norm": 1.116098067463713, + "learning_rate": 1.192433817226211e-05, + "loss": 0.6083, + "step": 4958 + }, + { + "epoch": 0.6092886103943973, + "grad_norm": 1.302464301449465, + "learning_rate": 1.191804184975476e-05, + "loss": 0.621, + "step": 4959 + }, + { + "epoch": 0.6094114756112544, + "grad_norm": 1.217744952787814, + "learning_rate": 1.1911746094153627e-05, + "loss": 0.4722, + "step": 4960 + }, + { + "epoch": 0.6095343408281115, + "grad_norm": 1.0925866624033331, + "learning_rate": 1.190545090661677e-05, + "loss": 0.5671, + "step": 4961 + }, + { + "epoch": 0.6096572060449686, + "grad_norm": 1.208881942195444, + "learning_rate": 1.1899156288302144e-05, + "loss": 0.6317, + "step": 4962 + }, + { + "epoch": 0.6097800712618258, + "grad_norm": 1.3406082118061433, + "learning_rate": 1.1892862240367615e-05, + "loss": 0.63, + "step": 4963 + }, + { + "epoch": 0.6099029364786829, + "grad_norm": 1.4181090865369084, + "learning_rate": 1.1886568763970928e-05, + "loss": 0.6296, + "step": 4964 + }, + { + "epoch": 0.61002580169554, + "grad_norm": 1.2501330541594633, + "learning_rate": 1.1880275860269723e-05, + "loss": 0.5356, + "step": 4965 + }, + { + "epoch": 0.6101486669123971, + "grad_norm": 1.2733179496525797, + "learning_rate": 1.1873983530421539e-05, + "loss": 0.4739, + "step": 4966 + }, + { + "epoch": 0.6102715321292542, + "grad_norm": 1.1109718082881412, + "learning_rate": 1.1867691775583816e-05, + "loss": 0.564, + "step": 4967 + }, + { + "epoch": 0.6103943973461113, + "grad_norm": 1.0818474660725255, + "learning_rate": 1.1861400596913877e-05, + 
"loss": 0.5345, + "step": 4968 + }, + { + "epoch": 0.6105172625629685, + "grad_norm": 1.1635960970027468, + "learning_rate": 1.1855109995568944e-05, + "loss": 0.7055, + "step": 4969 + }, + { + "epoch": 0.6106401277798256, + "grad_norm": 1.4019880251436114, + "learning_rate": 1.1848819972706124e-05, + "loss": 0.6428, + "step": 4970 + }, + { + "epoch": 0.6107629929966827, + "grad_norm": 1.3011240015275214, + "learning_rate": 1.1842530529482441e-05, + "loss": 0.5801, + "step": 4971 + }, + { + "epoch": 0.6108858582135398, + "grad_norm": 1.091945692248107, + "learning_rate": 1.183624166705479e-05, + "loss": 0.5386, + "step": 4972 + }, + { + "epoch": 0.6110087234303968, + "grad_norm": 1.1369277103151245, + "learning_rate": 1.1829953386579967e-05, + "loss": 0.5939, + "step": 4973 + }, + { + "epoch": 0.6111315886472539, + "grad_norm": 1.391912973569696, + "learning_rate": 1.1823665689214657e-05, + "loss": 0.5774, + "step": 4974 + }, + { + "epoch": 0.611254453864111, + "grad_norm": 1.1915983428839627, + "learning_rate": 1.1817378576115447e-05, + "loss": 0.7807, + "step": 4975 + }, + { + "epoch": 0.6113773190809682, + "grad_norm": 1.208104689133999, + "learning_rate": 1.1811092048438808e-05, + "loss": 0.5368, + "step": 4976 + }, + { + "epoch": 0.6115001842978253, + "grad_norm": 1.2771909231787817, + "learning_rate": 1.1804806107341106e-05, + "loss": 0.5917, + "step": 4977 + }, + { + "epoch": 0.6116230495146824, + "grad_norm": 1.2080680957003636, + "learning_rate": 1.1798520753978592e-05, + "loss": 0.5363, + "step": 4978 + }, + { + "epoch": 0.6117459147315395, + "grad_norm": 1.2946744660197171, + "learning_rate": 1.179223598950743e-05, + "loss": 0.6682, + "step": 4979 + }, + { + "epoch": 0.6118687799483966, + "grad_norm": 1.2882395770449075, + "learning_rate": 1.1785951815083655e-05, + "loss": 0.573, + "step": 4980 + }, + { + "epoch": 0.6119916451652537, + "grad_norm": 2.0977582952479708, + "learning_rate": 1.1779668231863193e-05, + "loss": 0.6721, + "step": 4981 + }, + { + "epoch": 0.6121145103821108, + "grad_norm": 1.323381475644259, + "learning_rate": 1.1773385241001882e-05, + "loss": 0.6335, + "step": 4982 + }, + { + "epoch": 0.612237375598968, + "grad_norm": 1.3785434589242611, + "learning_rate": 1.176710284365543e-05, + "loss": 0.6395, + "step": 4983 + }, + { + "epoch": 0.6123602408158251, + "grad_norm": 1.1968432822209616, + "learning_rate": 1.1760821040979446e-05, + "loss": 0.5475, + "step": 4984 + }, + { + "epoch": 0.6124831060326822, + "grad_norm": 1.3094417135849583, + "learning_rate": 1.1754539834129417e-05, + "loss": 0.5381, + "step": 4985 + }, + { + "epoch": 0.6126059712495393, + "grad_norm": 1.1342248311421985, + "learning_rate": 1.1748259224260745e-05, + "loss": 0.6797, + "step": 4986 + }, + { + "epoch": 0.6127288364663963, + "grad_norm": 1.2234218729924584, + "learning_rate": 1.1741979212528698e-05, + "loss": 0.4632, + "step": 4987 + }, + { + "epoch": 0.6128517016832534, + "grad_norm": 1.3999054760694098, + "learning_rate": 1.1735699800088447e-05, + "loss": 0.5956, + "step": 4988 + }, + { + "epoch": 0.6129745669001105, + "grad_norm": 1.2599062407781814, + "learning_rate": 1.1729420988095042e-05, + "loss": 0.5157, + "step": 4989 + }, + { + "epoch": 0.6130974321169677, + "grad_norm": 1.1180472789061613, + "learning_rate": 1.1723142777703442e-05, + "loss": 0.5695, + "step": 4990 + }, + { + "epoch": 0.6132202973338248, + "grad_norm": 1.523310336354127, + "learning_rate": 1.1716865170068475e-05, + "loss": 0.6112, + "step": 4991 + }, + { + "epoch": 0.6133431625506819, + "grad_norm": 
1.449922558529882, + "learning_rate": 1.1710588166344872e-05, + "loss": 0.5985, + "step": 4992 + }, + { + "epoch": 0.613466027767539, + "grad_norm": 1.0868533085664365, + "learning_rate": 1.1704311767687237e-05, + "loss": 0.5871, + "step": 4993 + }, + { + "epoch": 0.6135888929843961, + "grad_norm": 1.3341261278790046, + "learning_rate": 1.1698035975250082e-05, + "loss": 0.6083, + "step": 4994 + }, + { + "epoch": 0.6137117582012532, + "grad_norm": 1.5246895857077716, + "learning_rate": 1.1691760790187798e-05, + "loss": 0.7059, + "step": 4995 + }, + { + "epoch": 0.6138346234181103, + "grad_norm": 1.5927845943164514, + "learning_rate": 1.168548621365466e-05, + "loss": 0.7643, + "step": 4996 + }, + { + "epoch": 0.6139574886349675, + "grad_norm": 1.5888376488461773, + "learning_rate": 1.1679212246804831e-05, + "loss": 0.5632, + "step": 4997 + }, + { + "epoch": 0.6140803538518246, + "grad_norm": 2.269227831416141, + "learning_rate": 1.167293889079238e-05, + "loss": 0.6493, + "step": 4998 + }, + { + "epoch": 0.6142032190686817, + "grad_norm": 1.4515020189963683, + "learning_rate": 1.1666666146771243e-05, + "loss": 0.5581, + "step": 4999 + }, + { + "epoch": 0.6143260842855388, + "grad_norm": 1.3480644673580846, + "learning_rate": 1.1660394015895245e-05, + "loss": 0.4811, + "step": 5000 + }, + { + "epoch": 0.6144489495023959, + "grad_norm": 1.2492273151785767, + "learning_rate": 1.1654122499318117e-05, + "loss": 0.5873, + "step": 5001 + }, + { + "epoch": 0.6145718147192529, + "grad_norm": 1.2571312931712866, + "learning_rate": 1.1647851598193456e-05, + "loss": 0.5815, + "step": 5002 + }, + { + "epoch": 0.61469467993611, + "grad_norm": 1.2528238799603164, + "learning_rate": 1.1641581313674752e-05, + "loss": 0.5519, + "step": 5003 + }, + { + "epoch": 0.6148175451529672, + "grad_norm": 1.1345185237970636, + "learning_rate": 1.1635311646915385e-05, + "loss": 0.5929, + "step": 5004 + }, + { + "epoch": 0.6149404103698243, + "grad_norm": 1.3686581828799098, + "learning_rate": 1.162904259906862e-05, + "loss": 0.6869, + "step": 5005 + }, + { + "epoch": 0.6150632755866814, + "grad_norm": 1.0389074446285969, + "learning_rate": 1.162277417128761e-05, + "loss": 0.5664, + "step": 5006 + }, + { + "epoch": 0.6151861408035385, + "grad_norm": 1.2234550814853662, + "learning_rate": 1.1616506364725388e-05, + "loss": 0.5629, + "step": 5007 + }, + { + "epoch": 0.6153090060203956, + "grad_norm": 1.3719053801574301, + "learning_rate": 1.1610239180534872e-05, + "loss": 0.6129, + "step": 5008 + }, + { + "epoch": 0.6154318712372527, + "grad_norm": 1.2385678942893728, + "learning_rate": 1.1603972619868881e-05, + "loss": 0.7027, + "step": 5009 + }, + { + "epoch": 0.6155547364541099, + "grad_norm": 1.2547888019109483, + "learning_rate": 1.15977066838801e-05, + "loss": 0.5799, + "step": 5010 + }, + { + "epoch": 0.615677601670967, + "grad_norm": 1.087553769955007, + "learning_rate": 1.1591441373721115e-05, + "loss": 0.5633, + "step": 5011 + }, + { + "epoch": 0.6158004668878241, + "grad_norm": 1.5159407662087923, + "learning_rate": 1.1585176690544377e-05, + "loss": 0.5846, + "step": 5012 + }, + { + "epoch": 0.6159233321046812, + "grad_norm": 1.1856190969924847, + "learning_rate": 1.1578912635502245e-05, + "loss": 0.6455, + "step": 5013 + }, + { + "epoch": 0.6160461973215383, + "grad_norm": 1.0698717600667265, + "learning_rate": 1.1572649209746948e-05, + "loss": 0.5434, + "step": 5014 + }, + { + "epoch": 0.6161690625383954, + "grad_norm": 1.1154640886661982, + "learning_rate": 1.1566386414430602e-05, + "loss": 0.7014, + "step": 
5015 + }, + { + "epoch": 0.6162919277552524, + "grad_norm": 1.3789546891523476, + "learning_rate": 1.1560124250705198e-05, + "loss": 0.6708, + "step": 5016 + }, + { + "epoch": 0.6164147929721095, + "grad_norm": 1.2850680529061898, + "learning_rate": 1.1553862719722639e-05, + "loss": 0.5778, + "step": 5017 + }, + { + "epoch": 0.6165376581889667, + "grad_norm": 1.4413050544960548, + "learning_rate": 1.1547601822634684e-05, + "loss": 0.6766, + "step": 5018 + }, + { + "epoch": 0.6166605234058238, + "grad_norm": 1.3825490136685927, + "learning_rate": 1.1541341560592982e-05, + "loss": 0.6937, + "step": 5019 + }, + { + "epoch": 0.6167833886226809, + "grad_norm": 1.50587080641632, + "learning_rate": 1.1535081934749064e-05, + "loss": 0.5928, + "step": 5020 + }, + { + "epoch": 0.616906253839538, + "grad_norm": 1.133969583450374, + "learning_rate": 1.152882294625436e-05, + "loss": 0.6034, + "step": 5021 + }, + { + "epoch": 0.6170291190563951, + "grad_norm": 1.4859781407611206, + "learning_rate": 1.1522564596260165e-05, + "loss": 0.5333, + "step": 5022 + }, + { + "epoch": 0.6171519842732522, + "grad_norm": 1.1046155379394351, + "learning_rate": 1.1516306885917656e-05, + "loss": 0.57, + "step": 5023 + }, + { + "epoch": 0.6172748494901094, + "grad_norm": 1.2355669928018604, + "learning_rate": 1.1510049816377904e-05, + "loss": 0.7192, + "step": 5024 + }, + { + "epoch": 0.6173977147069665, + "grad_norm": 1.5238497483811848, + "learning_rate": 1.1503793388791859e-05, + "loss": 0.6139, + "step": 5025 + }, + { + "epoch": 0.6175205799238236, + "grad_norm": 1.4326612106343033, + "learning_rate": 1.1497537604310343e-05, + "loss": 0.6929, + "step": 5026 + }, + { + "epoch": 0.6176434451406807, + "grad_norm": 1.3140278376234882, + "learning_rate": 1.1491282464084067e-05, + "loss": 0.5592, + "step": 5027 + }, + { + "epoch": 0.6177663103575378, + "grad_norm": 1.1817282396953968, + "learning_rate": 1.1485027969263632e-05, + "loss": 0.6782, + "step": 5028 + }, + { + "epoch": 0.6178891755743949, + "grad_norm": 1.3651512715527456, + "learning_rate": 1.1478774120999507e-05, + "loss": 0.61, + "step": 5029 + }, + { + "epoch": 0.618012040791252, + "grad_norm": 1.25028525263793, + "learning_rate": 1.1472520920442044e-05, + "loss": 0.5488, + "step": 5030 + }, + { + "epoch": 0.618134906008109, + "grad_norm": 1.4070726466605803, + "learning_rate": 1.146626836874148e-05, + "loss": 0.6564, + "step": 5031 + }, + { + "epoch": 0.6182577712249662, + "grad_norm": 1.209523692926911, + "learning_rate": 1.1460016467047937e-05, + "loss": 0.7519, + "step": 5032 + }, + { + "epoch": 0.6183806364418233, + "grad_norm": 1.245515452694344, + "learning_rate": 1.1453765216511408e-05, + "loss": 0.6005, + "step": 5033 + }, + { + "epoch": 0.6185035016586804, + "grad_norm": 1.1178634379523753, + "learning_rate": 1.1447514618281768e-05, + "loss": 0.6472, + "step": 5034 + }, + { + "epoch": 0.6186263668755375, + "grad_norm": 1.2006484961191004, + "learning_rate": 1.1441264673508766e-05, + "loss": 0.6246, + "step": 5035 + }, + { + "epoch": 0.6187492320923946, + "grad_norm": 1.142576022503797, + "learning_rate": 1.1435015383342058e-05, + "loss": 0.5651, + "step": 5036 + }, + { + "epoch": 0.6188720973092517, + "grad_norm": 1.1271094183716466, + "learning_rate": 1.1428766748931148e-05, + "loss": 0.5947, + "step": 5037 + }, + { + "epoch": 0.6189949625261089, + "grad_norm": 1.470665469622402, + "learning_rate": 1.1422518771425435e-05, + "loss": 0.7306, + "step": 5038 + }, + { + "epoch": 0.619117827742966, + "grad_norm": 1.3328610909849994, + "learning_rate": 
1.1416271451974187e-05, + "loss": 0.5467, + "step": 5039 + }, + { + "epoch": 0.6192406929598231, + "grad_norm": 1.246152004421772, + "learning_rate": 1.1410024791726573e-05, + "loss": 0.6115, + "step": 5040 + }, + { + "epoch": 0.6193635581766802, + "grad_norm": 1.6894941564372101, + "learning_rate": 1.1403778791831614e-05, + "loss": 0.5927, + "step": 5041 + }, + { + "epoch": 0.6194864233935373, + "grad_norm": 1.334582089979247, + "learning_rate": 1.1397533453438223e-05, + "loss": 0.694, + "step": 5042 + }, + { + "epoch": 0.6196092886103944, + "grad_norm": 1.5258965671905316, + "learning_rate": 1.139128877769519e-05, + "loss": 0.673, + "step": 5043 + }, + { + "epoch": 0.6197321538272516, + "grad_norm": 1.2028924292455212, + "learning_rate": 1.1385044765751185e-05, + "loss": 0.5842, + "step": 5044 + }, + { + "epoch": 0.6198550190441087, + "grad_norm": 1.3304540883866547, + "learning_rate": 1.1378801418754752e-05, + "loss": 0.6311, + "step": 5045 + }, + { + "epoch": 0.6199778842609657, + "grad_norm": 1.2715894419384308, + "learning_rate": 1.1372558737854307e-05, + "loss": 0.5928, + "step": 5046 + }, + { + "epoch": 0.6201007494778228, + "grad_norm": 1.2958524455035123, + "learning_rate": 1.1366316724198163e-05, + "loss": 0.6403, + "step": 5047 + }, + { + "epoch": 0.6202236146946799, + "grad_norm": 1.2922269927688752, + "learning_rate": 1.1360075378934492e-05, + "loss": 0.7419, + "step": 5048 + }, + { + "epoch": 0.620346479911537, + "grad_norm": 1.1386988397715831, + "learning_rate": 1.1353834703211351e-05, + "loss": 0.5806, + "step": 5049 + }, + { + "epoch": 0.6204693451283941, + "grad_norm": 1.1293451425916112, + "learning_rate": 1.1347594698176666e-05, + "loss": 0.5728, + "step": 5050 + }, + { + "epoch": 0.6205922103452512, + "grad_norm": 1.1946618782826206, + "learning_rate": 1.1341355364978253e-05, + "loss": 0.5658, + "step": 5051 + }, + { + "epoch": 0.6207150755621084, + "grad_norm": 1.282412974769383, + "learning_rate": 1.1335116704763794e-05, + "loss": 0.6273, + "step": 5052 + }, + { + "epoch": 0.6208379407789655, + "grad_norm": 1.4112900058021143, + "learning_rate": 1.132887871868085e-05, + "loss": 0.6231, + "step": 5053 + }, + { + "epoch": 0.6209608059958226, + "grad_norm": 1.1518251743940755, + "learning_rate": 1.132264140787685e-05, + "loss": 0.5457, + "step": 5054 + }, + { + "epoch": 0.6210836712126797, + "grad_norm": 1.33011057937413, + "learning_rate": 1.1316404773499122e-05, + "loss": 0.6055, + "step": 5055 + }, + { + "epoch": 0.6212065364295368, + "grad_norm": 1.279091449990655, + "learning_rate": 1.1310168816694846e-05, + "loss": 0.5786, + "step": 5056 + }, + { + "epoch": 0.6213294016463939, + "grad_norm": 1.2781349986142845, + "learning_rate": 1.1303933538611086e-05, + "loss": 0.5746, + "step": 5057 + }, + { + "epoch": 0.6214522668632511, + "grad_norm": 1.1586469280210892, + "learning_rate": 1.1297698940394777e-05, + "loss": 0.5962, + "step": 5058 + }, + { + "epoch": 0.6215751320801082, + "grad_norm": 1.5236035738728022, + "learning_rate": 1.1291465023192742e-05, + "loss": 0.7066, + "step": 5059 + }, + { + "epoch": 0.6216979972969652, + "grad_norm": 1.2054911818740424, + "learning_rate": 1.1285231788151667e-05, + "loss": 0.6479, + "step": 5060 + }, + { + "epoch": 0.6218208625138223, + "grad_norm": 1.1267719896866883, + "learning_rate": 1.1278999236418113e-05, + "loss": 0.5905, + "step": 5061 + }, + { + "epoch": 0.6219437277306794, + "grad_norm": 1.3304309602808178, + "learning_rate": 1.1272767369138515e-05, + "loss": 0.5312, + "step": 5062 + }, + { + "epoch": 
0.6220665929475365, + "grad_norm": 0.9769668886762187, + "learning_rate": 1.126653618745919e-05, + "loss": 0.5679, + "step": 5063 + }, + { + "epoch": 0.6221894581643936, + "grad_norm": 1.4619451314528829, + "learning_rate": 1.1260305692526321e-05, + "loss": 0.7035, + "step": 5064 + }, + { + "epoch": 0.6223123233812508, + "grad_norm": 1.4549592616860754, + "learning_rate": 1.1254075885485962e-05, + "loss": 0.5146, + "step": 5065 + }, + { + "epoch": 0.6224351885981079, + "grad_norm": 1.1279033402288838, + "learning_rate": 1.1247846767484057e-05, + "loss": 0.5026, + "step": 5066 + }, + { + "epoch": 0.622558053814965, + "grad_norm": 1.082989744302011, + "learning_rate": 1.1241618339666404e-05, + "loss": 0.5646, + "step": 5067 + }, + { + "epoch": 0.6226809190318221, + "grad_norm": 1.445865208697441, + "learning_rate": 1.1235390603178684e-05, + "loss": 0.5488, + "step": 5068 + }, + { + "epoch": 0.6228037842486792, + "grad_norm": 1.2354814525790305, + "learning_rate": 1.1229163559166445e-05, + "loss": 0.604, + "step": 5069 + }, + { + "epoch": 0.6229266494655363, + "grad_norm": 1.2208770012454093, + "learning_rate": 1.1222937208775117e-05, + "loss": 0.5727, + "step": 5070 + }, + { + "epoch": 0.6230495146823934, + "grad_norm": 1.284487864636418, + "learning_rate": 1.1216711553149995e-05, + "loss": 0.615, + "step": 5071 + }, + { + "epoch": 0.6231723798992506, + "grad_norm": 1.3570119748634881, + "learning_rate": 1.1210486593436249e-05, + "loss": 0.4951, + "step": 5072 + }, + { + "epoch": 0.6232952451161077, + "grad_norm": 1.131112442376893, + "learning_rate": 1.1204262330778912e-05, + "loss": 0.5375, + "step": 5073 + }, + { + "epoch": 0.6234181103329648, + "grad_norm": 1.035893861424968, + "learning_rate": 1.1198038766322907e-05, + "loss": 0.697, + "step": 5074 + }, + { + "epoch": 0.6235409755498218, + "grad_norm": 1.3631869011950557, + "learning_rate": 1.1191815901213015e-05, + "loss": 0.5959, + "step": 5075 + }, + { + "epoch": 0.6236638407666789, + "grad_norm": 1.3203737124005201, + "learning_rate": 1.118559373659389e-05, + "loss": 0.6388, + "step": 5076 + }, + { + "epoch": 0.623786705983536, + "grad_norm": 0.9858095810018782, + "learning_rate": 1.117937227361006e-05, + "loss": 0.6272, + "step": 5077 + }, + { + "epoch": 0.6239095712003931, + "grad_norm": 1.2849040254160922, + "learning_rate": 1.1173151513405923e-05, + "loss": 0.5894, + "step": 5078 + }, + { + "epoch": 0.6240324364172503, + "grad_norm": 1.0755847117062771, + "learning_rate": 1.1166931457125744e-05, + "loss": 0.6197, + "step": 5079 + }, + { + "epoch": 0.6241553016341074, + "grad_norm": 1.6789458063721014, + "learning_rate": 1.116071210591367e-05, + "loss": 0.5357, + "step": 5080 + }, + { + "epoch": 0.6242781668509645, + "grad_norm": 1.3573249633484312, + "learning_rate": 1.1154493460913702e-05, + "loss": 0.6216, + "step": 5081 + }, + { + "epoch": 0.6244010320678216, + "grad_norm": 1.0755548762936746, + "learning_rate": 1.1148275523269724e-05, + "loss": 0.6641, + "step": 5082 + }, + { + "epoch": 0.6245238972846787, + "grad_norm": 1.1973381617562384, + "learning_rate": 1.1142058294125486e-05, + "loss": 0.5908, + "step": 5083 + }, + { + "epoch": 0.6246467625015358, + "grad_norm": 1.2378277373992692, + "learning_rate": 1.1135841774624605e-05, + "loss": 0.5802, + "step": 5084 + }, + { + "epoch": 0.624769627718393, + "grad_norm": 1.3113237833646978, + "learning_rate": 1.1129625965910563e-05, + "loss": 0.7407, + "step": 5085 + }, + { + "epoch": 0.6248924929352501, + "grad_norm": 1.1036943790382927, + "learning_rate": 
1.1123410869126731e-05, + "loss": 0.6006, + "step": 5086 + }, + { + "epoch": 0.6250153581521072, + "grad_norm": 1.6447106753558067, + "learning_rate": 1.1117196485416328e-05, + "loss": 0.7647, + "step": 5087 + }, + { + "epoch": 0.6251382233689643, + "grad_norm": 1.3617687314128617, + "learning_rate": 1.1110982815922449e-05, + "loss": 0.5834, + "step": 5088 + }, + { + "epoch": 0.6252610885858213, + "grad_norm": 1.3925387863709977, + "learning_rate": 1.1104769861788062e-05, + "loss": 0.7333, + "step": 5089 + }, + { + "epoch": 0.6253839538026784, + "grad_norm": 1.3070520702563027, + "learning_rate": 1.1098557624155997e-05, + "loss": 0.524, + "step": 5090 + }, + { + "epoch": 0.6255068190195355, + "grad_norm": 1.5837708605993694, + "learning_rate": 1.1092346104168955e-05, + "loss": 0.5287, + "step": 5091 + }, + { + "epoch": 0.6256296842363926, + "grad_norm": 1.2350220279916468, + "learning_rate": 1.10861353029695e-05, + "loss": 0.5941, + "step": 5092 + }, + { + "epoch": 0.6257525494532498, + "grad_norm": 1.3113123688970842, + "learning_rate": 1.107992522170008e-05, + "loss": 0.6135, + "step": 5093 + }, + { + "epoch": 0.6258754146701069, + "grad_norm": 1.1336862901718112, + "learning_rate": 1.1073715861502994e-05, + "loss": 0.599, + "step": 5094 + }, + { + "epoch": 0.625998279886964, + "grad_norm": 1.3595717883557306, + "learning_rate": 1.106750722352041e-05, + "loss": 0.5227, + "step": 5095 + }, + { + "epoch": 0.6261211451038211, + "grad_norm": 1.2847480279910206, + "learning_rate": 1.1061299308894367e-05, + "loss": 0.6271, + "step": 5096 + }, + { + "epoch": 0.6262440103206782, + "grad_norm": 1.0797125853968466, + "learning_rate": 1.1055092118766776e-05, + "loss": 0.6458, + "step": 5097 + }, + { + "epoch": 0.6263668755375353, + "grad_norm": 1.1999437420794858, + "learning_rate": 1.1048885654279407e-05, + "loss": 0.5227, + "step": 5098 + }, + { + "epoch": 0.6264897407543925, + "grad_norm": 1.3037353991004332, + "learning_rate": 1.1042679916573898e-05, + "loss": 0.598, + "step": 5099 + }, + { + "epoch": 0.6266126059712496, + "grad_norm": 1.193294326847676, + "learning_rate": 1.1036474906791746e-05, + "loss": 0.5993, + "step": 5100 + }, + { + "epoch": 0.6267354711881067, + "grad_norm": 1.2906530980619113, + "learning_rate": 1.1030270626074338e-05, + "loss": 0.6785, + "step": 5101 + }, + { + "epoch": 0.6268583364049638, + "grad_norm": 1.304233647206345, + "learning_rate": 1.1024067075562903e-05, + "loss": 0.4714, + "step": 5102 + }, + { + "epoch": 0.6269812016218209, + "grad_norm": 1.384958903910949, + "learning_rate": 1.1017864256398547e-05, + "loss": 0.6489, + "step": 5103 + }, + { + "epoch": 0.6271040668386779, + "grad_norm": 1.193656168722829, + "learning_rate": 1.1011662169722227e-05, + "loss": 0.5416, + "step": 5104 + }, + { + "epoch": 0.627226932055535, + "grad_norm": 1.1116157182251722, + "learning_rate": 1.1005460816674792e-05, + "loss": 0.6689, + "step": 5105 + }, + { + "epoch": 0.6273497972723922, + "grad_norm": 1.220034260458224, + "learning_rate": 1.0999260198396936e-05, + "loss": 0.6063, + "step": 5106 + }, + { + "epoch": 0.6274726624892493, + "grad_norm": 1.0326376831494857, + "learning_rate": 1.0993060316029216e-05, + "loss": 0.6285, + "step": 5107 + }, + { + "epoch": 0.6275955277061064, + "grad_norm": 1.1088135631220724, + "learning_rate": 1.098686117071207e-05, + "loss": 0.5353, + "step": 5108 + }, + { + "epoch": 0.6277183929229635, + "grad_norm": 1.2347219710318778, + "learning_rate": 1.0980662763585783e-05, + "loss": 0.5913, + "step": 5109 + }, + { + "epoch": 0.6278412581398206, 
+ "grad_norm": 0.9803588229042811, + "learning_rate": 1.0974465095790516e-05, + "loss": 0.5184, + "step": 5110 + }, + { + "epoch": 0.6279641233566777, + "grad_norm": 1.091791847076933, + "learning_rate": 1.0968268168466282e-05, + "loss": 0.6377, + "step": 5111 + }, + { + "epoch": 0.6280869885735348, + "grad_norm": 1.1790098624628167, + "learning_rate": 1.0962071982752977e-05, + "loss": 0.4939, + "step": 5112 + }, + { + "epoch": 0.628209853790392, + "grad_norm": 1.2484262578216874, + "learning_rate": 1.0955876539790344e-05, + "loss": 0.6001, + "step": 5113 + }, + { + "epoch": 0.6283327190072491, + "grad_norm": 1.2879440451160309, + "learning_rate": 1.0949681840717997e-05, + "loss": 0.6531, + "step": 5114 + }, + { + "epoch": 0.6284555842241062, + "grad_norm": 1.0617019336729843, + "learning_rate": 1.0943487886675401e-05, + "loss": 0.5694, + "step": 5115 + }, + { + "epoch": 0.6285784494409633, + "grad_norm": 1.2310351793377374, + "learning_rate": 1.0937294678801905e-05, + "loss": 0.5138, + "step": 5116 + }, + { + "epoch": 0.6287013146578204, + "grad_norm": 1.283082204445918, + "learning_rate": 1.0931102218236707e-05, + "loss": 0.6134, + "step": 5117 + }, + { + "epoch": 0.6288241798746774, + "grad_norm": 1.3345202004934498, + "learning_rate": 1.0924910506118868e-05, + "loss": 0.666, + "step": 5118 + }, + { + "epoch": 0.6289470450915345, + "grad_norm": 1.4149307736567502, + "learning_rate": 1.0918719543587307e-05, + "loss": 0.7128, + "step": 5119 + }, + { + "epoch": 0.6290699103083917, + "grad_norm": 1.2826128109119423, + "learning_rate": 1.0912529331780824e-05, + "loss": 0.5284, + "step": 5120 + }, + { + "epoch": 0.6291927755252488, + "grad_norm": 1.1644485186569258, + "learning_rate": 1.090633987183806e-05, + "loss": 0.7042, + "step": 5121 + }, + { + "epoch": 0.6293156407421059, + "grad_norm": 1.269878250683381, + "learning_rate": 1.0900151164897532e-05, + "loss": 0.5013, + "step": 5122 + }, + { + "epoch": 0.629438505958963, + "grad_norm": 1.32318655841898, + "learning_rate": 1.08939632120976e-05, + "loss": 0.5806, + "step": 5123 + }, + { + "epoch": 0.6295613711758201, + "grad_norm": 1.5380884970061923, + "learning_rate": 1.0887776014576514e-05, + "loss": 0.5595, + "step": 5124 + }, + { + "epoch": 0.6296842363926772, + "grad_norm": 1.1122794970195218, + "learning_rate": 1.088158957347236e-05, + "loss": 0.8212, + "step": 5125 + }, + { + "epoch": 0.6298071016095343, + "grad_norm": 1.3204992587459632, + "learning_rate": 1.0875403889923098e-05, + "loss": 0.6051, + "step": 5126 + }, + { + "epoch": 0.6299299668263915, + "grad_norm": 1.1922548276561036, + "learning_rate": 1.0869218965066536e-05, + "loss": 0.5992, + "step": 5127 + }, + { + "epoch": 0.6300528320432486, + "grad_norm": 1.1414544051185522, + "learning_rate": 1.086303480004036e-05, + "loss": 0.5493, + "step": 5128 + }, + { + "epoch": 0.6301756972601057, + "grad_norm": 1.2446575315608766, + "learning_rate": 1.0856851395982103e-05, + "loss": 0.5903, + "step": 5129 + }, + { + "epoch": 0.6302985624769628, + "grad_norm": 1.1570737499103128, + "learning_rate": 1.0850668754029157e-05, + "loss": 0.6296, + "step": 5130 + }, + { + "epoch": 0.6304214276938199, + "grad_norm": 1.3056696262720868, + "learning_rate": 1.084448687531879e-05, + "loss": 0.6066, + "step": 5131 + }, + { + "epoch": 0.630544292910677, + "grad_norm": 1.0950048938568029, + "learning_rate": 1.0838305760988113e-05, + "loss": 0.5878, + "step": 5132 + }, + { + "epoch": 0.630667158127534, + "grad_norm": 1.2208758464933822, + "learning_rate": 1.0832125412174102e-05, + "loss": 0.5653, + 
"step": 5133 + }, + { + "epoch": 0.6307900233443912, + "grad_norm": 1.246131116787707, + "learning_rate": 1.0825945830013588e-05, + "loss": 0.5212, + "step": 5134 + }, + { + "epoch": 0.6309128885612483, + "grad_norm": 1.2116186930692385, + "learning_rate": 1.0819767015643273e-05, + "loss": 0.6097, + "step": 5135 + }, + { + "epoch": 0.6310357537781054, + "grad_norm": 1.2243459477720735, + "learning_rate": 1.0813588970199705e-05, + "loss": 0.5558, + "step": 5136 + }, + { + "epoch": 0.6311586189949625, + "grad_norm": 1.233678605385963, + "learning_rate": 1.0807411694819295e-05, + "loss": 0.5535, + "step": 5137 + }, + { + "epoch": 0.6312814842118196, + "grad_norm": 0.8816621647050311, + "learning_rate": 1.0801235190638309e-05, + "loss": 0.6091, + "step": 5138 + }, + { + "epoch": 0.6314043494286767, + "grad_norm": 1.265650375060481, + "learning_rate": 1.0795059458792886e-05, + "loss": 0.6378, + "step": 5139 + }, + { + "epoch": 0.6315272146455339, + "grad_norm": 1.1104399182049496, + "learning_rate": 1.0788884500419005e-05, + "loss": 0.5951, + "step": 5140 + }, + { + "epoch": 0.631650079862391, + "grad_norm": 1.1699290722178657, + "learning_rate": 1.0782710316652512e-05, + "loss": 0.5654, + "step": 5141 + }, + { + "epoch": 0.6317729450792481, + "grad_norm": 1.151796479022268, + "learning_rate": 1.0776536908629098e-05, + "loss": 0.5638, + "step": 5142 + }, + { + "epoch": 0.6318958102961052, + "grad_norm": 1.247741487433421, + "learning_rate": 1.0770364277484335e-05, + "loss": 0.5831, + "step": 5143 + }, + { + "epoch": 0.6320186755129623, + "grad_norm": 1.3136538374447175, + "learning_rate": 1.0764192424353634e-05, + "loss": 0.6943, + "step": 5144 + }, + { + "epoch": 0.6321415407298194, + "grad_norm": 1.196613885407614, + "learning_rate": 1.0758021350372268e-05, + "loss": 0.5195, + "step": 5145 + }, + { + "epoch": 0.6322644059466765, + "grad_norm": 1.15245030909477, + "learning_rate": 1.0751851056675358e-05, + "loss": 0.5277, + "step": 5146 + }, + { + "epoch": 0.6323872711635335, + "grad_norm": 1.087410814792468, + "learning_rate": 1.0745681544397902e-05, + "loss": 0.521, + "step": 5147 + }, + { + "epoch": 0.6325101363803907, + "grad_norm": 1.3064207976108808, + "learning_rate": 1.0739512814674734e-05, + "loss": 0.532, + "step": 5148 + }, + { + "epoch": 0.6326330015972478, + "grad_norm": 1.1444027245855342, + "learning_rate": 1.0733344868640556e-05, + "loss": 0.5232, + "step": 5149 + }, + { + "epoch": 0.6327558668141049, + "grad_norm": 1.1906464005753519, + "learning_rate": 1.072717770742991e-05, + "loss": 0.5457, + "step": 5150 + }, + { + "epoch": 0.632878732030962, + "grad_norm": 1.0729232265468, + "learning_rate": 1.0721011332177223e-05, + "loss": 0.5692, + "step": 5151 + }, + { + "epoch": 0.6330015972478191, + "grad_norm": 1.2994863375575971, + "learning_rate": 1.0714845744016749e-05, + "loss": 0.7338, + "step": 5152 + }, + { + "epoch": 0.6331244624646762, + "grad_norm": 1.5263266814267635, + "learning_rate": 1.0708680944082608e-05, + "loss": 0.6402, + "step": 5153 + }, + { + "epoch": 0.6332473276815334, + "grad_norm": 0.9740079765551258, + "learning_rate": 1.0702516933508779e-05, + "loss": 0.5375, + "step": 5154 + }, + { + "epoch": 0.6333701928983905, + "grad_norm": 1.1961372503309824, + "learning_rate": 1.0696353713429092e-05, + "loss": 0.5103, + "step": 5155 + }, + { + "epoch": 0.6334930581152476, + "grad_norm": 1.3379002035411385, + "learning_rate": 1.0690191284977229e-05, + "loss": 0.5629, + "step": 5156 + }, + { + "epoch": 0.6336159233321047, + "grad_norm": 1.3517621017739407, + 
"learning_rate": 1.0684029649286721e-05, + "loss": 0.6597, + "step": 5157 + }, + { + "epoch": 0.6337387885489618, + "grad_norm": 1.1273527379363528, + "learning_rate": 1.0677868807490977e-05, + "loss": 0.5597, + "step": 5158 + }, + { + "epoch": 0.6338616537658189, + "grad_norm": 1.5196670728882018, + "learning_rate": 1.0671708760723236e-05, + "loss": 0.5944, + "step": 5159 + }, + { + "epoch": 0.633984518982676, + "grad_norm": 1.127836189996214, + "learning_rate": 1.0665549510116597e-05, + "loss": 0.5096, + "step": 5160 + }, + { + "epoch": 0.6341073841995332, + "grad_norm": 1.0691553998209042, + "learning_rate": 1.065939105680401e-05, + "loss": 0.4808, + "step": 5161 + }, + { + "epoch": 0.6342302494163902, + "grad_norm": 1.3606349646926093, + "learning_rate": 1.0653233401918296e-05, + "loss": 0.6458, + "step": 5162 + }, + { + "epoch": 0.6343531146332473, + "grad_norm": 1.1939948516073384, + "learning_rate": 1.0647076546592105e-05, + "loss": 0.6227, + "step": 5163 + }, + { + "epoch": 0.6344759798501044, + "grad_norm": 0.9833425578943614, + "learning_rate": 1.0640920491957957e-05, + "loss": 0.6376, + "step": 5164 + }, + { + "epoch": 0.6345988450669615, + "grad_norm": 1.3243877636962131, + "learning_rate": 1.063476523914821e-05, + "loss": 0.5428, + "step": 5165 + }, + { + "epoch": 0.6347217102838186, + "grad_norm": 1.2564994048232034, + "learning_rate": 1.062861078929509e-05, + "loss": 0.6441, + "step": 5166 + }, + { + "epoch": 0.6348445755006757, + "grad_norm": 1.2769316559344588, + "learning_rate": 1.0622457143530666e-05, + "loss": 0.5233, + "step": 5167 + }, + { + "epoch": 0.6349674407175329, + "grad_norm": 1.2547264443349149, + "learning_rate": 1.0616304302986863e-05, + "loss": 0.4678, + "step": 5168 + }, + { + "epoch": 0.63509030593439, + "grad_norm": 1.1216175771115215, + "learning_rate": 1.0610152268795446e-05, + "loss": 0.5832, + "step": 5169 + }, + { + "epoch": 0.6352131711512471, + "grad_norm": 1.1210841178965223, + "learning_rate": 1.0604001042088057e-05, + "loss": 0.5441, + "step": 5170 + }, + { + "epoch": 0.6353360363681042, + "grad_norm": 1.2214958139923193, + "learning_rate": 1.0597850623996169e-05, + "loss": 0.5555, + "step": 5171 + }, + { + "epoch": 0.6354589015849613, + "grad_norm": 1.2309159458284342, + "learning_rate": 1.0591701015651104e-05, + "loss": 0.5158, + "step": 5172 + }, + { + "epoch": 0.6355817668018184, + "grad_norm": 1.0937369743974306, + "learning_rate": 1.0585552218184054e-05, + "loss": 0.6795, + "step": 5173 + }, + { + "epoch": 0.6357046320186756, + "grad_norm": 1.2473530091885279, + "learning_rate": 1.0579404232726041e-05, + "loss": 0.5039, + "step": 5174 + }, + { + "epoch": 0.6358274972355327, + "grad_norm": 1.2643218796536217, + "learning_rate": 1.0573257060407955e-05, + "loss": 0.5531, + "step": 5175 + }, + { + "epoch": 0.6359503624523898, + "grad_norm": 1.4750649881392695, + "learning_rate": 1.0567110702360514e-05, + "loss": 0.6206, + "step": 5176 + }, + { + "epoch": 0.6360732276692468, + "grad_norm": 1.1690116636110488, + "learning_rate": 1.056096515971432e-05, + "loss": 0.5966, + "step": 5177 + }, + { + "epoch": 0.6361960928861039, + "grad_norm": 1.2148566265019691, + "learning_rate": 1.0554820433599797e-05, + "loss": 0.5617, + "step": 5178 + }, + { + "epoch": 0.636318958102961, + "grad_norm": 1.0996996993676347, + "learning_rate": 1.0548676525147226e-05, + "loss": 0.5942, + "step": 5179 + }, + { + "epoch": 0.6364418233198181, + "grad_norm": 1.2268734100108452, + "learning_rate": 1.0542533435486734e-05, + "loss": 0.5616, + "step": 5180 + }, + { + 
"epoch": 0.6365646885366752, + "grad_norm": 1.0861159706691832, + "learning_rate": 1.0536391165748315e-05, + "loss": 0.564, + "step": 5181 + }, + { + "epoch": 0.6366875537535324, + "grad_norm": 1.408381212365579, + "learning_rate": 1.0530249717061795e-05, + "loss": 0.5765, + "step": 5182 + }, + { + "epoch": 0.6368104189703895, + "grad_norm": 1.1052627823903678, + "learning_rate": 1.052410909055685e-05, + "loss": 0.6108, + "step": 5183 + }, + { + "epoch": 0.6369332841872466, + "grad_norm": 1.268075328988321, + "learning_rate": 1.051796928736301e-05, + "loss": 0.5307, + "step": 5184 + }, + { + "epoch": 0.6370561494041037, + "grad_norm": 1.079468360257231, + "learning_rate": 1.0511830308609655e-05, + "loss": 0.6122, + "step": 5185 + }, + { + "epoch": 0.6371790146209608, + "grad_norm": 1.1571065111878291, + "learning_rate": 1.0505692155426007e-05, + "loss": 0.5816, + "step": 5186 + }, + { + "epoch": 0.6373018798378179, + "grad_norm": 1.612372368806991, + "learning_rate": 1.049955482894114e-05, + "loss": 0.6135, + "step": 5187 + }, + { + "epoch": 0.6374247450546751, + "grad_norm": 1.250495485336522, + "learning_rate": 1.049341833028397e-05, + "loss": 0.5925, + "step": 5188 + }, + { + "epoch": 0.6375476102715322, + "grad_norm": 1.3769031297388294, + "learning_rate": 1.0487282660583278e-05, + "loss": 0.5653, + "step": 5189 + }, + { + "epoch": 0.6376704754883893, + "grad_norm": 1.3401263362533986, + "learning_rate": 1.0481147820967677e-05, + "loss": 0.5715, + "step": 5190 + }, + { + "epoch": 0.6377933407052463, + "grad_norm": 1.397251571225431, + "learning_rate": 1.0475013812565628e-05, + "loss": 0.5326, + "step": 5191 + }, + { + "epoch": 0.6379162059221034, + "grad_norm": 1.3330711513951181, + "learning_rate": 1.0468880636505437e-05, + "loss": 0.6165, + "step": 5192 + }, + { + "epoch": 0.6380390711389605, + "grad_norm": 1.2506953335435549, + "learning_rate": 1.0462748293915271e-05, + "loss": 0.5901, + "step": 5193 + }, + { + "epoch": 0.6381619363558176, + "grad_norm": 1.3357027817280274, + "learning_rate": 1.0456616785923131e-05, + "loss": 0.5326, + "step": 5194 + }, + { + "epoch": 0.6382848015726748, + "grad_norm": 1.6183556163394999, + "learning_rate": 1.0450486113656862e-05, + "loss": 0.5506, + "step": 5195 + }, + { + "epoch": 0.6384076667895319, + "grad_norm": 1.4126761941328836, + "learning_rate": 1.0444356278244178e-05, + "loss": 0.6166, + "step": 5196 + }, + { + "epoch": 0.638530532006389, + "grad_norm": 1.3464103992754433, + "learning_rate": 1.0438227280812608e-05, + "loss": 0.5298, + "step": 5197 + }, + { + "epoch": 0.6386533972232461, + "grad_norm": 1.1923810736096563, + "learning_rate": 1.0432099122489547e-05, + "loss": 0.5423, + "step": 5198 + }, + { + "epoch": 0.6387762624401032, + "grad_norm": 1.2571880895420982, + "learning_rate": 1.0425971804402227e-05, + "loss": 0.6405, + "step": 5199 + }, + { + "epoch": 0.6388991276569603, + "grad_norm": 1.212589607637169, + "learning_rate": 1.0419845327677731e-05, + "loss": 0.5117, + "step": 5200 + }, + { + "epoch": 0.6390219928738174, + "grad_norm": 1.1634802396234316, + "learning_rate": 1.0413719693442984e-05, + "loss": 0.6753, + "step": 5201 + }, + { + "epoch": 0.6391448580906746, + "grad_norm": 1.0712497992150178, + "learning_rate": 1.0407594902824751e-05, + "loss": 0.5415, + "step": 5202 + }, + { + "epoch": 0.6392677233075317, + "grad_norm": 1.1016208805383636, + "learning_rate": 1.0401470956949656e-05, + "loss": 0.5378, + "step": 5203 + }, + { + "epoch": 0.6393905885243888, + "grad_norm": 1.1797323542365845, + "learning_rate": 
1.0395347856944158e-05, + "loss": 0.5849, + "step": 5204 + }, + { + "epoch": 0.6395134537412459, + "grad_norm": 1.3885606053187627, + "learning_rate": 1.0389225603934561e-05, + "loss": 0.5388, + "step": 5205 + }, + { + "epoch": 0.6396363189581029, + "grad_norm": 1.187179661535103, + "learning_rate": 1.038310419904701e-05, + "loss": 0.5895, + "step": 5206 + }, + { + "epoch": 0.63975918417496, + "grad_norm": 1.0472071690206557, + "learning_rate": 1.0376983643407497e-05, + "loss": 0.5129, + "step": 5207 + }, + { + "epoch": 0.6398820493918171, + "grad_norm": 1.384551770160788, + "learning_rate": 1.0370863938141864e-05, + "loss": 0.592, + "step": 5208 + }, + { + "epoch": 0.6400049146086743, + "grad_norm": 1.6275700329940714, + "learning_rate": 1.036474508437579e-05, + "loss": 0.5891, + "step": 5209 + }, + { + "epoch": 0.6401277798255314, + "grad_norm": 1.2097185470649288, + "learning_rate": 1.0358627083234797e-05, + "loss": 0.5718, + "step": 5210 + }, + { + "epoch": 0.6402506450423885, + "grad_norm": 1.0322791298559149, + "learning_rate": 1.0352509935844248e-05, + "loss": 0.5862, + "step": 5211 + }, + { + "epoch": 0.6403735102592456, + "grad_norm": 1.4739293421972846, + "learning_rate": 1.0346393643329359e-05, + "loss": 0.6882, + "step": 5212 + }, + { + "epoch": 0.6404963754761027, + "grad_norm": 1.2317422676348588, + "learning_rate": 1.0340278206815183e-05, + "loss": 0.5088, + "step": 5213 + }, + { + "epoch": 0.6406192406929598, + "grad_norm": 1.4233805941388213, + "learning_rate": 1.0334163627426603e-05, + "loss": 0.6143, + "step": 5214 + }, + { + "epoch": 0.640742105909817, + "grad_norm": 1.3407696679580865, + "learning_rate": 1.0328049906288371e-05, + "loss": 0.5198, + "step": 5215 + }, + { + "epoch": 0.6408649711266741, + "grad_norm": 1.3631923152831453, + "learning_rate": 1.0321937044525059e-05, + "loss": 0.5079, + "step": 5216 + }, + { + "epoch": 0.6409878363435312, + "grad_norm": 1.379612239264027, + "learning_rate": 1.031582504326109e-05, + "loss": 0.6255, + "step": 5217 + }, + { + "epoch": 0.6411107015603883, + "grad_norm": 1.2293337943126337, + "learning_rate": 1.0309713903620723e-05, + "loss": 0.5749, + "step": 5218 + }, + { + "epoch": 0.6412335667772454, + "grad_norm": 1.2289546945823377, + "learning_rate": 1.0303603626728069e-05, + "loss": 0.6685, + "step": 5219 + }, + { + "epoch": 0.6413564319941024, + "grad_norm": 1.0966618214271466, + "learning_rate": 1.0297494213707073e-05, + "loss": 0.6351, + "step": 5220 + }, + { + "epoch": 0.6414792972109595, + "grad_norm": 1.2942912028946107, + "learning_rate": 1.0291385665681516e-05, + "loss": 0.6112, + "step": 5221 + }, + { + "epoch": 0.6416021624278166, + "grad_norm": 1.490943946173928, + "learning_rate": 1.0285277983775026e-05, + "loss": 0.6193, + "step": 5222 + }, + { + "epoch": 0.6417250276446738, + "grad_norm": 1.28995228247958, + "learning_rate": 1.0279171169111079e-05, + "loss": 0.5637, + "step": 5223 + }, + { + "epoch": 0.6418478928615309, + "grad_norm": 1.2653501857491485, + "learning_rate": 1.0273065222812982e-05, + "loss": 0.5555, + "step": 5224 + }, + { + "epoch": 0.641970758078388, + "grad_norm": 1.0351796822467487, + "learning_rate": 1.0266960146003878e-05, + "loss": 0.5009, + "step": 5225 + }, + { + "epoch": 0.6420936232952451, + "grad_norm": 1.2324758027351077, + "learning_rate": 1.0260855939806759e-05, + "loss": 0.5056, + "step": 5226 + }, + { + "epoch": 0.6422164885121022, + "grad_norm": 1.1874143974424516, + "learning_rate": 1.0254752605344458e-05, + "loss": 0.578, + "step": 5227 + }, + { + "epoch": 0.6423393537289593, 
+ "grad_norm": 1.2349492690367263, + "learning_rate": 1.0248650143739643e-05, + "loss": 0.4156, + "step": 5228 + }, + { + "epoch": 0.6424622189458165, + "grad_norm": 1.2826658911275965, + "learning_rate": 1.024254855611482e-05, + "loss": 0.6324, + "step": 5229 + }, + { + "epoch": 0.6425850841626736, + "grad_norm": 1.516848149384472, + "learning_rate": 1.0236447843592334e-05, + "loss": 0.5394, + "step": 5230 + }, + { + "epoch": 0.6427079493795307, + "grad_norm": 1.1364680567780459, + "learning_rate": 1.0230348007294377e-05, + "loss": 0.6598, + "step": 5231 + }, + { + "epoch": 0.6428308145963878, + "grad_norm": 1.2692820325208305, + "learning_rate": 1.0224249048342974e-05, + "loss": 0.5696, + "step": 5232 + }, + { + "epoch": 0.6429536798132449, + "grad_norm": 1.1961704803001916, + "learning_rate": 1.0218150967859984e-05, + "loss": 0.5898, + "step": 5233 + }, + { + "epoch": 0.643076545030102, + "grad_norm": 1.8253812211649454, + "learning_rate": 1.0212053766967107e-05, + "loss": 0.4961, + "step": 5234 + }, + { + "epoch": 0.643199410246959, + "grad_norm": 1.0953369243163862, + "learning_rate": 1.0205957446785894e-05, + "loss": 0.6521, + "step": 5235 + }, + { + "epoch": 0.6433222754638162, + "grad_norm": 1.1962059312383158, + "learning_rate": 1.0199862008437718e-05, + "loss": 0.6685, + "step": 5236 + }, + { + "epoch": 0.6434451406806733, + "grad_norm": 1.2143008985669947, + "learning_rate": 1.0193767453043795e-05, + "loss": 0.6243, + "step": 5237 + }, + { + "epoch": 0.6435680058975304, + "grad_norm": 1.2740155164787943, + "learning_rate": 1.0187673781725181e-05, + "loss": 0.5238, + "step": 5238 + }, + { + "epoch": 0.6436908711143875, + "grad_norm": 1.3286692062218268, + "learning_rate": 1.0181580995602766e-05, + "loss": 0.5245, + "step": 5239 + }, + { + "epoch": 0.6438137363312446, + "grad_norm": 1.4398486143268203, + "learning_rate": 1.0175489095797278e-05, + "loss": 0.6382, + "step": 5240 + }, + { + "epoch": 0.6439366015481017, + "grad_norm": 1.1346791251717732, + "learning_rate": 1.0169398083429277e-05, + "loss": 0.5205, + "step": 5241 + }, + { + "epoch": 0.6440594667649588, + "grad_norm": 1.2721533548616502, + "learning_rate": 1.0163307959619176e-05, + "loss": 0.7334, + "step": 5242 + }, + { + "epoch": 0.644182331981816, + "grad_norm": 1.3824496774213904, + "learning_rate": 1.015721872548721e-05, + "loss": 0.7426, + "step": 5243 + }, + { + "epoch": 0.6443051971986731, + "grad_norm": 1.3285384661272186, + "learning_rate": 1.0151130382153453e-05, + "loss": 0.6732, + "step": 5244 + }, + { + "epoch": 0.6444280624155302, + "grad_norm": 1.222415779703877, + "learning_rate": 1.014504293073781e-05, + "loss": 0.6038, + "step": 5245 + }, + { + "epoch": 0.6445509276323873, + "grad_norm": 1.1191050829846856, + "learning_rate": 1.0138956372360041e-05, + "loss": 0.6695, + "step": 5246 + }, + { + "epoch": 0.6446737928492444, + "grad_norm": 1.0740902437480164, + "learning_rate": 1.013287070813972e-05, + "loss": 0.5556, + "step": 5247 + }, + { + "epoch": 0.6447966580661015, + "grad_norm": 1.1634176301118708, + "learning_rate": 1.012678593919627e-05, + "loss": 0.5363, + "step": 5248 + }, + { + "epoch": 0.6449195232829585, + "grad_norm": 1.3348413649685462, + "learning_rate": 1.0120702066648938e-05, + "loss": 0.5257, + "step": 5249 + }, + { + "epoch": 0.6450423884998157, + "grad_norm": 1.3330427786997971, + "learning_rate": 1.0114619091616822e-05, + "loss": 0.5985, + "step": 5250 + }, + { + "epoch": 0.6451652537166728, + "grad_norm": 1.4148568346467474, + "learning_rate": 1.010853701521884e-05, + "loss": 
0.6473, + "step": 5251 + }, + { + "epoch": 0.6452881189335299, + "grad_norm": 1.1451011408262466, + "learning_rate": 1.0102455838573753e-05, + "loss": 0.4991, + "step": 5252 + }, + { + "epoch": 0.645410984150387, + "grad_norm": 1.2578572202268592, + "learning_rate": 1.0096375562800146e-05, + "loss": 0.5566, + "step": 5253 + }, + { + "epoch": 0.6455338493672441, + "grad_norm": 1.0986390871750857, + "learning_rate": 1.0090296189016459e-05, + "loss": 0.6512, + "step": 5254 + }, + { + "epoch": 0.6456567145841012, + "grad_norm": 1.3086550567843056, + "learning_rate": 1.0084217718340949e-05, + "loss": 0.6531, + "step": 5255 + }, + { + "epoch": 0.6457795798009583, + "grad_norm": 1.0791741329887907, + "learning_rate": 1.0078140151891705e-05, + "loss": 0.5637, + "step": 5256 + }, + { + "epoch": 0.6459024450178155, + "grad_norm": 1.1793118917184937, + "learning_rate": 1.0072063490786665e-05, + "loss": 0.5398, + "step": 5257 + }, + { + "epoch": 0.6460253102346726, + "grad_norm": 1.0199356701458826, + "learning_rate": 1.0065987736143586e-05, + "loss": 0.5875, + "step": 5258 + }, + { + "epoch": 0.6461481754515297, + "grad_norm": 1.1662705372041255, + "learning_rate": 1.0059912889080064e-05, + "loss": 0.65, + "step": 5259 + }, + { + "epoch": 0.6462710406683868, + "grad_norm": 1.275023151132807, + "learning_rate": 1.0053838950713523e-05, + "loss": 0.5454, + "step": 5260 + }, + { + "epoch": 0.6463939058852439, + "grad_norm": 1.145053127385827, + "learning_rate": 1.0047765922161237e-05, + "loss": 0.5463, + "step": 5261 + }, + { + "epoch": 0.646516771102101, + "grad_norm": 1.214735693923012, + "learning_rate": 1.0041693804540293e-05, + "loss": 0.655, + "step": 5262 + }, + { + "epoch": 0.6466396363189582, + "grad_norm": 1.3727005427566359, + "learning_rate": 1.0035622598967618e-05, + "loss": 0.5683, + "step": 5263 + }, + { + "epoch": 0.6467625015358152, + "grad_norm": 1.0077281681277364, + "learning_rate": 1.0029552306559965e-05, + "loss": 0.5233, + "step": 5264 + }, + { + "epoch": 0.6468853667526723, + "grad_norm": 1.4931728661254378, + "learning_rate": 1.0023482928433934e-05, + "loss": 0.6609, + "step": 5265 + }, + { + "epoch": 0.6470082319695294, + "grad_norm": 1.2779237662750802, + "learning_rate": 1.0017414465705948e-05, + "loss": 0.5777, + "step": 5266 + }, + { + "epoch": 0.6471310971863865, + "grad_norm": 1.2457368283612147, + "learning_rate": 1.0011346919492256e-05, + "loss": 0.688, + "step": 5267 + }, + { + "epoch": 0.6472539624032436, + "grad_norm": 1.3438348595782776, + "learning_rate": 1.0005280290908943e-05, + "loss": 0.5642, + "step": 5268 + }, + { + "epoch": 0.6473768276201007, + "grad_norm": 1.1961207838078172, + "learning_rate": 9.999214581071933e-06, + "loss": 0.676, + "step": 5269 + }, + { + "epoch": 0.6474996928369579, + "grad_norm": 1.236862633311059, + "learning_rate": 9.993149791096968e-06, + "loss": 0.5365, + "step": 5270 + }, + { + "epoch": 0.647622558053815, + "grad_norm": 1.3522962865788926, + "learning_rate": 9.987085922099628e-06, + "loss": 0.611, + "step": 5271 + }, + { + "epoch": 0.6477454232706721, + "grad_norm": 1.237890273969685, + "learning_rate": 9.981022975195319e-06, + "loss": 0.6408, + "step": 5272 + }, + { + "epoch": 0.6478682884875292, + "grad_norm": 1.0381382765833658, + "learning_rate": 9.974960951499288e-06, + "loss": 0.7282, + "step": 5273 + }, + { + "epoch": 0.6479911537043863, + "grad_norm": 1.275337234044017, + "learning_rate": 9.968899852126605e-06, + "loss": 0.5089, + "step": 5274 + }, + { + "epoch": 0.6481140189212434, + "grad_norm": 1.2959928848340654, + 
"learning_rate": 9.962839678192163e-06, + "loss": 0.5296, + "step": 5275 + }, + { + "epoch": 0.6482368841381005, + "grad_norm": 1.391443132552266, + "learning_rate": 9.956780430810692e-06, + "loss": 0.6231, + "step": 5276 + }, + { + "epoch": 0.6483597493549577, + "grad_norm": 1.1933390684958582, + "learning_rate": 9.950722111096758e-06, + "loss": 0.5312, + "step": 5277 + }, + { + "epoch": 0.6484826145718147, + "grad_norm": 1.3848588837467481, + "learning_rate": 9.944664720164745e-06, + "loss": 0.7465, + "step": 5278 + }, + { + "epoch": 0.6486054797886718, + "grad_norm": 1.0528354561948678, + "learning_rate": 9.938608259128866e-06, + "loss": 0.5674, + "step": 5279 + }, + { + "epoch": 0.6487283450055289, + "grad_norm": 1.2233828133737459, + "learning_rate": 9.932552729103183e-06, + "loss": 0.529, + "step": 5280 + }, + { + "epoch": 0.648851210222386, + "grad_norm": 1.335249199502768, + "learning_rate": 9.926498131201556e-06, + "loss": 0.6128, + "step": 5281 + }, + { + "epoch": 0.6489740754392431, + "grad_norm": 1.0779195211212043, + "learning_rate": 9.9204444665377e-06, + "loss": 0.6202, + "step": 5282 + }, + { + "epoch": 0.6490969406561002, + "grad_norm": 1.1425433042135433, + "learning_rate": 9.914391736225134e-06, + "loss": 0.6336, + "step": 5283 + }, + { + "epoch": 0.6492198058729574, + "grad_norm": 1.164427266233832, + "learning_rate": 9.908339941377232e-06, + "loss": 0.535, + "step": 5284 + }, + { + "epoch": 0.6493426710898145, + "grad_norm": 1.3944340775157373, + "learning_rate": 9.902289083107181e-06, + "loss": 0.6988, + "step": 5285 + }, + { + "epoch": 0.6494655363066716, + "grad_norm": 1.167913449130525, + "learning_rate": 9.89623916252799e-06, + "loss": 0.4626, + "step": 5286 + }, + { + "epoch": 0.6495884015235287, + "grad_norm": 0.9804442854530732, + "learning_rate": 9.890190180752503e-06, + "loss": 0.634, + "step": 5287 + }, + { + "epoch": 0.6497112667403858, + "grad_norm": 1.429970011915802, + "learning_rate": 9.884142138893399e-06, + "loss": 0.7161, + "step": 5288 + }, + { + "epoch": 0.6498341319572429, + "grad_norm": 1.3785048994236457, + "learning_rate": 9.87809503806317e-06, + "loss": 0.636, + "step": 5289 + }, + { + "epoch": 0.6499569971741, + "grad_norm": 1.1476338929929766, + "learning_rate": 9.87204887937414e-06, + "loss": 0.4549, + "step": 5290 + }, + { + "epoch": 0.6500798623909572, + "grad_norm": 1.2614802511186742, + "learning_rate": 9.86600366393846e-06, + "loss": 0.4574, + "step": 5291 + }, + { + "epoch": 0.6502027276078143, + "grad_norm": 1.3005422663069057, + "learning_rate": 9.859959392868114e-06, + "loss": 0.5244, + "step": 5292 + }, + { + "epoch": 0.6503255928246713, + "grad_norm": 0.9980722412959651, + "learning_rate": 9.853916067274905e-06, + "loss": 0.5649, + "step": 5293 + }, + { + "epoch": 0.6504484580415284, + "grad_norm": 1.2092539695772027, + "learning_rate": 9.847873688270462e-06, + "loss": 0.6012, + "step": 5294 + }, + { + "epoch": 0.6505713232583855, + "grad_norm": 1.1736001832565368, + "learning_rate": 9.841832256966239e-06, + "loss": 0.5185, + "step": 5295 + }, + { + "epoch": 0.6506941884752426, + "grad_norm": 1.3128149226479897, + "learning_rate": 9.835791774473522e-06, + "loss": 0.6206, + "step": 5296 + }, + { + "epoch": 0.6508170536920997, + "grad_norm": 1.2991607954743523, + "learning_rate": 9.829752241903418e-06, + "loss": 0.5519, + "step": 5297 + }, + { + "epoch": 0.6509399189089569, + "grad_norm": 1.2988971039067583, + "learning_rate": 9.823713660366858e-06, + "loss": 0.6164, + "step": 5298 + }, + { + "epoch": 0.651062784125814, + 
"grad_norm": 1.135468197586648, + "learning_rate": 9.817676030974596e-06, + "loss": 0.4821, + "step": 5299 + }, + { + "epoch": 0.6511856493426711, + "grad_norm": 1.1634737927263625, + "learning_rate": 9.811639354837224e-06, + "loss": 0.6254, + "step": 5300 + }, + { + "epoch": 0.6513085145595282, + "grad_norm": 1.217018455402315, + "learning_rate": 9.805603633065145e-06, + "loss": 0.5543, + "step": 5301 + }, + { + "epoch": 0.6514313797763853, + "grad_norm": 1.2340227585388304, + "learning_rate": 9.799568866768584e-06, + "loss": 0.5068, + "step": 5302 + }, + { + "epoch": 0.6515542449932424, + "grad_norm": 1.5914870027899903, + "learning_rate": 9.793535057057614e-06, + "loss": 0.6563, + "step": 5303 + }, + { + "epoch": 0.6516771102100996, + "grad_norm": 1.2977952522062588, + "learning_rate": 9.787502205042102e-06, + "loss": 0.635, + "step": 5304 + }, + { + "epoch": 0.6517999754269567, + "grad_norm": 1.389054670078481, + "learning_rate": 9.781470311831755e-06, + "loss": 0.596, + "step": 5305 + }, + { + "epoch": 0.6519228406438138, + "grad_norm": 1.343400591106651, + "learning_rate": 9.7754393785361e-06, + "loss": 0.49, + "step": 5306 + }, + { + "epoch": 0.6520457058606709, + "grad_norm": 1.5138638230448436, + "learning_rate": 9.76940940626449e-06, + "loss": 0.6209, + "step": 5307 + }, + { + "epoch": 0.6521685710775279, + "grad_norm": 1.8257578812721513, + "learning_rate": 9.763380396126099e-06, + "loss": 0.7134, + "step": 5308 + }, + { + "epoch": 0.652291436294385, + "grad_norm": 1.357932636314792, + "learning_rate": 9.757352349229922e-06, + "loss": 0.5503, + "step": 5309 + }, + { + "epoch": 0.6524143015112421, + "grad_norm": 1.126004277996551, + "learning_rate": 9.751325266684775e-06, + "loss": 0.6587, + "step": 5310 + }, + { + "epoch": 0.6525371667280992, + "grad_norm": 1.1878368656326732, + "learning_rate": 9.745299149599314e-06, + "loss": 0.5062, + "step": 5311 + }, + { + "epoch": 0.6526600319449564, + "grad_norm": 1.1762375383952635, + "learning_rate": 9.739273999081995e-06, + "loss": 0.6739, + "step": 5312 + }, + { + "epoch": 0.6527828971618135, + "grad_norm": 1.3154255359819806, + "learning_rate": 9.733249816241108e-06, + "loss": 0.6831, + "step": 5313 + }, + { + "epoch": 0.6529057623786706, + "grad_norm": 1.4174464647486962, + "learning_rate": 9.727226602184759e-06, + "loss": 0.591, + "step": 5314 + }, + { + "epoch": 0.6530286275955277, + "grad_norm": 1.1440365847912402, + "learning_rate": 9.721204358020881e-06, + "loss": 0.7112, + "step": 5315 + }, + { + "epoch": 0.6531514928123848, + "grad_norm": 1.1983939914460777, + "learning_rate": 9.71518308485723e-06, + "loss": 0.558, + "step": 5316 + }, + { + "epoch": 0.6532743580292419, + "grad_norm": 1.103190381300676, + "learning_rate": 9.709162783801375e-06, + "loss": 0.596, + "step": 5317 + }, + { + "epoch": 0.6533972232460991, + "grad_norm": 1.1499280231792122, + "learning_rate": 9.70314345596071e-06, + "loss": 0.6621, + "step": 5318 + }, + { + "epoch": 0.6535200884629562, + "grad_norm": 1.5621353021446582, + "learning_rate": 9.697125102442461e-06, + "loss": 0.6369, + "step": 5319 + }, + { + "epoch": 0.6536429536798133, + "grad_norm": 1.142416765849412, + "learning_rate": 9.691107724353656e-06, + "loss": 0.5158, + "step": 5320 + }, + { + "epoch": 0.6537658188966704, + "grad_norm": 1.287384331614206, + "learning_rate": 9.685091322801155e-06, + "loss": 0.7789, + "step": 5321 + }, + { + "epoch": 0.6538886841135274, + "grad_norm": 1.3185768684216863, + "learning_rate": 9.67907589889164e-06, + "loss": 0.6683, + "step": 5322 + }, + { + "epoch": 
0.6540115493303845, + "grad_norm": 1.2779193528575943, + "learning_rate": 9.673061453731605e-06, + "loss": 0.5657, + "step": 5323 + }, + { + "epoch": 0.6541344145472416, + "grad_norm": 1.2853073858552113, + "learning_rate": 9.66704798842737e-06, + "loss": 0.584, + "step": 5324 + }, + { + "epoch": 0.6542572797640988, + "grad_norm": 1.2493455160686304, + "learning_rate": 9.661035504085065e-06, + "loss": 0.5511, + "step": 5325 + }, + { + "epoch": 0.6543801449809559, + "grad_norm": 1.2508294648588896, + "learning_rate": 9.655024001810662e-06, + "loss": 0.5959, + "step": 5326 + }, + { + "epoch": 0.654503010197813, + "grad_norm": 1.2236164578141877, + "learning_rate": 9.64901348270993e-06, + "loss": 0.534, + "step": 5327 + }, + { + "epoch": 0.6546258754146701, + "grad_norm": 1.3133118623805917, + "learning_rate": 9.643003947888465e-06, + "loss": 0.6851, + "step": 5328 + }, + { + "epoch": 0.6547487406315272, + "grad_norm": 1.1594122077009854, + "learning_rate": 9.636995398451677e-06, + "loss": 0.7326, + "step": 5329 + }, + { + "epoch": 0.6548716058483843, + "grad_norm": 1.217171023238275, + "learning_rate": 9.630987835504811e-06, + "loss": 0.5441, + "step": 5330 + }, + { + "epoch": 0.6549944710652414, + "grad_norm": 1.4117247534004862, + "learning_rate": 9.624981260152914e-06, + "loss": 0.6259, + "step": 5331 + }, + { + "epoch": 0.6551173362820986, + "grad_norm": 2.001120900883098, + "learning_rate": 9.618975673500856e-06, + "loss": 0.7654, + "step": 5332 + }, + { + "epoch": 0.6552402014989557, + "grad_norm": 1.3672219950205813, + "learning_rate": 9.61297107665332e-06, + "loss": 0.686, + "step": 5333 + }, + { + "epoch": 0.6553630667158128, + "grad_norm": 1.2726882156130843, + "learning_rate": 9.606967470714826e-06, + "loss": 0.5582, + "step": 5334 + }, + { + "epoch": 0.6554859319326699, + "grad_norm": 1.1354256229893647, + "learning_rate": 9.600964856789688e-06, + "loss": 0.7643, + "step": 5335 + }, + { + "epoch": 0.655608797149527, + "grad_norm": 1.2215005791144686, + "learning_rate": 9.59496323598205e-06, + "loss": 0.4949, + "step": 5336 + }, + { + "epoch": 0.655731662366384, + "grad_norm": 1.0541478514378368, + "learning_rate": 9.588962609395867e-06, + "loss": 0.7002, + "step": 5337 + }, + { + "epoch": 0.6558545275832411, + "grad_norm": 1.465652481026434, + "learning_rate": 9.582962978134924e-06, + "loss": 0.6266, + "step": 5338 + }, + { + "epoch": 0.6559773928000983, + "grad_norm": 1.0962883641339438, + "learning_rate": 9.576964343302812e-06, + "loss": 0.5217, + "step": 5339 + }, + { + "epoch": 0.6561002580169554, + "grad_norm": 1.193801360057326, + "learning_rate": 9.570966706002941e-06, + "loss": 0.6089, + "step": 5340 + }, + { + "epoch": 0.6562231232338125, + "grad_norm": 1.2277485101500245, + "learning_rate": 9.564970067338532e-06, + "loss": 0.499, + "step": 5341 + }, + { + "epoch": 0.6563459884506696, + "grad_norm": 1.2180841616270914, + "learning_rate": 9.558974428412634e-06, + "loss": 0.541, + "step": 5342 + }, + { + "epoch": 0.6564688536675267, + "grad_norm": 1.2314664392934864, + "learning_rate": 9.552979790328105e-06, + "loss": 0.7058, + "step": 5343 + }, + { + "epoch": 0.6565917188843838, + "grad_norm": 1.228727883913474, + "learning_rate": 9.54698615418761e-06, + "loss": 0.5556, + "step": 5344 + }, + { + "epoch": 0.656714584101241, + "grad_norm": 1.1654334981410415, + "learning_rate": 9.540993521093654e-06, + "loss": 0.6523, + "step": 5345 + }, + { + "epoch": 0.6568374493180981, + "grad_norm": 1.1999711558691315, + "learning_rate": 9.535001892148538e-06, + "loss": 0.5968, + 
"step": 5346 + }, + { + "epoch": 0.6569603145349552, + "grad_norm": 1.2870370400066877, + "learning_rate": 9.529011268454384e-06, + "loss": 0.5379, + "step": 5347 + }, + { + "epoch": 0.6570831797518123, + "grad_norm": 1.2023577931237242, + "learning_rate": 9.523021651113118e-06, + "loss": 0.6441, + "step": 5348 + }, + { + "epoch": 0.6572060449686694, + "grad_norm": 1.0609645248971382, + "learning_rate": 9.517033041226506e-06, + "loss": 0.6309, + "step": 5349 + }, + { + "epoch": 0.6573289101855265, + "grad_norm": 1.4880148474394845, + "learning_rate": 9.51104543989611e-06, + "loss": 0.5704, + "step": 5350 + }, + { + "epoch": 0.6574517754023835, + "grad_norm": 1.0767347327570003, + "learning_rate": 9.505058848223306e-06, + "loss": 0.6691, + "step": 5351 + }, + { + "epoch": 0.6575746406192406, + "grad_norm": 1.307202023110517, + "learning_rate": 9.49907326730929e-06, + "loss": 0.6625, + "step": 5352 + }, + { + "epoch": 0.6576975058360978, + "grad_norm": 1.048024084907999, + "learning_rate": 9.49308869825507e-06, + "loss": 0.6016, + "step": 5353 + }, + { + "epoch": 0.6578203710529549, + "grad_norm": 1.1587684366783015, + "learning_rate": 9.487105142161475e-06, + "loss": 0.5388, + "step": 5354 + }, + { + "epoch": 0.657943236269812, + "grad_norm": 1.2228436495906432, + "learning_rate": 9.481122600129137e-06, + "loss": 0.5212, + "step": 5355 + }, + { + "epoch": 0.6580661014866691, + "grad_norm": 1.1490678571680035, + "learning_rate": 9.475141073258498e-06, + "loss": 0.5049, + "step": 5356 + }, + { + "epoch": 0.6581889667035262, + "grad_norm": 1.3421421990973672, + "learning_rate": 9.469160562649832e-06, + "loss": 0.766, + "step": 5357 + }, + { + "epoch": 0.6583118319203833, + "grad_norm": 1.2023039626992391, + "learning_rate": 9.463181069403216e-06, + "loss": 0.5594, + "step": 5358 + }, + { + "epoch": 0.6584346971372405, + "grad_norm": 1.264128815449993, + "learning_rate": 9.457202594618532e-06, + "loss": 0.5399, + "step": 5359 + }, + { + "epoch": 0.6585575623540976, + "grad_norm": 1.0302714376500743, + "learning_rate": 9.451225139395482e-06, + "loss": 0.5711, + "step": 5360 + }, + { + "epoch": 0.6586804275709547, + "grad_norm": 1.476872702635223, + "learning_rate": 9.445248704833587e-06, + "loss": 0.5419, + "step": 5361 + }, + { + "epoch": 0.6588032927878118, + "grad_norm": 1.26631484840403, + "learning_rate": 9.439273292032168e-06, + "loss": 0.7485, + "step": 5362 + }, + { + "epoch": 0.6589261580046689, + "grad_norm": 1.2814609963682926, + "learning_rate": 9.43329890209036e-06, + "loss": 0.5972, + "step": 5363 + }, + { + "epoch": 0.659049023221526, + "grad_norm": 1.1596355018970608, + "learning_rate": 9.42732553610712e-06, + "loss": 0.5386, + "step": 5364 + }, + { + "epoch": 0.6591718884383831, + "grad_norm": 1.1325534435231255, + "learning_rate": 9.42135319518121e-06, + "loss": 0.6117, + "step": 5365 + }, + { + "epoch": 0.6592947536552402, + "grad_norm": 1.562270049191132, + "learning_rate": 9.4153818804112e-06, + "loss": 0.6685, + "step": 5366 + }, + { + "epoch": 0.6594176188720973, + "grad_norm": 1.1063936691464893, + "learning_rate": 9.409411592895469e-06, + "loss": 0.69, + "step": 5367 + }, + { + "epoch": 0.6595404840889544, + "grad_norm": 1.3756707219684667, + "learning_rate": 9.403442333732227e-06, + "loss": 0.637, + "step": 5368 + }, + { + "epoch": 0.6596633493058115, + "grad_norm": 1.1309852628932937, + "learning_rate": 9.397474104019471e-06, + "loss": 0.5289, + "step": 5369 + }, + { + "epoch": 0.6597862145226686, + "grad_norm": 1.3092417761333737, + "learning_rate": 
9.391506904855022e-06, + "loss": 0.4882, + "step": 5370 + }, + { + "epoch": 0.6599090797395257, + "grad_norm": 1.1447562042080837, + "learning_rate": 9.385540737336502e-06, + "loss": 0.6453, + "step": 5371 + }, + { + "epoch": 0.6600319449563828, + "grad_norm": 1.1488727688198235, + "learning_rate": 9.379575602561355e-06, + "loss": 0.634, + "step": 5372 + }, + { + "epoch": 0.66015481017324, + "grad_norm": 1.3106853827476563, + "learning_rate": 9.373611501626826e-06, + "loss": 0.5356, + "step": 5373 + }, + { + "epoch": 0.6602776753900971, + "grad_norm": 1.3234032757337477, + "learning_rate": 9.367648435629973e-06, + "loss": 0.6864, + "step": 5374 + }, + { + "epoch": 0.6604005406069542, + "grad_norm": 1.2746991808922827, + "learning_rate": 9.361686405667657e-06, + "loss": 0.6179, + "step": 5375 + }, + { + "epoch": 0.6605234058238113, + "grad_norm": 1.3999372429350039, + "learning_rate": 9.355725412836565e-06, + "loss": 0.6541, + "step": 5376 + }, + { + "epoch": 0.6606462710406684, + "grad_norm": 1.191222028739644, + "learning_rate": 9.349765458233182e-06, + "loss": 0.5608, + "step": 5377 + }, + { + "epoch": 0.6607691362575255, + "grad_norm": 1.1355170343186067, + "learning_rate": 9.343806542953798e-06, + "loss": 0.6242, + "step": 5378 + }, + { + "epoch": 0.6608920014743827, + "grad_norm": 0.9866164644610046, + "learning_rate": 9.337848668094517e-06, + "loss": 0.5584, + "step": 5379 + }, + { + "epoch": 0.6610148666912397, + "grad_norm": 1.6435242941303976, + "learning_rate": 9.331891834751254e-06, + "loss": 0.6659, + "step": 5380 + }, + { + "epoch": 0.6611377319080968, + "grad_norm": 1.289472696632637, + "learning_rate": 9.32593604401973e-06, + "loss": 0.5339, + "step": 5381 + }, + { + "epoch": 0.6612605971249539, + "grad_norm": 1.2474689524451454, + "learning_rate": 9.319981296995474e-06, + "loss": 0.4602, + "step": 5382 + }, + { + "epoch": 0.661383462341811, + "grad_norm": 1.2936556684083684, + "learning_rate": 9.314027594773816e-06, + "loss": 0.5094, + "step": 5383 + }, + { + "epoch": 0.6615063275586681, + "grad_norm": 1.2774031140397493, + "learning_rate": 9.308074938449914e-06, + "loss": 0.5824, + "step": 5384 + }, + { + "epoch": 0.6616291927755252, + "grad_norm": 1.4407886352203754, + "learning_rate": 9.302123329118712e-06, + "loss": 0.5348, + "step": 5385 + }, + { + "epoch": 0.6617520579923823, + "grad_norm": 1.485692404116567, + "learning_rate": 9.296172767874966e-06, + "loss": 0.6685, + "step": 5386 + }, + { + "epoch": 0.6618749232092395, + "grad_norm": 1.1557099975458156, + "learning_rate": 9.290223255813256e-06, + "loss": 0.5276, + "step": 5387 + }, + { + "epoch": 0.6619977884260966, + "grad_norm": 1.139284771303178, + "learning_rate": 9.284274794027947e-06, + "loss": 0.6, + "step": 5388 + }, + { + "epoch": 0.6621206536429537, + "grad_norm": 1.2846014433444866, + "learning_rate": 9.278327383613224e-06, + "loss": 0.5227, + "step": 5389 + }, + { + "epoch": 0.6622435188598108, + "grad_norm": 1.114264627324994, + "learning_rate": 9.272381025663068e-06, + "loss": 0.5179, + "step": 5390 + }, + { + "epoch": 0.6623663840766679, + "grad_norm": 1.3627667365033898, + "learning_rate": 9.26643572127128e-06, + "loss": 0.6022, + "step": 5391 + }, + { + "epoch": 0.662489249293525, + "grad_norm": 1.055190288150068, + "learning_rate": 9.260491471531459e-06, + "loss": 0.5065, + "step": 5392 + }, + { + "epoch": 0.6626121145103822, + "grad_norm": 1.0569602707736216, + "learning_rate": 9.254548277537008e-06, + "loss": 0.4378, + "step": 5393 + }, + { + "epoch": 0.6627349797272393, + "grad_norm": 
1.49087855437183, + "learning_rate": 9.248606140381135e-06, + "loss": 0.4596, + "step": 5394 + }, + { + "epoch": 0.6628578449440963, + "grad_norm": 1.3196504211542666, + "learning_rate": 9.242665061156871e-06, + "loss": 0.6815, + "step": 5395 + }, + { + "epoch": 0.6629807101609534, + "grad_norm": 1.0984101601837264, + "learning_rate": 9.236725040957032e-06, + "loss": 0.571, + "step": 5396 + }, + { + "epoch": 0.6631035753778105, + "grad_norm": 1.077449267281618, + "learning_rate": 9.230786080874243e-06, + "loss": 0.505, + "step": 5397 + }, + { + "epoch": 0.6632264405946676, + "grad_norm": 1.213793907955046, + "learning_rate": 9.224848182000937e-06, + "loss": 0.5252, + "step": 5398 + }, + { + "epoch": 0.6633493058115247, + "grad_norm": 1.4378149659779227, + "learning_rate": 9.21891134542936e-06, + "loss": 0.5563, + "step": 5399 + }, + { + "epoch": 0.6634721710283819, + "grad_norm": 0.9751296963284363, + "learning_rate": 9.212975572251547e-06, + "loss": 0.6215, + "step": 5400 + }, + { + "epoch": 0.663595036245239, + "grad_norm": 1.4951367928944619, + "learning_rate": 9.207040863559349e-06, + "loss": 0.5374, + "step": 5401 + }, + { + "epoch": 0.6637179014620961, + "grad_norm": 1.2108652053668667, + "learning_rate": 9.201107220444407e-06, + "loss": 0.5228, + "step": 5402 + }, + { + "epoch": 0.6638407666789532, + "grad_norm": 1.1732498410045245, + "learning_rate": 9.195174643998193e-06, + "loss": 0.6343, + "step": 5403 + }, + { + "epoch": 0.6639636318958103, + "grad_norm": 1.2948022279847706, + "learning_rate": 9.189243135311957e-06, + "loss": 0.6186, + "step": 5404 + }, + { + "epoch": 0.6640864971126674, + "grad_norm": 1.4402870232333937, + "learning_rate": 9.183312695476762e-06, + "loss": 0.6093, + "step": 5405 + }, + { + "epoch": 0.6642093623295245, + "grad_norm": 1.389453362639374, + "learning_rate": 9.17738332558347e-06, + "loss": 0.6265, + "step": 5406 + }, + { + "epoch": 0.6643322275463817, + "grad_norm": 1.2936087231928903, + "learning_rate": 9.171455026722757e-06, + "loss": 0.5835, + "step": 5407 + }, + { + "epoch": 0.6644550927632388, + "grad_norm": 1.2082915818582454, + "learning_rate": 9.165527799985095e-06, + "loss": 0.6917, + "step": 5408 + }, + { + "epoch": 0.6645779579800959, + "grad_norm": 1.1547852039785829, + "learning_rate": 9.159601646460752e-06, + "loss": 0.6423, + "step": 5409 + }, + { + "epoch": 0.6647008231969529, + "grad_norm": 1.1063935836555865, + "learning_rate": 9.153676567239812e-06, + "loss": 0.6318, + "step": 5410 + }, + { + "epoch": 0.66482368841381, + "grad_norm": 1.2306009537175067, + "learning_rate": 9.147752563412155e-06, + "loss": 0.4776, + "step": 5411 + }, + { + "epoch": 0.6649465536306671, + "grad_norm": 1.2109842315696715, + "learning_rate": 9.141829636067458e-06, + "loss": 0.5815, + "step": 5412 + }, + { + "epoch": 0.6650694188475242, + "grad_norm": 1.2956606503671095, + "learning_rate": 9.135907786295204e-06, + "loss": 0.5081, + "step": 5413 + }, + { + "epoch": 0.6651922840643814, + "grad_norm": 1.1521563306302667, + "learning_rate": 9.129987015184687e-06, + "loss": 0.706, + "step": 5414 + }, + { + "epoch": 0.6653151492812385, + "grad_norm": 1.0816758133943258, + "learning_rate": 9.124067323824993e-06, + "loss": 0.6197, + "step": 5415 + }, + { + "epoch": 0.6654380144980956, + "grad_norm": 1.1961976594312118, + "learning_rate": 9.118148713305006e-06, + "loss": 0.608, + "step": 5416 + }, + { + "epoch": 0.6655608797149527, + "grad_norm": 1.1679838637887987, + "learning_rate": 9.112231184713415e-06, + "loss": 0.5025, + "step": 5417 + }, + { + "epoch": 
0.6656837449318098, + "grad_norm": 1.1105847892963847, + "learning_rate": 9.106314739138718e-06, + "loss": 0.5669, + "step": 5418 + }, + { + "epoch": 0.6658066101486669, + "grad_norm": 1.1834194609770026, + "learning_rate": 9.100399377669203e-06, + "loss": 0.5884, + "step": 5419 + }, + { + "epoch": 0.665929475365524, + "grad_norm": 1.400524371713395, + "learning_rate": 9.09448510139296e-06, + "loss": 0.568, + "step": 5420 + }, + { + "epoch": 0.6660523405823812, + "grad_norm": 1.209159503677965, + "learning_rate": 9.088571911397882e-06, + "loss": 0.5615, + "step": 5421 + }, + { + "epoch": 0.6661752057992383, + "grad_norm": 1.1652976737596819, + "learning_rate": 9.082659808771666e-06, + "loss": 0.5745, + "step": 5422 + }, + { + "epoch": 0.6662980710160954, + "grad_norm": 1.3699659286161283, + "learning_rate": 9.076748794601803e-06, + "loss": 0.4894, + "step": 5423 + }, + { + "epoch": 0.6664209362329524, + "grad_norm": 1.3575808020092126, + "learning_rate": 9.070838869975587e-06, + "loss": 0.7357, + "step": 5424 + }, + { + "epoch": 0.6665438014498095, + "grad_norm": 1.1252436867469335, + "learning_rate": 9.064930035980104e-06, + "loss": 0.6039, + "step": 5425 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.3042482167348968, + "learning_rate": 9.059022293702257e-06, + "loss": 0.5965, + "step": 5426 + }, + { + "epoch": 0.6667895318835237, + "grad_norm": 1.122768962709427, + "learning_rate": 9.053115644228729e-06, + "loss": 0.5935, + "step": 5427 + }, + { + "epoch": 0.6669123971003809, + "grad_norm": 1.9934774910091, + "learning_rate": 9.047210088646005e-06, + "loss": 0.6156, + "step": 5428 + }, + { + "epoch": 0.667035262317238, + "grad_norm": 1.303768673261939, + "learning_rate": 9.04130562804039e-06, + "loss": 0.5988, + "step": 5429 + }, + { + "epoch": 0.6671581275340951, + "grad_norm": 1.3457535097062097, + "learning_rate": 9.035402263497956e-06, + "loss": 0.5987, + "step": 5430 + }, + { + "epoch": 0.6672809927509522, + "grad_norm": 1.1191112661120572, + "learning_rate": 9.029499996104594e-06, + "loss": 0.5536, + "step": 5431 + }, + { + "epoch": 0.6674038579678093, + "grad_norm": 1.386817838672882, + "learning_rate": 9.023598826945983e-06, + "loss": 0.669, + "step": 5432 + }, + { + "epoch": 0.6675267231846664, + "grad_norm": 1.1795479657420052, + "learning_rate": 9.017698757107618e-06, + "loss": 0.5726, + "step": 5433 + }, + { + "epoch": 0.6676495884015236, + "grad_norm": 1.3494724693097804, + "learning_rate": 9.011799787674767e-06, + "loss": 0.5632, + "step": 5434 + }, + { + "epoch": 0.6677724536183807, + "grad_norm": 1.167954993697105, + "learning_rate": 9.00590191973251e-06, + "loss": 0.672, + "step": 5435 + }, + { + "epoch": 0.6678953188352378, + "grad_norm": 1.4644311999685764, + "learning_rate": 9.00000515436572e-06, + "loss": 0.5559, + "step": 5436 + }, + { + "epoch": 0.6680181840520949, + "grad_norm": 1.081215350980237, + "learning_rate": 8.994109492659072e-06, + "loss": 0.4664, + "step": 5437 + }, + { + "epoch": 0.668141049268952, + "grad_norm": 1.0673320155500754, + "learning_rate": 8.988214935697036e-06, + "loss": 0.5612, + "step": 5438 + }, + { + "epoch": 0.668263914485809, + "grad_norm": 1.2120820358769335, + "learning_rate": 8.982321484563872e-06, + "loss": 0.4739, + "step": 5439 + }, + { + "epoch": 0.6683867797026661, + "grad_norm": 1.355342133240704, + "learning_rate": 8.976429140343639e-06, + "loss": 0.6064, + "step": 5440 + }, + { + "epoch": 0.6685096449195232, + "grad_norm": 1.1262678856199464, + "learning_rate": 8.970537904120211e-06, + "loss": 0.6567, + "step": 
5441 + }, + { + "epoch": 0.6686325101363804, + "grad_norm": 1.1410304089779084, + "learning_rate": 8.96464777697723e-06, + "loss": 0.5707, + "step": 5442 + }, + { + "epoch": 0.6687553753532375, + "grad_norm": 1.2759615966077273, + "learning_rate": 8.95875875999815e-06, + "loss": 0.6248, + "step": 5443 + }, + { + "epoch": 0.6688782405700946, + "grad_norm": 1.1092249027774848, + "learning_rate": 8.952870854266214e-06, + "loss": 0.537, + "step": 5444 + }, + { + "epoch": 0.6690011057869517, + "grad_norm": 1.3107109402210457, + "learning_rate": 8.946984060864471e-06, + "loss": 0.6066, + "step": 5445 + }, + { + "epoch": 0.6691239710038088, + "grad_norm": 1.1387502955950397, + "learning_rate": 8.941098380875754e-06, + "loss": 0.6675, + "step": 5446 + }, + { + "epoch": 0.6692468362206659, + "grad_norm": 1.301273981193034, + "learning_rate": 8.935213815382698e-06, + "loss": 0.5176, + "step": 5447 + }, + { + "epoch": 0.6693697014375231, + "grad_norm": 1.3812533932021833, + "learning_rate": 8.929330365467722e-06, + "loss": 0.5594, + "step": 5448 + }, + { + "epoch": 0.6694925666543802, + "grad_norm": 1.1942232795292123, + "learning_rate": 8.923448032213062e-06, + "loss": 0.5626, + "step": 5449 + }, + { + "epoch": 0.6696154318712373, + "grad_norm": 1.232252205130166, + "learning_rate": 8.917566816700729e-06, + "loss": 0.5891, + "step": 5450 + }, + { + "epoch": 0.6697382970880944, + "grad_norm": 1.1350519775079593, + "learning_rate": 8.911686720012527e-06, + "loss": 0.5841, + "step": 5451 + }, + { + "epoch": 0.6698611623049515, + "grad_norm": 1.3022207139822306, + "learning_rate": 8.905807743230075e-06, + "loss": 0.695, + "step": 5452 + }, + { + "epoch": 0.6699840275218085, + "grad_norm": 0.9273241252514355, + "learning_rate": 8.899929887434767e-06, + "loss": 0.4556, + "step": 5453 + }, + { + "epoch": 0.6701068927386656, + "grad_norm": 1.356155548395237, + "learning_rate": 8.894053153707798e-06, + "loss": 0.5184, + "step": 5454 + }, + { + "epoch": 0.6702297579555228, + "grad_norm": 1.256758526351901, + "learning_rate": 8.888177543130144e-06, + "loss": 0.5238, + "step": 5455 + }, + { + "epoch": 0.6703526231723799, + "grad_norm": 1.2394370149420395, + "learning_rate": 8.882303056782603e-06, + "loss": 0.56, + "step": 5456 + }, + { + "epoch": 0.670475488389237, + "grad_norm": 1.3273126706571647, + "learning_rate": 8.876429695745739e-06, + "loss": 0.6606, + "step": 5457 + }, + { + "epoch": 0.6705983536060941, + "grad_norm": 1.2528763817866215, + "learning_rate": 8.870557461099917e-06, + "loss": 0.5407, + "step": 5458 + }, + { + "epoch": 0.6707212188229512, + "grad_norm": 1.3352729395421492, + "learning_rate": 8.864686353925295e-06, + "loss": 0.5726, + "step": 5459 + }, + { + "epoch": 0.6708440840398083, + "grad_norm": 1.293680801362028, + "learning_rate": 8.858816375301836e-06, + "loss": 0.4985, + "step": 5460 + }, + { + "epoch": 0.6709669492566654, + "grad_norm": 1.0662788707630826, + "learning_rate": 8.852947526309278e-06, + "loss": 0.5809, + "step": 5461 + }, + { + "epoch": 0.6710898144735226, + "grad_norm": 1.153598010778161, + "learning_rate": 8.847079808027156e-06, + "loss": 0.6293, + "step": 5462 + }, + { + "epoch": 0.6712126796903797, + "grad_norm": 1.1362262382995019, + "learning_rate": 8.841213221534798e-06, + "loss": 0.6192, + "step": 5463 + }, + { + "epoch": 0.6713355449072368, + "grad_norm": 1.1157536209690637, + "learning_rate": 8.835347767911329e-06, + "loss": 0.5541, + "step": 5464 + }, + { + "epoch": 0.6714584101240939, + "grad_norm": 1.1504621027864248, + "learning_rate": 
8.829483448235659e-06, + "loss": 0.5192, + "step": 5465 + }, + { + "epoch": 0.671581275340951, + "grad_norm": 1.1759856056385154, + "learning_rate": 8.823620263586493e-06, + "loss": 0.591, + "step": 5466 + }, + { + "epoch": 0.6717041405578081, + "grad_norm": 1.3999741737805917, + "learning_rate": 8.817758215042316e-06, + "loss": 0.5875, + "step": 5467 + }, + { + "epoch": 0.6718270057746651, + "grad_norm": 1.4631762116027573, + "learning_rate": 8.81189730368143e-06, + "loss": 0.5893, + "step": 5468 + }, + { + "epoch": 0.6719498709915223, + "grad_norm": 1.3062169732002271, + "learning_rate": 8.806037530581904e-06, + "loss": 0.5323, + "step": 5469 + }, + { + "epoch": 0.6720727362083794, + "grad_norm": 1.038090937012699, + "learning_rate": 8.800178896821597e-06, + "loss": 0.6111, + "step": 5470 + }, + { + "epoch": 0.6721956014252365, + "grad_norm": 1.4464513839744553, + "learning_rate": 8.794321403478182e-06, + "loss": 0.5131, + "step": 5471 + }, + { + "epoch": 0.6723184666420936, + "grad_norm": 1.2818843260301203, + "learning_rate": 8.788465051629101e-06, + "loss": 0.5507, + "step": 5472 + }, + { + "epoch": 0.6724413318589507, + "grad_norm": 1.0063043528163842, + "learning_rate": 8.782609842351587e-06, + "loss": 0.5657, + "step": 5473 + }, + { + "epoch": 0.6725641970758078, + "grad_norm": 1.2237530110480506, + "learning_rate": 8.77675577672267e-06, + "loss": 0.6172, + "step": 5474 + }, + { + "epoch": 0.672687062292665, + "grad_norm": 1.6325690823511276, + "learning_rate": 8.770902855819174e-06, + "loss": 0.7306, + "step": 5475 + }, + { + "epoch": 0.6728099275095221, + "grad_norm": 1.122716475378354, + "learning_rate": 8.765051080717696e-06, + "loss": 0.5497, + "step": 5476 + }, + { + "epoch": 0.6729327927263792, + "grad_norm": 1.0929586403750307, + "learning_rate": 8.75920045249464e-06, + "loss": 0.5477, + "step": 5477 + }, + { + "epoch": 0.6730556579432363, + "grad_norm": 1.104066995462852, + "learning_rate": 8.75335097222618e-06, + "loss": 0.5666, + "step": 5478 + }, + { + "epoch": 0.6731785231600934, + "grad_norm": 1.3603928357760826, + "learning_rate": 8.74750264098831e-06, + "loss": 0.6951, + "step": 5479 + }, + { + "epoch": 0.6733013883769505, + "grad_norm": 1.191752585859024, + "learning_rate": 8.74165545985677e-06, + "loss": 0.6171, + "step": 5480 + }, + { + "epoch": 0.6734242535938076, + "grad_norm": 1.1956443640015022, + "learning_rate": 8.73580942990713e-06, + "loss": 0.6082, + "step": 5481 + }, + { + "epoch": 0.6735471188106646, + "grad_norm": 1.6400869581029824, + "learning_rate": 8.729964552214708e-06, + "loss": 0.5369, + "step": 5482 + }, + { + "epoch": 0.6736699840275218, + "grad_norm": 1.039769184010898, + "learning_rate": 8.724120827854657e-06, + "loss": 0.6166, + "step": 5483 + }, + { + "epoch": 0.6737928492443789, + "grad_norm": 1.4290270239236416, + "learning_rate": 8.718278257901872e-06, + "loss": 0.6507, + "step": 5484 + }, + { + "epoch": 0.673915714461236, + "grad_norm": 1.20287783400799, + "learning_rate": 8.712436843431068e-06, + "loss": 0.5226, + "step": 5485 + }, + { + "epoch": 0.6740385796780931, + "grad_norm": 1.1769862999940464, + "learning_rate": 8.70659658551672e-06, + "loss": 0.546, + "step": 5486 + }, + { + "epoch": 0.6741614448949502, + "grad_norm": 1.4146889874229311, + "learning_rate": 8.700757485233126e-06, + "loss": 0.6247, + "step": 5487 + }, + { + "epoch": 0.6742843101118073, + "grad_norm": 0.9900874639645196, + "learning_rate": 8.694919543654337e-06, + "loss": 0.5139, + "step": 5488 + }, + { + "epoch": 0.6744071753286645, + "grad_norm": 
1.1892280813901905, + "learning_rate": 8.689082761854213e-06, + "loss": 0.6207, + "step": 5489 + }, + { + "epoch": 0.6745300405455216, + "grad_norm": 1.3221558244849385, + "learning_rate": 8.683247140906382e-06, + "loss": 0.5785, + "step": 5490 + }, + { + "epoch": 0.6746529057623787, + "grad_norm": 1.200893578318306, + "learning_rate": 8.677412681884273e-06, + "loss": 0.5756, + "step": 5491 + }, + { + "epoch": 0.6747757709792358, + "grad_norm": 1.2019424888020496, + "learning_rate": 8.671579385861105e-06, + "loss": 0.6227, + "step": 5492 + }, + { + "epoch": 0.6748986361960929, + "grad_norm": 1.345512818188321, + "learning_rate": 8.665747253909855e-06, + "loss": 0.5456, + "step": 5493 + }, + { + "epoch": 0.67502150141295, + "grad_norm": 1.1983695490545931, + "learning_rate": 8.659916287103329e-06, + "loss": 0.6501, + "step": 5494 + }, + { + "epoch": 0.6751443666298071, + "grad_norm": 1.5775270603329956, + "learning_rate": 8.65408648651408e-06, + "loss": 0.6643, + "step": 5495 + }, + { + "epoch": 0.6752672318466643, + "grad_norm": 1.2710142579865846, + "learning_rate": 8.648257853214474e-06, + "loss": 0.5551, + "step": 5496 + }, + { + "epoch": 0.6753900970635213, + "grad_norm": 1.3598773955389767, + "learning_rate": 8.642430388276638e-06, + "loss": 0.6102, + "step": 5497 + }, + { + "epoch": 0.6755129622803784, + "grad_norm": 1.2568881776897751, + "learning_rate": 8.6366040927725e-06, + "loss": 0.5403, + "step": 5498 + }, + { + "epoch": 0.6756358274972355, + "grad_norm": 1.080857358681369, + "learning_rate": 8.630778967773777e-06, + "loss": 0.7155, + "step": 5499 + }, + { + "epoch": 0.6757586927140926, + "grad_norm": 1.3515546552574471, + "learning_rate": 8.624955014351953e-06, + "loss": 0.559, + "step": 5500 + }, + { + "epoch": 0.6758815579309497, + "grad_norm": 1.1591632038766402, + "learning_rate": 8.619132233578308e-06, + "loss": 0.5517, + "step": 5501 + }, + { + "epoch": 0.6760044231478068, + "grad_norm": 1.141116659641108, + "learning_rate": 8.61331062652391e-06, + "loss": 0.6355, + "step": 5502 + }, + { + "epoch": 0.676127288364664, + "grad_norm": 1.138180326058647, + "learning_rate": 8.607490194259606e-06, + "loss": 0.564, + "step": 5503 + }, + { + "epoch": 0.6762501535815211, + "grad_norm": 1.2228504948581862, + "learning_rate": 8.60167093785602e-06, + "loss": 0.5694, + "step": 5504 + }, + { + "epoch": 0.6763730187983782, + "grad_norm": 1.2198642429638746, + "learning_rate": 8.59585285838357e-06, + "loss": 0.5719, + "step": 5505 + }, + { + "epoch": 0.6764958840152353, + "grad_norm": 1.4840411341599562, + "learning_rate": 8.590035956912461e-06, + "loss": 0.606, + "step": 5506 + }, + { + "epoch": 0.6766187492320924, + "grad_norm": 1.188612743520994, + "learning_rate": 8.58422023451266e-06, + "loss": 0.5476, + "step": 5507 + }, + { + "epoch": 0.6767416144489495, + "grad_norm": 1.234020954089997, + "learning_rate": 8.578405692253945e-06, + "loss": 0.4883, + "step": 5508 + }, + { + "epoch": 0.6768644796658067, + "grad_norm": 1.1491567658586752, + "learning_rate": 8.572592331205849e-06, + "loss": 0.5469, + "step": 5509 + }, + { + "epoch": 0.6769873448826638, + "grad_norm": 1.5177614857178434, + "learning_rate": 8.566780152437717e-06, + "loss": 0.6416, + "step": 5510 + }, + { + "epoch": 0.6771102100995208, + "grad_norm": 1.2471498210336696, + "learning_rate": 8.560969157018655e-06, + "loss": 0.5462, + "step": 5511 + }, + { + "epoch": 0.6772330753163779, + "grad_norm": 1.5264952298764982, + "learning_rate": 8.555159346017559e-06, + "loss": 0.5744, + "step": 5512 + }, + { + "epoch": 
0.677355940533235, + "grad_norm": 1.1324340690621615, + "learning_rate": 8.549350720503094e-06, + "loss": 0.5191, + "step": 5513 + }, + { + "epoch": 0.6774788057500921, + "grad_norm": 1.0421755327117062, + "learning_rate": 8.543543281543745e-06, + "loss": 0.4953, + "step": 5514 + }, + { + "epoch": 0.6776016709669492, + "grad_norm": 1.463222438847887, + "learning_rate": 8.537737030207728e-06, + "loss": 0.5596, + "step": 5515 + }, + { + "epoch": 0.6777245361838063, + "grad_norm": 1.2833605367939092, + "learning_rate": 8.531931967563078e-06, + "loss": 0.4573, + "step": 5516 + }, + { + "epoch": 0.6778474014006635, + "grad_norm": 1.298967859654523, + "learning_rate": 8.5261280946776e-06, + "loss": 0.5422, + "step": 5517 + }, + { + "epoch": 0.6779702666175206, + "grad_norm": 1.1025000833250829, + "learning_rate": 8.520325412618868e-06, + "loss": 0.6198, + "step": 5518 + }, + { + "epoch": 0.6780931318343777, + "grad_norm": 1.189920006343685, + "learning_rate": 8.514523922454263e-06, + "loss": 0.5021, + "step": 5519 + }, + { + "epoch": 0.6782159970512348, + "grad_norm": 1.165503409133728, + "learning_rate": 8.508723625250907e-06, + "loss": 0.608, + "step": 5520 + }, + { + "epoch": 0.6783388622680919, + "grad_norm": 1.5275667204899208, + "learning_rate": 8.502924522075757e-06, + "loss": 0.5433, + "step": 5521 + }, + { + "epoch": 0.678461727484949, + "grad_norm": 1.1290585154282597, + "learning_rate": 8.4971266139955e-06, + "loss": 0.6372, + "step": 5522 + }, + { + "epoch": 0.6785845927018062, + "grad_norm": 1.095017104860675, + "learning_rate": 8.491329902076635e-06, + "loss": 0.5229, + "step": 5523 + }, + { + "epoch": 0.6787074579186633, + "grad_norm": 0.9903107429495395, + "learning_rate": 8.48553438738542e-06, + "loss": 0.5756, + "step": 5524 + }, + { + "epoch": 0.6788303231355204, + "grad_norm": 1.0684487916166505, + "learning_rate": 8.479740070987904e-06, + "loss": 0.4929, + "step": 5525 + }, + { + "epoch": 0.6789531883523774, + "grad_norm": 1.3674926520025616, + "learning_rate": 8.473946953949924e-06, + "loss": 0.5404, + "step": 5526 + }, + { + "epoch": 0.6790760535692345, + "grad_norm": 1.2398411357492116, + "learning_rate": 8.468155037337072e-06, + "loss": 0.5943, + "step": 5527 + }, + { + "epoch": 0.6791989187860916, + "grad_norm": 1.206815087083274, + "learning_rate": 8.462364322214742e-06, + "loss": 0.4884, + "step": 5528 + }, + { + "epoch": 0.6793217840029487, + "grad_norm": 1.0987973419464097, + "learning_rate": 8.456574809648096e-06, + "loss": 0.61, + "step": 5529 + }, + { + "epoch": 0.6794446492198059, + "grad_norm": 1.1972945623907818, + "learning_rate": 8.450786500702084e-06, + "loss": 0.5951, + "step": 5530 + }, + { + "epoch": 0.679567514436663, + "grad_norm": 1.4032252927207136, + "learning_rate": 8.444999396441416e-06, + "loss": 0.5459, + "step": 5531 + }, + { + "epoch": 0.6796903796535201, + "grad_norm": 1.3397801117162873, + "learning_rate": 8.439213497930598e-06, + "loss": 0.5412, + "step": 5532 + }, + { + "epoch": 0.6798132448703772, + "grad_norm": 1.0261508310976428, + "learning_rate": 8.43342880623391e-06, + "loss": 0.5465, + "step": 5533 + }, + { + "epoch": 0.6799361100872343, + "grad_norm": 1.1639757900258374, + "learning_rate": 8.427645322415412e-06, + "loss": 0.6101, + "step": 5534 + }, + { + "epoch": 0.6800589753040914, + "grad_norm": 1.3484777650577826, + "learning_rate": 8.42186304753893e-06, + "loss": 0.7159, + "step": 5535 + }, + { + "epoch": 0.6801818405209485, + "grad_norm": 1.2010976202120776, + "learning_rate": 8.41608198266808e-06, + "loss": 0.5973, + "step": 
5536 + }, + { + "epoch": 0.6803047057378057, + "grad_norm": 1.1251202752306608, + "learning_rate": 8.410302128866253e-06, + "loss": 0.6645, + "step": 5537 + }, + { + "epoch": 0.6804275709546628, + "grad_norm": 1.173138914413587, + "learning_rate": 8.40452348719661e-06, + "loss": 0.5466, + "step": 5538 + }, + { + "epoch": 0.6805504361715199, + "grad_norm": 2.128310953377174, + "learning_rate": 8.3987460587221e-06, + "loss": 0.7182, + "step": 5539 + }, + { + "epoch": 0.680673301388377, + "grad_norm": 1.1940472057762614, + "learning_rate": 8.392969844505441e-06, + "loss": 0.5298, + "step": 5540 + }, + { + "epoch": 0.680796166605234, + "grad_norm": 1.0625782469786935, + "learning_rate": 8.387194845609134e-06, + "loss": 0.5711, + "step": 5541 + }, + { + "epoch": 0.6809190318220911, + "grad_norm": 1.1260033162505472, + "learning_rate": 8.381421063095447e-06, + "loss": 0.574, + "step": 5542 + }, + { + "epoch": 0.6810418970389482, + "grad_norm": 1.210223282494203, + "learning_rate": 8.375648498026431e-06, + "loss": 0.6931, + "step": 5543 + }, + { + "epoch": 0.6811647622558054, + "grad_norm": 1.1803403885718766, + "learning_rate": 8.36987715146392e-06, + "loss": 0.584, + "step": 5544 + }, + { + "epoch": 0.6812876274726625, + "grad_norm": 1.1048173550874438, + "learning_rate": 8.364107024469502e-06, + "loss": 0.5232, + "step": 5545 + }, + { + "epoch": 0.6814104926895196, + "grad_norm": 1.5219922021722194, + "learning_rate": 8.358338118104568e-06, + "loss": 0.5957, + "step": 5546 + }, + { + "epoch": 0.6815333579063767, + "grad_norm": 1.320818601571406, + "learning_rate": 8.352570433430254e-06, + "loss": 0.6868, + "step": 5547 + }, + { + "epoch": 0.6816562231232338, + "grad_norm": 1.285050333704961, + "learning_rate": 8.346803971507508e-06, + "loss": 0.567, + "step": 5548 + }, + { + "epoch": 0.6817790883400909, + "grad_norm": 1.2246286448129153, + "learning_rate": 8.34103873339702e-06, + "loss": 0.4756, + "step": 5549 + }, + { + "epoch": 0.681901953556948, + "grad_norm": 1.1277562510119499, + "learning_rate": 8.335274720159279e-06, + "loss": 0.4999, + "step": 5550 + }, + { + "epoch": 0.6820248187738052, + "grad_norm": 1.1962519854302565, + "learning_rate": 8.329511932854517e-06, + "loss": 0.5536, + "step": 5551 + }, + { + "epoch": 0.6821476839906623, + "grad_norm": 1.0034817331854886, + "learning_rate": 8.323750372542788e-06, + "loss": 0.5846, + "step": 5552 + }, + { + "epoch": 0.6822705492075194, + "grad_norm": 1.2922563017629383, + "learning_rate": 8.317990040283876e-06, + "loss": 0.559, + "step": 5553 + }, + { + "epoch": 0.6823934144243765, + "grad_norm": 1.2073993075850846, + "learning_rate": 8.312230937137365e-06, + "loss": 0.4939, + "step": 5554 + }, + { + "epoch": 0.6825162796412335, + "grad_norm": 1.4006924605366056, + "learning_rate": 8.306473064162597e-06, + "loss": 0.5976, + "step": 5555 + }, + { + "epoch": 0.6826391448580906, + "grad_norm": 1.5388442969479064, + "learning_rate": 8.300716422418699e-06, + "loss": 0.6163, + "step": 5556 + }, + { + "epoch": 0.6827620100749477, + "grad_norm": 1.1840074619920913, + "learning_rate": 8.294961012964576e-06, + "loss": 0.6908, + "step": 5557 + }, + { + "epoch": 0.6828848752918049, + "grad_norm": 1.2176931324047124, + "learning_rate": 8.289206836858879e-06, + "loss": 0.5868, + "step": 5558 + }, + { + "epoch": 0.683007740508662, + "grad_norm": 1.2865768154230015, + "learning_rate": 8.283453895160075e-06, + "loss": 0.6198, + "step": 5559 + }, + { + "epoch": 0.6831306057255191, + "grad_norm": 1.439467324645684, + "learning_rate": 8.277702188926363e-06, + 
"loss": 0.5093, + "step": 5560 + }, + { + "epoch": 0.6832534709423762, + "grad_norm": 1.7064875821353878, + "learning_rate": 8.27195171921574e-06, + "loss": 0.6985, + "step": 5561 + }, + { + "epoch": 0.6833763361592333, + "grad_norm": 1.0512876481173994, + "learning_rate": 8.266202487085964e-06, + "loss": 0.5387, + "step": 5562 + }, + { + "epoch": 0.6834992013760904, + "grad_norm": 1.3029182507694035, + "learning_rate": 8.26045449359457e-06, + "loss": 0.4758, + "step": 5563 + }, + { + "epoch": 0.6836220665929476, + "grad_norm": 1.3361889910948663, + "learning_rate": 8.25470773979887e-06, + "loss": 0.5424, + "step": 5564 + }, + { + "epoch": 0.6837449318098047, + "grad_norm": 1.4569326593928469, + "learning_rate": 8.248962226755929e-06, + "loss": 0.5452, + "step": 5565 + }, + { + "epoch": 0.6838677970266618, + "grad_norm": 1.3843190197186312, + "learning_rate": 8.243217955522605e-06, + "loss": 0.6343, + "step": 5566 + }, + { + "epoch": 0.6839906622435189, + "grad_norm": 1.2034589946111576, + "learning_rate": 8.237474927155517e-06, + "loss": 0.5918, + "step": 5567 + }, + { + "epoch": 0.684113527460376, + "grad_norm": 1.1136777457382039, + "learning_rate": 8.23173314271107e-06, + "loss": 0.4681, + "step": 5568 + }, + { + "epoch": 0.6842363926772331, + "grad_norm": 1.1192514033973737, + "learning_rate": 8.225992603245408e-06, + "loss": 0.6661, + "step": 5569 + }, + { + "epoch": 0.6843592578940901, + "grad_norm": 1.3525239802951745, + "learning_rate": 8.220253309814479e-06, + "loss": 0.5483, + "step": 5570 + }, + { + "epoch": 0.6844821231109472, + "grad_norm": 1.2447744865787258, + "learning_rate": 8.214515263473983e-06, + "loss": 0.5215, + "step": 5571 + }, + { + "epoch": 0.6846049883278044, + "grad_norm": 1.0101377397556024, + "learning_rate": 8.208778465279404e-06, + "loss": 0.5404, + "step": 5572 + }, + { + "epoch": 0.6847278535446615, + "grad_norm": 1.1733425202897443, + "learning_rate": 8.203042916285977e-06, + "loss": 0.5899, + "step": 5573 + }, + { + "epoch": 0.6848507187615186, + "grad_norm": 1.3411169806208167, + "learning_rate": 8.19730861754873e-06, + "loss": 0.6826, + "step": 5574 + }, + { + "epoch": 0.6849735839783757, + "grad_norm": 1.160720658895645, + "learning_rate": 8.191575570122449e-06, + "loss": 0.6242, + "step": 5575 + }, + { + "epoch": 0.6850964491952328, + "grad_norm": 1.2230761480334023, + "learning_rate": 8.185843775061682e-06, + "loss": 0.4872, + "step": 5576 + }, + { + "epoch": 0.6852193144120899, + "grad_norm": 1.1131783110638138, + "learning_rate": 8.180113233420761e-06, + "loss": 0.5275, + "step": 5577 + }, + { + "epoch": 0.6853421796289471, + "grad_norm": 1.3779542346414455, + "learning_rate": 8.174383946253783e-06, + "loss": 0.6241, + "step": 5578 + }, + { + "epoch": 0.6854650448458042, + "grad_norm": 1.2055960405366897, + "learning_rate": 8.168655914614617e-06, + "loss": 0.6009, + "step": 5579 + }, + { + "epoch": 0.6855879100626613, + "grad_norm": 1.545296065187811, + "learning_rate": 8.162929139556888e-06, + "loss": 0.5998, + "step": 5580 + }, + { + "epoch": 0.6857107752795184, + "grad_norm": 1.226098725673508, + "learning_rate": 8.157203622134004e-06, + "loss": 0.5529, + "step": 5581 + }, + { + "epoch": 0.6858336404963755, + "grad_norm": 1.3681611977348092, + "learning_rate": 8.151479363399143e-06, + "loss": 0.7222, + "step": 5582 + }, + { + "epoch": 0.6859565057132326, + "grad_norm": 0.9531265814707083, + "learning_rate": 8.14575636440523e-06, + "loss": 0.5812, + "step": 5583 + }, + { + "epoch": 0.6860793709300896, + "grad_norm": 1.2123401020489009, + 
"learning_rate": 8.14003462620499e-06, + "loss": 0.545, + "step": 5584 + }, + { + "epoch": 0.6862022361469468, + "grad_norm": 1.1502294049958293, + "learning_rate": 8.134314149850882e-06, + "loss": 0.5276, + "step": 5585 + }, + { + "epoch": 0.6863251013638039, + "grad_norm": 1.0245394659924127, + "learning_rate": 8.12859493639517e-06, + "loss": 0.586, + "step": 5586 + }, + { + "epoch": 0.686447966580661, + "grad_norm": 1.1218356256516622, + "learning_rate": 8.122876986889853e-06, + "loss": 0.6042, + "step": 5587 + }, + { + "epoch": 0.6865708317975181, + "grad_norm": 1.1488006920021612, + "learning_rate": 8.117160302386718e-06, + "loss": 0.671, + "step": 5588 + }, + { + "epoch": 0.6866936970143752, + "grad_norm": 1.4114610304726096, + "learning_rate": 8.111444883937299e-06, + "loss": 0.608, + "step": 5589 + }, + { + "epoch": 0.6868165622312323, + "grad_norm": 1.6530615992788658, + "learning_rate": 8.105730732592931e-06, + "loss": 0.7546, + "step": 5590 + }, + { + "epoch": 0.6869394274480894, + "grad_norm": 1.2015194981633994, + "learning_rate": 8.100017849404677e-06, + "loss": 0.5514, + "step": 5591 + }, + { + "epoch": 0.6870622926649466, + "grad_norm": 1.1175556655429864, + "learning_rate": 8.094306235423398e-06, + "loss": 0.6526, + "step": 5592 + }, + { + "epoch": 0.6871851578818037, + "grad_norm": 1.071196172916086, + "learning_rate": 8.088595891699695e-06, + "loss": 0.5027, + "step": 5593 + }, + { + "epoch": 0.6873080230986608, + "grad_norm": 1.200813024421294, + "learning_rate": 8.082886819283958e-06, + "loss": 0.5855, + "step": 5594 + }, + { + "epoch": 0.6874308883155179, + "grad_norm": 1.2887640013516184, + "learning_rate": 8.077179019226335e-06, + "loss": 0.6348, + "step": 5595 + }, + { + "epoch": 0.687553753532375, + "grad_norm": 1.0971653060804116, + "learning_rate": 8.07147249257673e-06, + "loss": 0.5542, + "step": 5596 + }, + { + "epoch": 0.6876766187492321, + "grad_norm": 1.414093053391415, + "learning_rate": 8.06576724038483e-06, + "loss": 0.4957, + "step": 5597 + }, + { + "epoch": 0.6877994839660893, + "grad_norm": 1.0734360410474622, + "learning_rate": 8.060063263700074e-06, + "loss": 0.6226, + "step": 5598 + }, + { + "epoch": 0.6879223491829463, + "grad_norm": 1.4798562839374585, + "learning_rate": 8.054360563571678e-06, + "loss": 0.5104, + "step": 5599 + }, + { + "epoch": 0.6880452143998034, + "grad_norm": 1.0734530589256912, + "learning_rate": 8.048659141048608e-06, + "loss": 0.6523, + "step": 5600 + }, + { + "epoch": 0.6881680796166605, + "grad_norm": 1.1912903484787905, + "learning_rate": 8.042958997179608e-06, + "loss": 0.6118, + "step": 5601 + }, + { + "epoch": 0.6882909448335176, + "grad_norm": 1.2210127107145723, + "learning_rate": 8.037260133013188e-06, + "loss": 0.6108, + "step": 5602 + }, + { + "epoch": 0.6884138100503747, + "grad_norm": 1.3002063813248594, + "learning_rate": 8.031562549597606e-06, + "loss": 0.5417, + "step": 5603 + }, + { + "epoch": 0.6885366752672318, + "grad_norm": 1.280428016246187, + "learning_rate": 8.025866247980902e-06, + "loss": 0.5671, + "step": 5604 + }, + { + "epoch": 0.688659540484089, + "grad_norm": 1.107851970800824, + "learning_rate": 8.02017122921087e-06, + "loss": 0.6289, + "step": 5605 + }, + { + "epoch": 0.6887824057009461, + "grad_norm": 1.1487124316964399, + "learning_rate": 8.014477494335082e-06, + "loss": 0.6825, + "step": 5606 + }, + { + "epoch": 0.6889052709178032, + "grad_norm": 1.1191892473667016, + "learning_rate": 8.00878504440085e-06, + "loss": 0.6639, + "step": 5607 + }, + { + "epoch": 0.6890281361346603, + 
"grad_norm": 1.1693597619594591, + "learning_rate": 8.00309388045527e-06, + "loss": 0.5273, + "step": 5608 + }, + { + "epoch": 0.6891510013515174, + "grad_norm": 1.6881721659848923, + "learning_rate": 7.997404003545195e-06, + "loss": 0.7388, + "step": 5609 + }, + { + "epoch": 0.6892738665683745, + "grad_norm": 1.1408695147584327, + "learning_rate": 7.991715414717246e-06, + "loss": 0.6015, + "step": 5610 + }, + { + "epoch": 0.6893967317852316, + "grad_norm": 1.1458268126484092, + "learning_rate": 7.986028115017788e-06, + "loss": 0.5406, + "step": 5611 + }, + { + "epoch": 0.6895195970020888, + "grad_norm": 1.2282994310813244, + "learning_rate": 7.980342105492973e-06, + "loss": 0.5691, + "step": 5612 + }, + { + "epoch": 0.6896424622189458, + "grad_norm": 1.5720000134221044, + "learning_rate": 7.97465738718871e-06, + "loss": 0.5657, + "step": 5613 + }, + { + "epoch": 0.6897653274358029, + "grad_norm": 1.1929345407989702, + "learning_rate": 7.968973961150653e-06, + "loss": 0.6578, + "step": 5614 + }, + { + "epoch": 0.68988819265266, + "grad_norm": 1.2889740515628836, + "learning_rate": 7.963291828424242e-06, + "loss": 0.4608, + "step": 5615 + }, + { + "epoch": 0.6900110578695171, + "grad_norm": 1.3341800463658702, + "learning_rate": 7.957610990054654e-06, + "loss": 0.5959, + "step": 5616 + }, + { + "epoch": 0.6901339230863742, + "grad_norm": 1.1426823184838397, + "learning_rate": 7.951931447086864e-06, + "loss": 0.5376, + "step": 5617 + }, + { + "epoch": 0.6902567883032313, + "grad_norm": 1.1712215859178003, + "learning_rate": 7.946253200565572e-06, + "loss": 0.6444, + "step": 5618 + }, + { + "epoch": 0.6903796535200885, + "grad_norm": 1.3190229299768048, + "learning_rate": 7.940576251535264e-06, + "loss": 0.6519, + "step": 5619 + }, + { + "epoch": 0.6905025187369456, + "grad_norm": 1.149601381954899, + "learning_rate": 7.934900601040165e-06, + "loss": 0.5959, + "step": 5620 + }, + { + "epoch": 0.6906253839538027, + "grad_norm": 1.0909571148045023, + "learning_rate": 7.929226250124284e-06, + "loss": 0.4734, + "step": 5621 + }, + { + "epoch": 0.6907482491706598, + "grad_norm": 1.3306854707555564, + "learning_rate": 7.923553199831384e-06, + "loss": 0.5728, + "step": 5622 + }, + { + "epoch": 0.6908711143875169, + "grad_norm": 1.4063838822256967, + "learning_rate": 7.917881451204966e-06, + "loss": 0.6308, + "step": 5623 + }, + { + "epoch": 0.690993979604374, + "grad_norm": 1.3375804822080501, + "learning_rate": 7.912211005288342e-06, + "loss": 0.6496, + "step": 5624 + }, + { + "epoch": 0.6911168448212311, + "grad_norm": 1.2387419715284353, + "learning_rate": 7.906541863124529e-06, + "loss": 0.48, + "step": 5625 + }, + { + "epoch": 0.6912397100380883, + "grad_norm": 1.0661735329035589, + "learning_rate": 7.900874025756344e-06, + "loss": 0.5583, + "step": 5626 + }, + { + "epoch": 0.6913625752549454, + "grad_norm": 1.4097554021465466, + "learning_rate": 7.895207494226338e-06, + "loss": 0.5354, + "step": 5627 + }, + { + "epoch": 0.6914854404718024, + "grad_norm": 1.0698890036713267, + "learning_rate": 7.889542269576836e-06, + "loss": 0.6071, + "step": 5628 + }, + { + "epoch": 0.6916083056886595, + "grad_norm": 1.1837792820203894, + "learning_rate": 7.883878352849925e-06, + "loss": 0.6039, + "step": 5629 + }, + { + "epoch": 0.6917311709055166, + "grad_norm": 1.0125046671733573, + "learning_rate": 7.878215745087438e-06, + "loss": 0.5037, + "step": 5630 + }, + { + "epoch": 0.6918540361223737, + "grad_norm": 1.1443607297834595, + "learning_rate": 7.872554447330977e-06, + "loss": 0.5463, + "step": 5631 + }, 
+ { + "epoch": 0.6919769013392308, + "grad_norm": 1.4492294499218195, + "learning_rate": 7.866894460621903e-06, + "loss": 0.7272, + "step": 5632 + }, + { + "epoch": 0.692099766556088, + "grad_norm": 1.507517307972361, + "learning_rate": 7.861235786001338e-06, + "loss": 0.6818, + "step": 5633 + }, + { + "epoch": 0.6922226317729451, + "grad_norm": 1.2658210319837337, + "learning_rate": 7.855578424510146e-06, + "loss": 0.4878, + "step": 5634 + }, + { + "epoch": 0.6923454969898022, + "grad_norm": 1.1471634350272175, + "learning_rate": 7.849922377188973e-06, + "loss": 0.5784, + "step": 5635 + }, + { + "epoch": 0.6924683622066593, + "grad_norm": 1.1054239611305727, + "learning_rate": 7.844267645078209e-06, + "loss": 0.6171, + "step": 5636 + }, + { + "epoch": 0.6925912274235164, + "grad_norm": 1.3275223387186565, + "learning_rate": 7.83861422921801e-06, + "loss": 0.6005, + "step": 5637 + }, + { + "epoch": 0.6927140926403735, + "grad_norm": 1.1113006065346165, + "learning_rate": 7.832962130648273e-06, + "loss": 0.5697, + "step": 5638 + }, + { + "epoch": 0.6928369578572307, + "grad_norm": 1.3695552852779618, + "learning_rate": 7.827311350408674e-06, + "loss": 0.6664, + "step": 5639 + }, + { + "epoch": 0.6929598230740878, + "grad_norm": 1.0935866027567274, + "learning_rate": 7.821661889538641e-06, + "loss": 0.5187, + "step": 5640 + }, + { + "epoch": 0.6930826882909449, + "grad_norm": 1.1679051292258025, + "learning_rate": 7.816013749077344e-06, + "loss": 0.5086, + "step": 5641 + }, + { + "epoch": 0.6932055535078019, + "grad_norm": 1.2047314097416144, + "learning_rate": 7.810366930063729e-06, + "loss": 0.6281, + "step": 5642 + }, + { + "epoch": 0.693328418724659, + "grad_norm": 1.1345764296002423, + "learning_rate": 7.80472143353649e-06, + "loss": 0.5801, + "step": 5643 + }, + { + "epoch": 0.6934512839415161, + "grad_norm": 1.0867907048973902, + "learning_rate": 7.799077260534085e-06, + "loss": 0.5474, + "step": 5644 + }, + { + "epoch": 0.6935741491583732, + "grad_norm": 1.1866949359263121, + "learning_rate": 7.793434412094714e-06, + "loss": 0.4608, + "step": 5645 + }, + { + "epoch": 0.6936970143752303, + "grad_norm": 1.0256636615153212, + "learning_rate": 7.787792889256347e-06, + "loss": 0.5405, + "step": 5646 + }, + { + "epoch": 0.6938198795920875, + "grad_norm": 1.2885703812221854, + "learning_rate": 7.782152693056711e-06, + "loss": 0.6253, + "step": 5647 + }, + { + "epoch": 0.6939427448089446, + "grad_norm": 1.3821879314000147, + "learning_rate": 7.776513824533272e-06, + "loss": 0.6429, + "step": 5648 + }, + { + "epoch": 0.6940656100258017, + "grad_norm": 1.0736786957674584, + "learning_rate": 7.770876284723272e-06, + "loss": 0.6469, + "step": 5649 + }, + { + "epoch": 0.6941884752426588, + "grad_norm": 1.1301913789275426, + "learning_rate": 7.765240074663689e-06, + "loss": 0.5696, + "step": 5650 + }, + { + "epoch": 0.6943113404595159, + "grad_norm": 1.2614628451929968, + "learning_rate": 7.759605195391285e-06, + "loss": 0.5564, + "step": 5651 + }, + { + "epoch": 0.694434205676373, + "grad_norm": 1.2454094822044728, + "learning_rate": 7.753971647942543e-06, + "loss": 0.5206, + "step": 5652 + }, + { + "epoch": 0.6945570708932302, + "grad_norm": 1.2479422071068338, + "learning_rate": 7.748339433353731e-06, + "loss": 0.5159, + "step": 5653 + }, + { + "epoch": 0.6946799361100873, + "grad_norm": 1.092935019326991, + "learning_rate": 7.74270855266084e-06, + "loss": 0.6027, + "step": 5654 + }, + { + "epoch": 0.6948028013269444, + "grad_norm": 1.2107553984811215, + "learning_rate": 7.737079006899658e-06, 
+ "loss": 0.5776, + "step": 5655 + }, + { + "epoch": 0.6949256665438015, + "grad_norm": 1.2042640984228694, + "learning_rate": 7.731450797105687e-06, + "loss": 0.5394, + "step": 5656 + }, + { + "epoch": 0.6950485317606585, + "grad_norm": 1.1758020142593968, + "learning_rate": 7.725823924314203e-06, + "loss": 0.545, + "step": 5657 + }, + { + "epoch": 0.6951713969775156, + "grad_norm": 1.3310976546154771, + "learning_rate": 7.720198389560233e-06, + "loss": 0.6924, + "step": 5658 + }, + { + "epoch": 0.6952942621943727, + "grad_norm": 1.0256778764811518, + "learning_rate": 7.714574193878557e-06, + "loss": 0.5266, + "step": 5659 + }, + { + "epoch": 0.6954171274112299, + "grad_norm": 1.1561607465014199, + "learning_rate": 7.708951338303715e-06, + "loss": 0.5359, + "step": 5660 + }, + { + "epoch": 0.695539992628087, + "grad_norm": 1.5261346983439317, + "learning_rate": 7.703329823869987e-06, + "loss": 0.5763, + "step": 5661 + }, + { + "epoch": 0.6956628578449441, + "grad_norm": 1.3941145863416418, + "learning_rate": 7.697709651611415e-06, + "loss": 0.6436, + "step": 5662 + }, + { + "epoch": 0.6957857230618012, + "grad_norm": 1.1354969693506516, + "learning_rate": 7.692090822561796e-06, + "loss": 0.5279, + "step": 5663 + }, + { + "epoch": 0.6959085882786583, + "grad_norm": 1.2967865765940443, + "learning_rate": 7.686473337754682e-06, + "loss": 0.5861, + "step": 5664 + }, + { + "epoch": 0.6960314534955154, + "grad_norm": 1.1111375791308593, + "learning_rate": 7.680857198223364e-06, + "loss": 0.6005, + "step": 5665 + }, + { + "epoch": 0.6961543187123725, + "grad_norm": 1.2470828550503483, + "learning_rate": 7.675242405000896e-06, + "loss": 0.6591, + "step": 5666 + }, + { + "epoch": 0.6962771839292297, + "grad_norm": 1.3823013567306917, + "learning_rate": 7.66962895912009e-06, + "loss": 0.5917, + "step": 5667 + }, + { + "epoch": 0.6964000491460868, + "grad_norm": 1.1512067943788893, + "learning_rate": 7.664016861613495e-06, + "loss": 0.5543, + "step": 5668 + }, + { + "epoch": 0.6965229143629439, + "grad_norm": 1.1769896891607714, + "learning_rate": 7.65840611351342e-06, + "loss": 0.4839, + "step": 5669 + }, + { + "epoch": 0.696645779579801, + "grad_norm": 1.295673789436105, + "learning_rate": 7.65279671585193e-06, + "loss": 0.6511, + "step": 5670 + }, + { + "epoch": 0.6967686447966581, + "grad_norm": 1.0905549751060228, + "learning_rate": 7.647188669660842e-06, + "loss": 0.511, + "step": 5671 + }, + { + "epoch": 0.6968915100135151, + "grad_norm": 1.2038976994560286, + "learning_rate": 7.641581975971705e-06, + "loss": 0.6512, + "step": 5672 + }, + { + "epoch": 0.6970143752303722, + "grad_norm": 1.3976966227036185, + "learning_rate": 7.635976635815845e-06, + "loss": 0.5718, + "step": 5673 + }, + { + "epoch": 0.6971372404472294, + "grad_norm": 1.1285535705847112, + "learning_rate": 7.630372650224326e-06, + "loss": 0.5627, + "step": 5674 + }, + { + "epoch": 0.6972601056640865, + "grad_norm": 1.272360168612306, + "learning_rate": 7.624770020227968e-06, + "loss": 0.6512, + "step": 5675 + }, + { + "epoch": 0.6973829708809436, + "grad_norm": 1.2633200639653437, + "learning_rate": 7.619168746857331e-06, + "loss": 0.6078, + "step": 5676 + }, + { + "epoch": 0.6975058360978007, + "grad_norm": 0.9865428383733277, + "learning_rate": 7.6135688311427364e-06, + "loss": 0.5448, + "step": 5677 + }, + { + "epoch": 0.6976287013146578, + "grad_norm": 1.079019876695279, + "learning_rate": 7.607970274114257e-06, + "loss": 0.6267, + "step": 5678 + }, + { + "epoch": 0.6977515665315149, + "grad_norm": 1.1987782607368782, + 
"learning_rate": 7.602373076801701e-06, + "loss": 0.611, + "step": 5679 + }, + { + "epoch": 0.697874431748372, + "grad_norm": 1.250032472940683, + "learning_rate": 7.596777240234649e-06, + "loss": 0.5383, + "step": 5680 + }, + { + "epoch": 0.6979972969652292, + "grad_norm": 1.273125630750744, + "learning_rate": 7.5911827654424005e-06, + "loss": 0.6805, + "step": 5681 + }, + { + "epoch": 0.6981201621820863, + "grad_norm": 1.2105492561606177, + "learning_rate": 7.585589653454045e-06, + "loss": 0.7155, + "step": 5682 + }, + { + "epoch": 0.6982430273989434, + "grad_norm": 1.25057798880049, + "learning_rate": 7.579997905298382e-06, + "loss": 0.5783, + "step": 5683 + }, + { + "epoch": 0.6983658926158005, + "grad_norm": 1.4169305899390214, + "learning_rate": 7.574407522003988e-06, + "loss": 0.6321, + "step": 5684 + }, + { + "epoch": 0.6984887578326576, + "grad_norm": 1.077743644185033, + "learning_rate": 7.568818504599175e-06, + "loss": 0.52, + "step": 5685 + }, + { + "epoch": 0.6986116230495146, + "grad_norm": 1.2882541657759443, + "learning_rate": 7.563230854112002e-06, + "loss": 0.6798, + "step": 5686 + }, + { + "epoch": 0.6987344882663717, + "grad_norm": 1.1307087000224498, + "learning_rate": 7.557644571570289e-06, + "loss": 0.5788, + "step": 5687 + }, + { + "epoch": 0.6988573534832289, + "grad_norm": 1.0912600519475106, + "learning_rate": 7.55205965800158e-06, + "loss": 0.5221, + "step": 5688 + }, + { + "epoch": 0.698980218700086, + "grad_norm": 1.3111601572461575, + "learning_rate": 7.5464761144332074e-06, + "loss": 0.544, + "step": 5689 + }, + { + "epoch": 0.6991030839169431, + "grad_norm": 1.609823079301775, + "learning_rate": 7.5408939418922095e-06, + "loss": 0.7466, + "step": 5690 + }, + { + "epoch": 0.6992259491338002, + "grad_norm": 1.161310037813083, + "learning_rate": 7.5353131414054025e-06, + "loss": 0.5927, + "step": 5691 + }, + { + "epoch": 0.6993488143506573, + "grad_norm": 1.2913202148830971, + "learning_rate": 7.529733713999323e-06, + "loss": 0.5768, + "step": 5692 + }, + { + "epoch": 0.6994716795675144, + "grad_norm": 1.1795172187584488, + "learning_rate": 7.52415566070029e-06, + "loss": 0.654, + "step": 5693 + }, + { + "epoch": 0.6995945447843716, + "grad_norm": 1.1598207260919409, + "learning_rate": 7.518578982534336e-06, + "loss": 0.71, + "step": 5694 + }, + { + "epoch": 0.6997174100012287, + "grad_norm": 1.1637059942482801, + "learning_rate": 7.513003680527265e-06, + "loss": 0.4501, + "step": 5695 + }, + { + "epoch": 0.6998402752180858, + "grad_norm": 1.0356150086496085, + "learning_rate": 7.507429755704606e-06, + "loss": 0.5652, + "step": 5696 + }, + { + "epoch": 0.6999631404349429, + "grad_norm": 1.3385933889742565, + "learning_rate": 7.5018572090916526e-06, + "loss": 0.5309, + "step": 5697 + }, + { + "epoch": 0.7000860056518, + "grad_norm": 1.3891938941138404, + "learning_rate": 7.496286041713444e-06, + "loss": 0.5656, + "step": 5698 + }, + { + "epoch": 0.7002088708686571, + "grad_norm": 1.162934908193538, + "learning_rate": 7.490716254594751e-06, + "loss": 0.6868, + "step": 5699 + }, + { + "epoch": 0.7003317360855142, + "grad_norm": 1.425167790116577, + "learning_rate": 7.485147848760102e-06, + "loss": 0.6007, + "step": 5700 + }, + { + "epoch": 0.7004546013023712, + "grad_norm": 1.3785772115628128, + "learning_rate": 7.47958082523377e-06, + "loss": 0.5679, + "step": 5701 + }, + { + "epoch": 0.7005774665192284, + "grad_norm": 1.6163685702014767, + "learning_rate": 7.47401518503978e-06, + "loss": 0.5532, + "step": 5702 + }, + { + "epoch": 0.7007003317360855, + 
"grad_norm": 1.296126857595835, + "learning_rate": 7.468450929201882e-06, + "loss": 0.6498, + "step": 5703 + }, + { + "epoch": 0.7008231969529426, + "grad_norm": 1.4082813184421166, + "learning_rate": 7.462888058743593e-06, + "loss": 0.6941, + "step": 5704 + }, + { + "epoch": 0.7009460621697997, + "grad_norm": 1.3587331734418637, + "learning_rate": 7.457326574688172e-06, + "loss": 0.5115, + "step": 5705 + }, + { + "epoch": 0.7010689273866568, + "grad_norm": 1.0856302316312183, + "learning_rate": 7.451766478058605e-06, + "loss": 0.6778, + "step": 5706 + }, + { + "epoch": 0.7011917926035139, + "grad_norm": 1.1182188276308027, + "learning_rate": 7.446207769877642e-06, + "loss": 0.5694, + "step": 5707 + }, + { + "epoch": 0.7013146578203711, + "grad_norm": 1.274010209326199, + "learning_rate": 7.440650451167772e-06, + "loss": 0.5606, + "step": 5708 + }, + { + "epoch": 0.7014375230372282, + "grad_norm": 1.3152318270549028, + "learning_rate": 7.435094522951234e-06, + "loss": 0.5273, + "step": 5709 + }, + { + "epoch": 0.7015603882540853, + "grad_norm": 1.0797431714811438, + "learning_rate": 7.429539986249992e-06, + "loss": 0.6372, + "step": 5710 + }, + { + "epoch": 0.7016832534709424, + "grad_norm": 1.480381388069687, + "learning_rate": 7.423986842085774e-06, + "loss": 0.5983, + "step": 5711 + }, + { + "epoch": 0.7018061186877995, + "grad_norm": 1.3506365482903868, + "learning_rate": 7.4184350914800435e-06, + "loss": 0.6559, + "step": 5712 + }, + { + "epoch": 0.7019289839046566, + "grad_norm": 1.266889336902524, + "learning_rate": 7.412884735454016e-06, + "loss": 0.5382, + "step": 5713 + }, + { + "epoch": 0.7020518491215137, + "grad_norm": 1.1932786930559602, + "learning_rate": 7.407335775028631e-06, + "loss": 0.5765, + "step": 5714 + }, + { + "epoch": 0.7021747143383708, + "grad_norm": 1.2348856560985757, + "learning_rate": 7.401788211224589e-06, + "loss": 0.6038, + "step": 5715 + }, + { + "epoch": 0.7022975795552279, + "grad_norm": 1.2137744630007445, + "learning_rate": 7.396242045062336e-06, + "loss": 0.5517, + "step": 5716 + }, + { + "epoch": 0.702420444772085, + "grad_norm": 1.2903470516336304, + "learning_rate": 7.3906972775620415e-06, + "loss": 0.5516, + "step": 5717 + }, + { + "epoch": 0.7025433099889421, + "grad_norm": 1.3303982605415705, + "learning_rate": 7.385153909743641e-06, + "loss": 0.5255, + "step": 5718 + }, + { + "epoch": 0.7026661752057992, + "grad_norm": 1.0508281893475862, + "learning_rate": 7.3796119426267815e-06, + "loss": 0.7319, + "step": 5719 + }, + { + "epoch": 0.7027890404226563, + "grad_norm": 1.0458664995896987, + "learning_rate": 7.374071377230898e-06, + "loss": 0.534, + "step": 5720 + }, + { + "epoch": 0.7029119056395134, + "grad_norm": 1.0855774192859997, + "learning_rate": 7.3685322145751235e-06, + "loss": 0.6294, + "step": 5721 + }, + { + "epoch": 0.7030347708563706, + "grad_norm": 1.0757897668175076, + "learning_rate": 7.36299445567836e-06, + "loss": 0.6078, + "step": 5722 + }, + { + "epoch": 0.7031576360732277, + "grad_norm": 1.1357546924440463, + "learning_rate": 7.3574581015592355e-06, + "loss": 0.6748, + "step": 5723 + }, + { + "epoch": 0.7032805012900848, + "grad_norm": 1.1450350702581613, + "learning_rate": 7.351923153236128e-06, + "loss": 0.5483, + "step": 5724 + }, + { + "epoch": 0.7034033665069419, + "grad_norm": 1.1949006761202412, + "learning_rate": 7.346389611727163e-06, + "loss": 0.544, + "step": 5725 + }, + { + "epoch": 0.703526231723799, + "grad_norm": 1.348417226763605, + "learning_rate": 7.340857478050183e-06, + "loss": 0.648, + "step": 5726 + 
}, + { + "epoch": 0.7036490969406561, + "grad_norm": 1.570794600174266, + "learning_rate": 7.335326753222808e-06, + "loss": 0.6631, + "step": 5727 + }, + { + "epoch": 0.7037719621575133, + "grad_norm": 1.342669481166849, + "learning_rate": 7.329797438262366e-06, + "loss": 0.6576, + "step": 5728 + }, + { + "epoch": 0.7038948273743704, + "grad_norm": 1.1490408673632881, + "learning_rate": 7.324269534185947e-06, + "loss": 0.5541, + "step": 5729 + }, + { + "epoch": 0.7040176925912274, + "grad_norm": 1.1310173866604216, + "learning_rate": 7.318743042010361e-06, + "loss": 0.5324, + "step": 5730 + }, + { + "epoch": 0.7041405578080845, + "grad_norm": 1.290339168145589, + "learning_rate": 7.313217962752179e-06, + "loss": 0.5413, + "step": 5731 + }, + { + "epoch": 0.7042634230249416, + "grad_norm": 1.1947658790443367, + "learning_rate": 7.307694297427704e-06, + "loss": 0.6342, + "step": 5732 + }, + { + "epoch": 0.7043862882417987, + "grad_norm": 1.3069075764237337, + "learning_rate": 7.3021720470529794e-06, + "loss": 0.5293, + "step": 5733 + }, + { + "epoch": 0.7045091534586558, + "grad_norm": 1.4767806045932323, + "learning_rate": 7.296651212643781e-06, + "loss": 0.5251, + "step": 5734 + }, + { + "epoch": 0.704632018675513, + "grad_norm": 1.08301307920495, + "learning_rate": 7.291131795215632e-06, + "loss": 0.6573, + "step": 5735 + }, + { + "epoch": 0.7047548838923701, + "grad_norm": 1.418815346852245, + "learning_rate": 7.285613795783803e-06, + "loss": 0.5266, + "step": 5736 + }, + { + "epoch": 0.7048777491092272, + "grad_norm": 1.060798681542825, + "learning_rate": 7.28009721536328e-06, + "loss": 0.5649, + "step": 5737 + }, + { + "epoch": 0.7050006143260843, + "grad_norm": 1.4179499931732644, + "learning_rate": 7.274582054968811e-06, + "loss": 0.5327, + "step": 5738 + }, + { + "epoch": 0.7051234795429414, + "grad_norm": 1.2006922451582343, + "learning_rate": 7.2690683156148705e-06, + "loss": 0.5705, + "step": 5739 + }, + { + "epoch": 0.7052463447597985, + "grad_norm": 1.453856439986463, + "learning_rate": 7.2635559983156825e-06, + "loss": 0.5847, + "step": 5740 + }, + { + "epoch": 0.7053692099766556, + "grad_norm": 1.2678633571549354, + "learning_rate": 7.258045104085189e-06, + "loss": 0.7074, + "step": 5741 + }, + { + "epoch": 0.7054920751935128, + "grad_norm": 1.2830859278039677, + "learning_rate": 7.252535633937092e-06, + "loss": 0.6406, + "step": 5742 + }, + { + "epoch": 0.7056149404103699, + "grad_norm": 1.5228588099144023, + "learning_rate": 7.247027588884825e-06, + "loss": 0.5874, + "step": 5743 + }, + { + "epoch": 0.7057378056272269, + "grad_norm": 1.050519264222237, + "learning_rate": 7.2415209699415485e-06, + "loss": 0.6125, + "step": 5744 + }, + { + "epoch": 0.705860670844084, + "grad_norm": 1.1074581223682745, + "learning_rate": 7.23601577812018e-06, + "loss": 0.6422, + "step": 5745 + }, + { + "epoch": 0.7059835360609411, + "grad_norm": 1.054443322793296, + "learning_rate": 7.2305120144333465e-06, + "loss": 0.6069, + "step": 5746 + }, + { + "epoch": 0.7061064012777982, + "grad_norm": 1.2150167351662573, + "learning_rate": 7.225009679893452e-06, + "loss": 0.6173, + "step": 5747 + }, + { + "epoch": 0.7062292664946553, + "grad_norm": 1.119401951426301, + "learning_rate": 7.2195087755125975e-06, + "loss": 0.6253, + "step": 5748 + }, + { + "epoch": 0.7063521317115125, + "grad_norm": 1.2784924724924924, + "learning_rate": 7.214009302302648e-06, + "loss": 0.6218, + "step": 5749 + }, + { + "epoch": 0.7064749969283696, + "grad_norm": 1.3532266467638157, + "learning_rate": 
7.208511261275198e-06, + "loss": 0.6088, + "step": 5750 + }, + { + "epoch": 0.7065978621452267, + "grad_norm": 1.2192633630658183, + "learning_rate": 7.203014653441567e-06, + "loss": 0.5655, + "step": 5751 + }, + { + "epoch": 0.7067207273620838, + "grad_norm": 1.3934715056550437, + "learning_rate": 7.197519479812828e-06, + "loss": 0.615, + "step": 5752 + }, + { + "epoch": 0.7068435925789409, + "grad_norm": 1.314031226393348, + "learning_rate": 7.192025741399771e-06, + "loss": 0.5845, + "step": 5753 + }, + { + "epoch": 0.706966457795798, + "grad_norm": 1.3724073012025795, + "learning_rate": 7.186533439212953e-06, + "loss": 0.5574, + "step": 5754 + }, + { + "epoch": 0.7070893230126551, + "grad_norm": 1.127366029157803, + "learning_rate": 7.181042574262633e-06, + "loss": 0.4481, + "step": 5755 + }, + { + "epoch": 0.7072121882295123, + "grad_norm": 1.3308792127641735, + "learning_rate": 7.1755531475588265e-06, + "loss": 0.5948, + "step": 5756 + }, + { + "epoch": 0.7073350534463694, + "grad_norm": 1.3394849395710051, + "learning_rate": 7.1700651601112646e-06, + "loss": 0.6052, + "step": 5757 + }, + { + "epoch": 0.7074579186632265, + "grad_norm": 1.2965800008465418, + "learning_rate": 7.16457861292945e-06, + "loss": 0.5313, + "step": 5758 + }, + { + "epoch": 0.7075807838800835, + "grad_norm": 1.3562799875478142, + "learning_rate": 7.159093507022579e-06, + "loss": 0.5599, + "step": 5759 + }, + { + "epoch": 0.7077036490969406, + "grad_norm": 1.201626183894177, + "learning_rate": 7.153609843399613e-06, + "loss": 0.5679, + "step": 5760 + }, + { + "epoch": 0.7078265143137977, + "grad_norm": 1.8863322234032065, + "learning_rate": 7.148127623069225e-06, + "loss": 0.61, + "step": 5761 + }, + { + "epoch": 0.7079493795306548, + "grad_norm": 1.160344671752403, + "learning_rate": 7.14264684703984e-06, + "loss": 0.7411, + "step": 5762 + }, + { + "epoch": 0.708072244747512, + "grad_norm": 1.559916338736381, + "learning_rate": 7.137167516319615e-06, + "loss": 0.4664, + "step": 5763 + }, + { + "epoch": 0.7081951099643691, + "grad_norm": 1.2994403490311055, + "learning_rate": 7.131689631916427e-06, + "loss": 0.4639, + "step": 5764 + }, + { + "epoch": 0.7083179751812262, + "grad_norm": 1.1796104771453266, + "learning_rate": 7.126213194837905e-06, + "loss": 0.5722, + "step": 5765 + }, + { + "epoch": 0.7084408403980833, + "grad_norm": 1.342407954801766, + "learning_rate": 7.120738206091403e-06, + "loss": 0.5607, + "step": 5766 + }, + { + "epoch": 0.7085637056149404, + "grad_norm": 1.8618112928715833, + "learning_rate": 7.115264666684013e-06, + "loss": 0.7505, + "step": 5767 + }, + { + "epoch": 0.7086865708317975, + "grad_norm": 1.2353491104981922, + "learning_rate": 7.1097925776225495e-06, + "loss": 0.5712, + "step": 5768 + }, + { + "epoch": 0.7088094360486547, + "grad_norm": 1.1026642876623363, + "learning_rate": 7.10432193991357e-06, + "loss": 0.5352, + "step": 5769 + }, + { + "epoch": 0.7089323012655118, + "grad_norm": 0.9995481099165657, + "learning_rate": 7.098852754563371e-06, + "loss": 0.5693, + "step": 5770 + }, + { + "epoch": 0.7090551664823689, + "grad_norm": 1.3572916016518013, + "learning_rate": 7.09338502257796e-06, + "loss": 0.6754, + "step": 5771 + }, + { + "epoch": 0.709178031699226, + "grad_norm": 1.6833130537807133, + "learning_rate": 7.0879187449631e-06, + "loss": 0.7323, + "step": 5772 + }, + { + "epoch": 0.7093008969160831, + "grad_norm": 1.1190422922049048, + "learning_rate": 7.082453922724275e-06, + "loss": 0.529, + "step": 5773 + }, + { + "epoch": 0.7094237621329401, + "grad_norm": 
1.4662071365643152, + "learning_rate": 7.076990556866708e-06, + "loss": 0.5821, + "step": 5774 + }, + { + "epoch": 0.7095466273497972, + "grad_norm": 1.5187844540061315, + "learning_rate": 7.0715286483953405e-06, + "loss": 0.7054, + "step": 5775 + }, + { + "epoch": 0.7096694925666543, + "grad_norm": 1.4415135956915555, + "learning_rate": 7.06606819831486e-06, + "loss": 0.7331, + "step": 5776 + }, + { + "epoch": 0.7097923577835115, + "grad_norm": 1.432305802135095, + "learning_rate": 7.060609207629682e-06, + "loss": 0.594, + "step": 5777 + }, + { + "epoch": 0.7099152230003686, + "grad_norm": 1.2004923109971903, + "learning_rate": 7.055151677343955e-06, + "loss": 0.5097, + "step": 5778 + }, + { + "epoch": 0.7100380882172257, + "grad_norm": 1.3837346391899004, + "learning_rate": 7.04969560846155e-06, + "loss": 0.7393, + "step": 5779 + }, + { + "epoch": 0.7101609534340828, + "grad_norm": 1.458954586563351, + "learning_rate": 7.044241001986076e-06, + "loss": 0.5354, + "step": 5780 + }, + { + "epoch": 0.7102838186509399, + "grad_norm": 1.1587697933268901, + "learning_rate": 7.038787858920881e-06, + "loss": 0.6123, + "step": 5781 + }, + { + "epoch": 0.710406683867797, + "grad_norm": 1.4033532077227382, + "learning_rate": 7.033336180269024e-06, + "loss": 0.5905, + "step": 5782 + }, + { + "epoch": 0.7105295490846542, + "grad_norm": 1.2716849479562329, + "learning_rate": 7.027885967033316e-06, + "loss": 0.6176, + "step": 5783 + }, + { + "epoch": 0.7106524143015113, + "grad_norm": 1.2990841617996842, + "learning_rate": 7.022437220216273e-06, + "loss": 0.5403, + "step": 5784 + }, + { + "epoch": 0.7107752795183684, + "grad_norm": 1.2625967395725193, + "learning_rate": 7.016989940820178e-06, + "loss": 0.6724, + "step": 5785 + }, + { + "epoch": 0.7108981447352255, + "grad_norm": 1.2337725191366085, + "learning_rate": 7.011544129847006e-06, + "loss": 0.6022, + "step": 5786 + }, + { + "epoch": 0.7110210099520826, + "grad_norm": 1.1749126535146508, + "learning_rate": 7.00609978829849e-06, + "loss": 0.5373, + "step": 5787 + }, + { + "epoch": 0.7111438751689396, + "grad_norm": 1.1310656804647898, + "learning_rate": 7.000656917176069e-06, + "loss": 0.6863, + "step": 5788 + }, + { + "epoch": 0.7112667403857967, + "grad_norm": 1.3237019281635043, + "learning_rate": 6.995215517480932e-06, + "loss": 0.467, + "step": 5789 + }, + { + "epoch": 0.7113896056026539, + "grad_norm": 0.9795057904179649, + "learning_rate": 6.9897755902139946e-06, + "loss": 0.5613, + "step": 5790 + }, + { + "epoch": 0.711512470819511, + "grad_norm": 1.108221594022862, + "learning_rate": 6.984337136375875e-06, + "loss": 0.6727, + "step": 5791 + }, + { + "epoch": 0.7116353360363681, + "grad_norm": 1.1322700748963865, + "learning_rate": 6.978900156966968e-06, + "loss": 0.6065, + "step": 5792 + }, + { + "epoch": 0.7117582012532252, + "grad_norm": 1.09238176404221, + "learning_rate": 6.973464652987353e-06, + "loss": 0.547, + "step": 5793 + }, + { + "epoch": 0.7118810664700823, + "grad_norm": 1.166119928883705, + "learning_rate": 6.968030625436867e-06, + "loss": 0.5626, + "step": 5794 + }, + { + "epoch": 0.7120039316869394, + "grad_norm": 1.3001472245561245, + "learning_rate": 6.962598075315047e-06, + "loss": 0.5918, + "step": 5795 + }, + { + "epoch": 0.7121267969037965, + "grad_norm": 1.1697190246044635, + "learning_rate": 6.957167003621199e-06, + "loss": 0.7425, + "step": 5796 + }, + { + "epoch": 0.7122496621206537, + "grad_norm": 1.215358827917493, + "learning_rate": 6.951737411354313e-06, + "loss": 0.6029, + "step": 5797 + }, + { + "epoch": 
0.7123725273375108, + "grad_norm": 1.139689711258494, + "learning_rate": 6.9463092995131426e-06, + "loss": 0.6305, + "step": 5798 + }, + { + "epoch": 0.7124953925543679, + "grad_norm": 1.2867420354061996, + "learning_rate": 6.94088266909614e-06, + "loss": 0.6005, + "step": 5799 + }, + { + "epoch": 0.712618257771225, + "grad_norm": 1.1015665044397294, + "learning_rate": 6.935457521101507e-06, + "loss": 0.6065, + "step": 5800 + }, + { + "epoch": 0.7127411229880821, + "grad_norm": 1.0911902802413385, + "learning_rate": 6.930033856527167e-06, + "loss": 0.6235, + "step": 5801 + }, + { + "epoch": 0.7128639882049392, + "grad_norm": 1.119956223380845, + "learning_rate": 6.9246116763707575e-06, + "loss": 0.5333, + "step": 5802 + }, + { + "epoch": 0.7129868534217962, + "grad_norm": 1.1589425896356937, + "learning_rate": 6.91919098162966e-06, + "loss": 0.6897, + "step": 5803 + }, + { + "epoch": 0.7131097186386534, + "grad_norm": 1.312514040665402, + "learning_rate": 6.913771773300975e-06, + "loss": 0.5699, + "step": 5804 + }, + { + "epoch": 0.7132325838555105, + "grad_norm": 1.2886164671113787, + "learning_rate": 6.908354052381538e-06, + "loss": 0.587, + "step": 5805 + }, + { + "epoch": 0.7133554490723676, + "grad_norm": 1.083492006467809, + "learning_rate": 6.902937819867891e-06, + "loss": 0.648, + "step": 5806 + }, + { + "epoch": 0.7134783142892247, + "grad_norm": 2.3155017166343206, + "learning_rate": 6.897523076756319e-06, + "loss": 0.7698, + "step": 5807 + }, + { + "epoch": 0.7136011795060818, + "grad_norm": 1.1904415268941864, + "learning_rate": 6.892109824042838e-06, + "loss": 0.5685, + "step": 5808 + }, + { + "epoch": 0.7137240447229389, + "grad_norm": 1.2052961582087776, + "learning_rate": 6.886698062723167e-06, + "loss": 0.5626, + "step": 5809 + }, + { + "epoch": 0.713846909939796, + "grad_norm": 1.1903740127915126, + "learning_rate": 6.881287793792777e-06, + "loss": 0.4029, + "step": 5810 + }, + { + "epoch": 0.7139697751566532, + "grad_norm": 1.2328405696389708, + "learning_rate": 6.875879018246835e-06, + "loss": 0.5794, + "step": 5811 + }, + { + "epoch": 0.7140926403735103, + "grad_norm": 1.2718028143784392, + "learning_rate": 6.87047173708027e-06, + "loss": 0.5968, + "step": 5812 + }, + { + "epoch": 0.7142155055903674, + "grad_norm": 1.4642964746635874, + "learning_rate": 6.865065951287703e-06, + "loss": 0.532, + "step": 5813 + }, + { + "epoch": 0.7143383708072245, + "grad_norm": 0.9703985059574514, + "learning_rate": 6.859661661863497e-06, + "loss": 0.611, + "step": 5814 + }, + { + "epoch": 0.7144612360240816, + "grad_norm": 1.2426170798487577, + "learning_rate": 6.854258869801736e-06, + "loss": 0.5581, + "step": 5815 + }, + { + "epoch": 0.7145841012409387, + "grad_norm": 1.15961472317541, + "learning_rate": 6.848857576096235e-06, + "loss": 0.6456, + "step": 5816 + }, + { + "epoch": 0.7147069664577957, + "grad_norm": 1.334802166167363, + "learning_rate": 6.843457781740516e-06, + "loss": 0.5513, + "step": 5817 + }, + { + "epoch": 0.7148298316746529, + "grad_norm": 1.1935085554264357, + "learning_rate": 6.83805948772784e-06, + "loss": 0.5916, + "step": 5818 + }, + { + "epoch": 0.71495269689151, + "grad_norm": 1.2401037072410983, + "learning_rate": 6.832662695051195e-06, + "loss": 0.6048, + "step": 5819 + }, + { + "epoch": 0.7150755621083671, + "grad_norm": 1.1521125544105246, + "learning_rate": 6.827267404703274e-06, + "loss": 0.6853, + "step": 5820 + }, + { + "epoch": 0.7151984273252242, + "grad_norm": 1.100269249745341, + "learning_rate": 6.821873617676519e-06, + "loss": 0.5521, + 
"step": 5821 + }, + { + "epoch": 0.7153212925420813, + "grad_norm": 1.1608308359172952, + "learning_rate": 6.816481334963061e-06, + "loss": 0.5839, + "step": 5822 + }, + { + "epoch": 0.7154441577589384, + "grad_norm": 1.2031034055225187, + "learning_rate": 6.811090557554803e-06, + "loss": 0.4529, + "step": 5823 + }, + { + "epoch": 0.7155670229757956, + "grad_norm": 1.0360813009000631, + "learning_rate": 6.805701286443323e-06, + "loss": 0.6079, + "step": 5824 + }, + { + "epoch": 0.7156898881926527, + "grad_norm": 1.0333578924081652, + "learning_rate": 6.800313522619957e-06, + "loss": 0.5528, + "step": 5825 + }, + { + "epoch": 0.7158127534095098, + "grad_norm": 1.363867745416032, + "learning_rate": 6.794927267075735e-06, + "loss": 0.5827, + "step": 5826 + }, + { + "epoch": 0.7159356186263669, + "grad_norm": 1.335096629255007, + "learning_rate": 6.7895425208014304e-06, + "loss": 0.5528, + "step": 5827 + }, + { + "epoch": 0.716058483843224, + "grad_norm": 1.3653220299525013, + "learning_rate": 6.784159284787537e-06, + "loss": 0.608, + "step": 5828 + }, + { + "epoch": 0.7161813490600811, + "grad_norm": 1.1690578325619843, + "learning_rate": 6.7787775600242575e-06, + "loss": 0.5433, + "step": 5829 + }, + { + "epoch": 0.7163042142769382, + "grad_norm": 1.3498927927132656, + "learning_rate": 6.773397347501529e-06, + "loss": 0.6958, + "step": 5830 + }, + { + "epoch": 0.7164270794937954, + "grad_norm": 1.3017941944838383, + "learning_rate": 6.768018648209008e-06, + "loss": 0.5491, + "step": 5831 + }, + { + "epoch": 0.7165499447106524, + "grad_norm": 1.029874971190032, + "learning_rate": 6.762641463136074e-06, + "loss": 0.6349, + "step": 5832 + }, + { + "epoch": 0.7166728099275095, + "grad_norm": 1.3823513046614209, + "learning_rate": 6.757265793271811e-06, + "loss": 0.6754, + "step": 5833 + }, + { + "epoch": 0.7167956751443666, + "grad_norm": 1.066391911473468, + "learning_rate": 6.7518916396050606e-06, + "loss": 0.4928, + "step": 5834 + }, + { + "epoch": 0.7169185403612237, + "grad_norm": 1.2921118397480762, + "learning_rate": 6.746519003124347e-06, + "loss": 0.6599, + "step": 5835 + }, + { + "epoch": 0.7170414055780808, + "grad_norm": 1.261324359695132, + "learning_rate": 6.7411478848179435e-06, + "loss": 0.54, + "step": 5836 + }, + { + "epoch": 0.7171642707949379, + "grad_norm": 1.2037502538260048, + "learning_rate": 6.73577828567382e-06, + "loss": 0.5765, + "step": 5837 + }, + { + "epoch": 0.7172871360117951, + "grad_norm": 1.2757813455826494, + "learning_rate": 6.730410206679684e-06, + "loss": 0.5553, + "step": 5838 + }, + { + "epoch": 0.7174100012286522, + "grad_norm": 1.189168625597662, + "learning_rate": 6.725043648822967e-06, + "loss": 0.6089, + "step": 5839 + }, + { + "epoch": 0.7175328664455093, + "grad_norm": 1.266175823420697, + "learning_rate": 6.719678613090801e-06, + "loss": 0.4964, + "step": 5840 + }, + { + "epoch": 0.7176557316623664, + "grad_norm": 1.245378677795781, + "learning_rate": 6.714315100470053e-06, + "loss": 0.5989, + "step": 5841 + }, + { + "epoch": 0.7177785968792235, + "grad_norm": 1.2673256738776728, + "learning_rate": 6.708953111947308e-06, + "loss": 0.5609, + "step": 5842 + }, + { + "epoch": 0.7179014620960806, + "grad_norm": 1.394447056315142, + "learning_rate": 6.703592648508875e-06, + "loss": 0.5856, + "step": 5843 + }, + { + "epoch": 0.7180243273129377, + "grad_norm": 1.2561057041157506, + "learning_rate": 6.698233711140764e-06, + "loss": 0.5507, + "step": 5844 + }, + { + "epoch": 0.7181471925297949, + "grad_norm": 1.3673702330852875, + "learning_rate": 
6.692876300828723e-06, + "loss": 0.4616, + "step": 5845 + }, + { + "epoch": 0.7182700577466519, + "grad_norm": 1.0892524708049547, + "learning_rate": 6.687520418558219e-06, + "loss": 0.5948, + "step": 5846 + }, + { + "epoch": 0.718392922963509, + "grad_norm": 1.306129399523482, + "learning_rate": 6.68216606531442e-06, + "loss": 0.5445, + "step": 5847 + }, + { + "epoch": 0.7185157881803661, + "grad_norm": 1.2353131277077498, + "learning_rate": 6.676813242082236e-06, + "loss": 0.6348, + "step": 5848 + }, + { + "epoch": 0.7186386533972232, + "grad_norm": 1.1960504482141205, + "learning_rate": 6.671461949846265e-06, + "loss": 0.5705, + "step": 5849 + }, + { + "epoch": 0.7187615186140803, + "grad_norm": 1.1581117824682023, + "learning_rate": 6.6661121895908695e-06, + "loss": 0.5788, + "step": 5850 + }, + { + "epoch": 0.7188843838309374, + "grad_norm": 1.0885453901059732, + "learning_rate": 6.660763962300084e-06, + "loss": 0.6672, + "step": 5851 + }, + { + "epoch": 0.7190072490477946, + "grad_norm": 1.021082603113587, + "learning_rate": 6.6554172689576896e-06, + "loss": 0.6934, + "step": 5852 + }, + { + "epoch": 0.7191301142646517, + "grad_norm": 1.4147487807923342, + "learning_rate": 6.650072110547169e-06, + "loss": 0.5802, + "step": 5853 + }, + { + "epoch": 0.7192529794815088, + "grad_norm": 1.456128322366719, + "learning_rate": 6.64472848805173e-06, + "loss": 0.6713, + "step": 5854 + }, + { + "epoch": 0.7193758446983659, + "grad_norm": 1.5482187142607937, + "learning_rate": 6.639386402454302e-06, + "loss": 0.6006, + "step": 5855 + }, + { + "epoch": 0.719498709915223, + "grad_norm": 1.3313188204213053, + "learning_rate": 6.634045854737523e-06, + "loss": 0.6198, + "step": 5856 + }, + { + "epoch": 0.7196215751320801, + "grad_norm": 1.1935769977932142, + "learning_rate": 6.628706845883759e-06, + "loss": 0.551, + "step": 5857 + }, + { + "epoch": 0.7197444403489373, + "grad_norm": 0.9809574033746302, + "learning_rate": 6.623369376875077e-06, + "loss": 0.5906, + "step": 5858 + }, + { + "epoch": 0.7198673055657944, + "grad_norm": 1.1480795696286137, + "learning_rate": 6.618033448693279e-06, + "loss": 0.4643, + "step": 5859 + }, + { + "epoch": 0.7199901707826515, + "grad_norm": 1.3224795815770858, + "learning_rate": 6.612699062319858e-06, + "loss": 0.5762, + "step": 5860 + }, + { + "epoch": 0.7201130359995085, + "grad_norm": 1.2275632415973505, + "learning_rate": 6.607366218736062e-06, + "loss": 0.6079, + "step": 5861 + }, + { + "epoch": 0.7202359012163656, + "grad_norm": 1.4578528065837546, + "learning_rate": 6.602034918922816e-06, + "loss": 0.6368, + "step": 5862 + }, + { + "epoch": 0.7203587664332227, + "grad_norm": 1.4257493297019976, + "learning_rate": 6.59670516386079e-06, + "loss": 0.5525, + "step": 5863 + }, + { + "epoch": 0.7204816316500798, + "grad_norm": 1.408099849960531, + "learning_rate": 6.591376954530345e-06, + "loss": 0.6496, + "step": 5864 + }, + { + "epoch": 0.720604496866937, + "grad_norm": 1.041965922861159, + "learning_rate": 6.586050291911579e-06, + "loss": 0.508, + "step": 5865 + }, + { + "epoch": 0.7207273620837941, + "grad_norm": 1.2623752804977917, + "learning_rate": 6.5807251769843e-06, + "loss": 0.5828, + "step": 5866 + }, + { + "epoch": 0.7208502273006512, + "grad_norm": 1.108162636920431, + "learning_rate": 6.575401610728019e-06, + "loss": 0.5089, + "step": 5867 + }, + { + "epoch": 0.7209730925175083, + "grad_norm": 1.460852553840171, + "learning_rate": 6.570079594121976e-06, + "loss": 0.4718, + "step": 5868 + }, + { + "epoch": 0.7210959577343654, + "grad_norm": 
1.506222196836904, + "learning_rate": 6.5647591281451215e-06, + "loss": 0.6707, + "step": 5869 + }, + { + "epoch": 0.7212188229512225, + "grad_norm": 1.26504426927464, + "learning_rate": 6.559440213776126e-06, + "loss": 0.7019, + "step": 5870 + }, + { + "epoch": 0.7213416881680796, + "grad_norm": 1.4473614819118563, + "learning_rate": 6.554122851993359e-06, + "loss": 0.5905, + "step": 5871 + }, + { + "epoch": 0.7214645533849368, + "grad_norm": 1.2645483847930272, + "learning_rate": 6.54880704377492e-06, + "loss": 0.7117, + "step": 5872 + }, + { + "epoch": 0.7215874186017939, + "grad_norm": 1.1346258566050182, + "learning_rate": 6.543492790098623e-06, + "loss": 0.5783, + "step": 5873 + }, + { + "epoch": 0.721710283818651, + "grad_norm": 1.1944941966838851, + "learning_rate": 6.5381800919419805e-06, + "loss": 0.4596, + "step": 5874 + }, + { + "epoch": 0.721833149035508, + "grad_norm": 1.3847356282574934, + "learning_rate": 6.532868950282237e-06, + "loss": 0.551, + "step": 5875 + }, + { + "epoch": 0.7219560142523651, + "grad_norm": 0.998276745121459, + "learning_rate": 6.527559366096328e-06, + "loss": 0.5862, + "step": 5876 + }, + { + "epoch": 0.7220788794692222, + "grad_norm": 1.1960746727221903, + "learning_rate": 6.5222513403609405e-06, + "loss": 0.5407, + "step": 5877 + }, + { + "epoch": 0.7222017446860793, + "grad_norm": 1.2121705670926441, + "learning_rate": 6.5169448740524315e-06, + "loss": 0.6076, + "step": 5878 + }, + { + "epoch": 0.7223246099029365, + "grad_norm": 1.632566802957556, + "learning_rate": 6.511639968146898e-06, + "loss": 0.5831, + "step": 5879 + }, + { + "epoch": 0.7224474751197936, + "grad_norm": 1.249922835772653, + "learning_rate": 6.506336623620145e-06, + "loss": 0.5896, + "step": 5880 + }, + { + "epoch": 0.7225703403366507, + "grad_norm": 1.0771952403425245, + "learning_rate": 6.501034841447692e-06, + "loss": 0.495, + "step": 5881 + }, + { + "epoch": 0.7226932055535078, + "grad_norm": 1.6649935456645526, + "learning_rate": 6.495734622604757e-06, + "loss": 0.6819, + "step": 5882 + }, + { + "epoch": 0.7228160707703649, + "grad_norm": 1.326199440818517, + "learning_rate": 6.490435968066284e-06, + "loss": 0.6102, + "step": 5883 + }, + { + "epoch": 0.722938935987222, + "grad_norm": 1.4040447141637833, + "learning_rate": 6.485138878806937e-06, + "loss": 0.6631, + "step": 5884 + }, + { + "epoch": 0.7230618012040791, + "grad_norm": 1.034310870853518, + "learning_rate": 6.479843355801064e-06, + "loss": 0.6146, + "step": 5885 + }, + { + "epoch": 0.7231846664209363, + "grad_norm": 1.247525381713037, + "learning_rate": 6.474549400022757e-06, + "loss": 0.5254, + "step": 5886 + }, + { + "epoch": 0.7233075316377934, + "grad_norm": 1.0939967040409733, + "learning_rate": 6.469257012445788e-06, + "loss": 0.4725, + "step": 5887 + }, + { + "epoch": 0.7234303968546505, + "grad_norm": 1.3484032706354077, + "learning_rate": 6.463966194043678e-06, + "loss": 0.5708, + "step": 5888 + }, + { + "epoch": 0.7235532620715076, + "grad_norm": 1.2519822253477928, + "learning_rate": 6.458676945789624e-06, + "loss": 0.6783, + "step": 5889 + }, + { + "epoch": 0.7236761272883646, + "grad_norm": 1.182292211595881, + "learning_rate": 6.453389268656558e-06, + "loss": 0.6201, + "step": 5890 + }, + { + "epoch": 0.7237989925052217, + "grad_norm": 1.330363878433059, + "learning_rate": 6.448103163617103e-06, + "loss": 0.5394, + "step": 5891 + }, + { + "epoch": 0.7239218577220788, + "grad_norm": 1.2783003980284016, + "learning_rate": 6.442818631643612e-06, + "loss": 0.6113, + "step": 5892 + }, + { + "epoch": 
0.724044722938936, + "grad_norm": 1.300900595706287, + "learning_rate": 6.437535673708143e-06, + "loss": 0.587, + "step": 5893 + }, + { + "epoch": 0.7241675881557931, + "grad_norm": 0.9614770447119556, + "learning_rate": 6.432254290782452e-06, + "loss": 0.5052, + "step": 5894 + }, + { + "epoch": 0.7242904533726502, + "grad_norm": 1.274874404137922, + "learning_rate": 6.42697448383802e-06, + "loss": 0.5565, + "step": 5895 + }, + { + "epoch": 0.7244133185895073, + "grad_norm": 1.373522044669741, + "learning_rate": 6.421696253846033e-06, + "loss": 0.5155, + "step": 5896 + }, + { + "epoch": 0.7245361838063644, + "grad_norm": 1.0846491184633413, + "learning_rate": 6.416419601777395e-06, + "loss": 0.6001, + "step": 5897 + }, + { + "epoch": 0.7246590490232215, + "grad_norm": 1.1291527677621216, + "learning_rate": 6.411144528602693e-06, + "loss": 0.6141, + "step": 5898 + }, + { + "epoch": 0.7247819142400787, + "grad_norm": 1.3662341731540006, + "learning_rate": 6.405871035292266e-06, + "loss": 0.5789, + "step": 5899 + }, + { + "epoch": 0.7249047794569358, + "grad_norm": 1.3330922174913324, + "learning_rate": 6.40059912281612e-06, + "loss": 0.5758, + "step": 5900 + }, + { + "epoch": 0.7250276446737929, + "grad_norm": 1.1482048287579354, + "learning_rate": 6.395328792144003e-06, + "loss": 0.4388, + "step": 5901 + }, + { + "epoch": 0.72515050989065, + "grad_norm": 1.1567440331917391, + "learning_rate": 6.390060044245345e-06, + "loss": 0.6982, + "step": 5902 + }, + { + "epoch": 0.7252733751075071, + "grad_norm": 1.3265421998861628, + "learning_rate": 6.384792880089306e-06, + "loss": 0.5814, + "step": 5903 + }, + { + "epoch": 0.7253962403243642, + "grad_norm": 1.2290424290949398, + "learning_rate": 6.3795273006447505e-06, + "loss": 0.5888, + "step": 5904 + }, + { + "epoch": 0.7255191055412212, + "grad_norm": 1.3258951255819902, + "learning_rate": 6.3742633068802356e-06, + "loss": 0.482, + "step": 5905 + }, + { + "epoch": 0.7256419707580783, + "grad_norm": 1.0747516445633205, + "learning_rate": 6.369000899764046e-06, + "loss": 0.6336, + "step": 5906 + }, + { + "epoch": 0.7257648359749355, + "grad_norm": 1.1720069059810452, + "learning_rate": 6.363740080264166e-06, + "loss": 0.5521, + "step": 5907 + }, + { + "epoch": 0.7258877011917926, + "grad_norm": 1.0247347601216206, + "learning_rate": 6.358480849348296e-06, + "loss": 0.568, + "step": 5908 + }, + { + "epoch": 0.7260105664086497, + "grad_norm": 1.3829418598242205, + "learning_rate": 6.3532232079838275e-06, + "loss": 0.5832, + "step": 5909 + }, + { + "epoch": 0.7261334316255068, + "grad_norm": 1.9129859369169844, + "learning_rate": 6.347967157137873e-06, + "loss": 0.7403, + "step": 5910 + }, + { + "epoch": 0.7262562968423639, + "grad_norm": 1.4682502974136928, + "learning_rate": 6.342712697777254e-06, + "loss": 0.5661, + "step": 5911 + }, + { + "epoch": 0.726379162059221, + "grad_norm": 1.2427824115944508, + "learning_rate": 6.337459830868486e-06, + "loss": 0.5946, + "step": 5912 + }, + { + "epoch": 0.7265020272760782, + "grad_norm": 1.0845964632939977, + "learning_rate": 6.332208557377807e-06, + "loss": 0.5487, + "step": 5913 + }, + { + "epoch": 0.7266248924929353, + "grad_norm": 1.4755333067834688, + "learning_rate": 6.326958878271143e-06, + "loss": 0.563, + "step": 5914 + }, + { + "epoch": 0.7267477577097924, + "grad_norm": 1.1005801506073436, + "learning_rate": 6.321710794514154e-06, + "loss": 0.5698, + "step": 5915 + }, + { + "epoch": 0.7268706229266495, + "grad_norm": 1.1934159640738673, + "learning_rate": 6.3164643070721806e-06, + "loss": 
0.6238, + "step": 5916 + }, + { + "epoch": 0.7269934881435066, + "grad_norm": 1.395501281297613, + "learning_rate": 6.3112194169102885e-06, + "loss": 0.461, + "step": 5917 + }, + { + "epoch": 0.7271163533603637, + "grad_norm": 1.2762042769610031, + "learning_rate": 6.305976124993225e-06, + "loss": 0.592, + "step": 5918 + }, + { + "epoch": 0.7272392185772207, + "grad_norm": 1.109726512219594, + "learning_rate": 6.3007344322854815e-06, + "loss": 0.5161, + "step": 5919 + }, + { + "epoch": 0.7273620837940779, + "grad_norm": 1.2906211037528015, + "learning_rate": 6.295494339751217e-06, + "loss": 0.6052, + "step": 5920 + }, + { + "epoch": 0.727484949010935, + "grad_norm": 1.3145932217453877, + "learning_rate": 6.290255848354316e-06, + "loss": 0.5234, + "step": 5921 + }, + { + "epoch": 0.7276078142277921, + "grad_norm": 1.7723376009715528, + "learning_rate": 6.285018959058376e-06, + "loss": 0.6858, + "step": 5922 + }, + { + "epoch": 0.7277306794446492, + "grad_norm": 1.1134422401722797, + "learning_rate": 6.279783672826672e-06, + "loss": 0.6467, + "step": 5923 + }, + { + "epoch": 0.7278535446615063, + "grad_norm": 1.3086179643331146, + "learning_rate": 6.2745499906222136e-06, + "loss": 0.5943, + "step": 5924 + }, + { + "epoch": 0.7279764098783634, + "grad_norm": 1.2477066773697527, + "learning_rate": 6.269317913407688e-06, + "loss": 0.6428, + "step": 5925 + }, + { + "epoch": 0.7280992750952205, + "grad_norm": 1.2521216033209712, + "learning_rate": 6.264087442145524e-06, + "loss": 0.5792, + "step": 5926 + }, + { + "epoch": 0.7282221403120777, + "grad_norm": 1.204469864443805, + "learning_rate": 6.258858577797815e-06, + "loss": 0.6036, + "step": 5927 + }, + { + "epoch": 0.7283450055289348, + "grad_norm": 1.1417131723002953, + "learning_rate": 6.253631321326386e-06, + "loss": 0.5674, + "step": 5928 + }, + { + "epoch": 0.7284678707457919, + "grad_norm": 1.0973539554766425, + "learning_rate": 6.248405673692748e-06, + "loss": 0.5951, + "step": 5929 + }, + { + "epoch": 0.728590735962649, + "grad_norm": 1.2484040826524825, + "learning_rate": 6.243181635858131e-06, + "loss": 0.575, + "step": 5930 + }, + { + "epoch": 0.7287136011795061, + "grad_norm": 1.1844958523526037, + "learning_rate": 6.237959208783468e-06, + "loss": 0.6443, + "step": 5931 + }, + { + "epoch": 0.7288364663963632, + "grad_norm": 1.2809746459385443, + "learning_rate": 6.232738393429378e-06, + "loss": 0.5548, + "step": 5932 + }, + { + "epoch": 0.7289593316132204, + "grad_norm": 1.0363534178395724, + "learning_rate": 6.227519190756204e-06, + "loss": 0.5554, + "step": 5933 + }, + { + "epoch": 0.7290821968300774, + "grad_norm": 1.3955788303044536, + "learning_rate": 6.2223016017239835e-06, + "loss": 0.5876, + "step": 5934 + }, + { + "epoch": 0.7292050620469345, + "grad_norm": 1.0763451590920206, + "learning_rate": 6.217085627292463e-06, + "loss": 0.5376, + "step": 5935 + }, + { + "epoch": 0.7293279272637916, + "grad_norm": 1.0972807194165155, + "learning_rate": 6.2118712684210755e-06, + "loss": 0.5253, + "step": 5936 + }, + { + "epoch": 0.7294507924806487, + "grad_norm": 1.0571268208883549, + "learning_rate": 6.206658526068976e-06, + "loss": 0.5792, + "step": 5937 + }, + { + "epoch": 0.7295736576975058, + "grad_norm": 1.1298074260574107, + "learning_rate": 6.201447401195015e-06, + "loss": 0.5905, + "step": 5938 + }, + { + "epoch": 0.7296965229143629, + "grad_norm": 1.2544465283188764, + "learning_rate": 6.1962378947577486e-06, + "loss": 0.6032, + "step": 5939 + }, + { + "epoch": 0.72981938813122, + "grad_norm": 1.1364535976073067, + 
"learning_rate": 6.191030007715422e-06, + "loss": 0.5962, + "step": 5940 + }, + { + "epoch": 0.7299422533480772, + "grad_norm": 1.3799265287752271, + "learning_rate": 6.185823741025995e-06, + "loss": 0.6463, + "step": 5941 + }, + { + "epoch": 0.7300651185649343, + "grad_norm": 1.2026825809471708, + "learning_rate": 6.180619095647137e-06, + "loss": 0.5569, + "step": 5942 + }, + { + "epoch": 0.7301879837817914, + "grad_norm": 1.3459257984461155, + "learning_rate": 6.175416072536194e-06, + "loss": 0.5877, + "step": 5943 + }, + { + "epoch": 0.7303108489986485, + "grad_norm": 1.1533345005253142, + "learning_rate": 6.170214672650236e-06, + "loss": 0.5926, + "step": 5944 + }, + { + "epoch": 0.7304337142155056, + "grad_norm": 1.1810523849591672, + "learning_rate": 6.165014896946024e-06, + "loss": 0.4937, + "step": 5945 + }, + { + "epoch": 0.7305565794323627, + "grad_norm": 1.4059823448197015, + "learning_rate": 6.159816746380033e-06, + "loss": 0.6126, + "step": 5946 + }, + { + "epoch": 0.7306794446492199, + "grad_norm": 1.4520605462189493, + "learning_rate": 6.154620221908414e-06, + "loss": 0.5598, + "step": 5947 + }, + { + "epoch": 0.7308023098660769, + "grad_norm": 1.4818425933333033, + "learning_rate": 6.149425324487039e-06, + "loss": 0.6234, + "step": 5948 + }, + { + "epoch": 0.730925175082934, + "grad_norm": 1.7402680824444128, + "learning_rate": 6.144232055071485e-06, + "loss": 0.616, + "step": 5949 + }, + { + "epoch": 0.7310480402997911, + "grad_norm": 1.3141211841280032, + "learning_rate": 6.139040414617006e-06, + "loss": 0.492, + "step": 5950 + }, + { + "epoch": 0.7311709055166482, + "grad_norm": 1.1316602941277514, + "learning_rate": 6.133850404078585e-06, + "loss": 0.6132, + "step": 5951 + }, + { + "epoch": 0.7312937707335053, + "grad_norm": 1.2937085228202196, + "learning_rate": 6.128662024410871e-06, + "loss": 0.5258, + "step": 5952 + }, + { + "epoch": 0.7314166359503624, + "grad_norm": 1.2072587779478297, + "learning_rate": 6.123475276568257e-06, + "loss": 0.614, + "step": 5953 + }, + { + "epoch": 0.7315395011672196, + "grad_norm": 1.2061846015241873, + "learning_rate": 6.118290161504792e-06, + "loss": 0.6343, + "step": 5954 + }, + { + "epoch": 0.7316623663840767, + "grad_norm": 1.1360251862906505, + "learning_rate": 6.113106680174259e-06, + "loss": 0.6981, + "step": 5955 + }, + { + "epoch": 0.7317852316009338, + "grad_norm": 1.1564907474094646, + "learning_rate": 6.107924833530107e-06, + "loss": 0.6094, + "step": 5956 + }, + { + "epoch": 0.7319080968177909, + "grad_norm": 1.1288652984378609, + "learning_rate": 6.102744622525527e-06, + "loss": 0.5222, + "step": 5957 + }, + { + "epoch": 0.732030962034648, + "grad_norm": 1.2862212160413213, + "learning_rate": 6.097566048113365e-06, + "loss": 0.5733, + "step": 5958 + }, + { + "epoch": 0.7321538272515051, + "grad_norm": 1.2075806144187795, + "learning_rate": 6.092389111246201e-06, + "loss": 0.51, + "step": 5959 + }, + { + "epoch": 0.7322766924683622, + "grad_norm": 1.2333403638336322, + "learning_rate": 6.0872138128762866e-06, + "loss": 0.5283, + "step": 5960 + }, + { + "epoch": 0.7323995576852194, + "grad_norm": 1.2197854405846857, + "learning_rate": 6.08204015395559e-06, + "loss": 0.4928, + "step": 5961 + }, + { + "epoch": 0.7325224229020765, + "grad_norm": 1.3259220857267917, + "learning_rate": 6.076868135435778e-06, + "loss": 0.6056, + "step": 5962 + }, + { + "epoch": 0.7326452881189335, + "grad_norm": 1.4463684246246447, + "learning_rate": 6.071697758268192e-06, + "loss": 0.5903, + "step": 5963 + }, + { + "epoch": 
0.7327681533357906, + "grad_norm": 1.1483600549479456, + "learning_rate": 6.066529023403913e-06, + "loss": 0.4696, + "step": 5964 + }, + { + "epoch": 0.7328910185526477, + "grad_norm": 1.3368009324499486, + "learning_rate": 6.061361931793679e-06, + "loss": 0.4937, + "step": 5965 + }, + { + "epoch": 0.7330138837695048, + "grad_norm": 1.2541882269709037, + "learning_rate": 6.056196484387954e-06, + "loss": 0.6631, + "step": 5966 + }, + { + "epoch": 0.7331367489863619, + "grad_norm": 1.9830293190277053, + "learning_rate": 6.051032682136877e-06, + "loss": 0.7542, + "step": 5967 + }, + { + "epoch": 0.7332596142032191, + "grad_norm": 1.2351742583726097, + "learning_rate": 6.0458705259903015e-06, + "loss": 0.4148, + "step": 5968 + }, + { + "epoch": 0.7333824794200762, + "grad_norm": 1.4448325231082835, + "learning_rate": 6.04071001689778e-06, + "loss": 0.5496, + "step": 5969 + }, + { + "epoch": 0.7335053446369333, + "grad_norm": 1.267859894869566, + "learning_rate": 6.035551155808542e-06, + "loss": 0.7047, + "step": 5970 + }, + { + "epoch": 0.7336282098537904, + "grad_norm": 1.1089565257096052, + "learning_rate": 6.0303939436715324e-06, + "loss": 0.5056, + "step": 5971 + }, + { + "epoch": 0.7337510750706475, + "grad_norm": 1.2699012659995954, + "learning_rate": 6.025238381435387e-06, + "loss": 0.5287, + "step": 5972 + }, + { + "epoch": 0.7338739402875046, + "grad_norm": 1.1963148002393753, + "learning_rate": 6.020084470048444e-06, + "loss": 0.5251, + "step": 5973 + }, + { + "epoch": 0.7339968055043617, + "grad_norm": 1.2186325250156769, + "learning_rate": 6.01493221045872e-06, + "loss": 0.5778, + "step": 5974 + }, + { + "epoch": 0.7341196707212189, + "grad_norm": 1.1372699334153606, + "learning_rate": 6.0097816036139455e-06, + "loss": 0.6232, + "step": 5975 + }, + { + "epoch": 0.734242535938076, + "grad_norm": 1.3430751220172323, + "learning_rate": 6.004632650461542e-06, + "loss": 0.5674, + "step": 5976 + }, + { + "epoch": 0.734365401154933, + "grad_norm": 1.1960903613778302, + "learning_rate": 5.9994853519486284e-06, + "loss": 0.5804, + "step": 5977 + }, + { + "epoch": 0.7344882663717901, + "grad_norm": 1.09391820737372, + "learning_rate": 5.994339709022012e-06, + "loss": 0.6682, + "step": 5978 + }, + { + "epoch": 0.7346111315886472, + "grad_norm": 1.582821129127882, + "learning_rate": 5.9891957226282e-06, + "loss": 0.5616, + "step": 5979 + }, + { + "epoch": 0.7347339968055043, + "grad_norm": 1.2774247955815607, + "learning_rate": 5.984053393713405e-06, + "loss": 0.5446, + "step": 5980 + }, + { + "epoch": 0.7348568620223614, + "grad_norm": 1.1364319389711852, + "learning_rate": 5.97891272322351e-06, + "loss": 0.6178, + "step": 5981 + }, + { + "epoch": 0.7349797272392186, + "grad_norm": 1.149309820862441, + "learning_rate": 5.973773712104122e-06, + "loss": 0.6612, + "step": 5982 + }, + { + "epoch": 0.7351025924560757, + "grad_norm": 1.3380929231928833, + "learning_rate": 5.968636361300512e-06, + "loss": 0.5601, + "step": 5983 + }, + { + "epoch": 0.7352254576729328, + "grad_norm": 1.3770975218221415, + "learning_rate": 5.963500671757684e-06, + "loss": 0.6472, + "step": 5984 + }, + { + "epoch": 0.7353483228897899, + "grad_norm": 1.2442066080140806, + "learning_rate": 5.958366644420298e-06, + "loss": 0.7093, + "step": 5985 + }, + { + "epoch": 0.735471188106647, + "grad_norm": 1.0344792525846933, + "learning_rate": 5.9532342802327315e-06, + "loss": 0.579, + "step": 5986 + }, + { + "epoch": 0.7355940533235041, + "grad_norm": 1.1072300177138872, + "learning_rate": 5.948103580139052e-06, + "loss": 
0.5611, + "step": 5987 + }, + { + "epoch": 0.7357169185403613, + "grad_norm": 1.2861780738839328, + "learning_rate": 5.942974545083013e-06, + "loss": 0.5147, + "step": 5988 + }, + { + "epoch": 0.7358397837572184, + "grad_norm": 1.1631968816144655, + "learning_rate": 5.937847176008072e-06, + "loss": 0.5004, + "step": 5989 + }, + { + "epoch": 0.7359626489740755, + "grad_norm": 1.4123513251296511, + "learning_rate": 5.9327214738573645e-06, + "loss": 0.4884, + "step": 5990 + }, + { + "epoch": 0.7360855141909326, + "grad_norm": 1.088185507115247, + "learning_rate": 5.927597439573748e-06, + "loss": 0.5288, + "step": 5991 + }, + { + "epoch": 0.7362083794077896, + "grad_norm": 1.2043624305397838, + "learning_rate": 5.92247507409974e-06, + "loss": 0.5604, + "step": 5992 + }, + { + "epoch": 0.7363312446246467, + "grad_norm": 1.2374005616242658, + "learning_rate": 5.917354378377579e-06, + "loss": 0.6005, + "step": 5993 + }, + { + "epoch": 0.7364541098415038, + "grad_norm": 1.2192300886964207, + "learning_rate": 5.912235353349171e-06, + "loss": 0.5748, + "step": 5994 + }, + { + "epoch": 0.736576975058361, + "grad_norm": 1.1630264840369593, + "learning_rate": 5.907117999956134e-06, + "loss": 0.6274, + "step": 5995 + }, + { + "epoch": 0.7366998402752181, + "grad_norm": 1.183241522426505, + "learning_rate": 5.9020023191397766e-06, + "loss": 0.6259, + "step": 5996 + }, + { + "epoch": 0.7368227054920752, + "grad_norm": 1.2744069643042508, + "learning_rate": 5.896888311841084e-06, + "loss": 0.5441, + "step": 5997 + }, + { + "epoch": 0.7369455707089323, + "grad_norm": 1.169968376460733, + "learning_rate": 5.891775979000752e-06, + "loss": 0.5319, + "step": 5998 + }, + { + "epoch": 0.7370684359257894, + "grad_norm": 1.2011447435398324, + "learning_rate": 5.886665321559158e-06, + "loss": 0.6592, + "step": 5999 + }, + { + "epoch": 0.7371913011426465, + "grad_norm": 1.0319320468139719, + "learning_rate": 5.881556340456382e-06, + "loss": 0.5846, + "step": 6000 + }, + { + "epoch": 0.7373141663595036, + "grad_norm": 1.2698817725369442, + "learning_rate": 5.876449036632177e-06, + "loss": 0.6434, + "step": 6001 + }, + { + "epoch": 0.7374370315763608, + "grad_norm": 1.0166722020708958, + "learning_rate": 5.871343411026004e-06, + "loss": 0.6642, + "step": 6002 + }, + { + "epoch": 0.7375598967932179, + "grad_norm": 1.4917398292443638, + "learning_rate": 5.866239464577008e-06, + "loss": 0.6124, + "step": 6003 + }, + { + "epoch": 0.737682762010075, + "grad_norm": 1.1968932631243, + "learning_rate": 5.8611371982240344e-06, + "loss": 0.5535, + "step": 6004 + }, + { + "epoch": 0.7378056272269321, + "grad_norm": 1.2764286401208274, + "learning_rate": 5.856036612905598e-06, + "loss": 0.6282, + "step": 6005 + }, + { + "epoch": 0.7379284924437891, + "grad_norm": 1.3193297984251757, + "learning_rate": 5.850937709559929e-06, + "loss": 0.5564, + "step": 6006 + }, + { + "epoch": 0.7380513576606462, + "grad_norm": 1.2521457302985337, + "learning_rate": 5.845840489124939e-06, + "loss": 0.5837, + "step": 6007 + }, + { + "epoch": 0.7381742228775033, + "grad_norm": 1.380659765469898, + "learning_rate": 5.840744952538218e-06, + "loss": 0.5015, + "step": 6008 + }, + { + "epoch": 0.7382970880943605, + "grad_norm": 1.3265391379375717, + "learning_rate": 5.835651100737064e-06, + "loss": 0.5972, + "step": 6009 + }, + { + "epoch": 0.7384199533112176, + "grad_norm": 1.297428109609007, + "learning_rate": 5.8305589346584555e-06, + "loss": 0.5728, + "step": 6010 + }, + { + "epoch": 0.7385428185280747, + "grad_norm": 1.1726722835765486, + 
"learning_rate": 5.825468455239073e-06, + "loss": 0.7074, + "step": 6011 + }, + { + "epoch": 0.7386656837449318, + "grad_norm": 1.3524320325360817, + "learning_rate": 5.820379663415262e-06, + "loss": 0.5133, + "step": 6012 + }, + { + "epoch": 0.7387885489617889, + "grad_norm": 1.1589277357243195, + "learning_rate": 5.81529256012308e-06, + "loss": 0.5676, + "step": 6013 + }, + { + "epoch": 0.738911414178646, + "grad_norm": 1.237004786707932, + "learning_rate": 5.810207146298273e-06, + "loss": 0.5993, + "step": 6014 + }, + { + "epoch": 0.7390342793955031, + "grad_norm": 1.0277524056318577, + "learning_rate": 5.8051234228762574e-06, + "loss": 0.4883, + "step": 6015 + }, + { + "epoch": 0.7391571446123603, + "grad_norm": 1.3196894386463087, + "learning_rate": 5.800041390792163e-06, + "loss": 0.6824, + "step": 6016 + }, + { + "epoch": 0.7392800098292174, + "grad_norm": 1.2234524638160185, + "learning_rate": 5.79496105098078e-06, + "loss": 0.6819, + "step": 6017 + }, + { + "epoch": 0.7394028750460745, + "grad_norm": 1.30020437962447, + "learning_rate": 5.789882404376626e-06, + "loss": 0.4918, + "step": 6018 + }, + { + "epoch": 0.7395257402629316, + "grad_norm": 1.3522444806470741, + "learning_rate": 5.7848054519138686e-06, + "loss": 0.6636, + "step": 6019 + }, + { + "epoch": 0.7396486054797887, + "grad_norm": 1.2802008188229215, + "learning_rate": 5.77973019452639e-06, + "loss": 0.5443, + "step": 6020 + }, + { + "epoch": 0.7397714706966457, + "grad_norm": 1.5265771170141045, + "learning_rate": 5.7746566331477375e-06, + "loss": 0.591, + "step": 6021 + }, + { + "epoch": 0.7398943359135028, + "grad_norm": 1.1901625382169225, + "learning_rate": 5.769584768711178e-06, + "loss": 0.5486, + "step": 6022 + }, + { + "epoch": 0.74001720113036, + "grad_norm": 1.1494136577626306, + "learning_rate": 5.764514602149634e-06, + "loss": 0.6355, + "step": 6023 + }, + { + "epoch": 0.7401400663472171, + "grad_norm": 1.0393758524348675, + "learning_rate": 5.7594461343957416e-06, + "loss": 0.4735, + "step": 6024 + }, + { + "epoch": 0.7402629315640742, + "grad_norm": 1.1700334866565187, + "learning_rate": 5.7543793663817995e-06, + "loss": 0.6593, + "step": 6025 + }, + { + "epoch": 0.7403857967809313, + "grad_norm": 1.3691896217535486, + "learning_rate": 5.749314299039813e-06, + "loss": 0.6212, + "step": 6026 + }, + { + "epoch": 0.7405086619977884, + "grad_norm": 1.337860819267661, + "learning_rate": 5.744250933301473e-06, + "loss": 0.5703, + "step": 6027 + }, + { + "epoch": 0.7406315272146455, + "grad_norm": 1.0834616116919549, + "learning_rate": 5.739189270098137e-06, + "loss": 0.595, + "step": 6028 + }, + { + "epoch": 0.7407543924315027, + "grad_norm": 1.222051423102389, + "learning_rate": 5.734129310360889e-06, + "loss": 0.5357, + "step": 6029 + }, + { + "epoch": 0.7408772576483598, + "grad_norm": 1.3341920305954666, + "learning_rate": 5.729071055020456e-06, + "loss": 0.5699, + "step": 6030 + }, + { + "epoch": 0.7410001228652169, + "grad_norm": 1.2761497765819843, + "learning_rate": 5.724014505007285e-06, + "loss": 0.5389, + "step": 6031 + }, + { + "epoch": 0.741122988082074, + "grad_norm": 1.1614709998045407, + "learning_rate": 5.7189596612514814e-06, + "loss": 0.5717, + "step": 6032 + }, + { + "epoch": 0.7412458532989311, + "grad_norm": 1.2432202064633506, + "learning_rate": 5.71390652468286e-06, + "loss": 0.596, + "step": 6033 + }, + { + "epoch": 0.7413687185157882, + "grad_norm": 1.2830264710161703, + "learning_rate": 5.7088550962309175e-06, + "loss": 0.5265, + "step": 6034 + }, + { + "epoch": 0.7414915837326453, 
+ "grad_norm": 1.0342901730090683, + "learning_rate": 5.703805376824817e-06, + "loss": 0.5165, + "step": 6035 + }, + { + "epoch": 0.7416144489495023, + "grad_norm": 1.4783153634721442, + "learning_rate": 5.69875736739343e-06, + "loss": 0.6446, + "step": 6036 + }, + { + "epoch": 0.7417373141663595, + "grad_norm": 1.320644368665161, + "learning_rate": 5.693711068865307e-06, + "loss": 0.6387, + "step": 6037 + }, + { + "epoch": 0.7418601793832166, + "grad_norm": 1.1737786084778394, + "learning_rate": 5.688666482168682e-06, + "loss": 0.5594, + "step": 6038 + }, + { + "epoch": 0.7419830446000737, + "grad_norm": 1.031752689088059, + "learning_rate": 5.683623608231467e-06, + "loss": 0.5765, + "step": 6039 + }, + { + "epoch": 0.7421059098169308, + "grad_norm": 1.2002489814169268, + "learning_rate": 5.678582447981271e-06, + "loss": 0.6122, + "step": 6040 + }, + { + "epoch": 0.7422287750337879, + "grad_norm": 1.2611756341580498, + "learning_rate": 5.673543002345383e-06, + "loss": 0.6215, + "step": 6041 + }, + { + "epoch": 0.742351640250645, + "grad_norm": 1.4438676866866063, + "learning_rate": 5.66850527225078e-06, + "loss": 0.674, + "step": 6042 + }, + { + "epoch": 0.7424745054675022, + "grad_norm": 1.4561988233519245, + "learning_rate": 5.663469258624109e-06, + "loss": 0.699, + "step": 6043 + }, + { + "epoch": 0.7425973706843593, + "grad_norm": 1.0284174656433023, + "learning_rate": 5.658434962391719e-06, + "loss": 0.5688, + "step": 6044 + }, + { + "epoch": 0.7427202359012164, + "grad_norm": 1.1855071106902055, + "learning_rate": 5.653402384479642e-06, + "loss": 0.5861, + "step": 6045 + }, + { + "epoch": 0.7428431011180735, + "grad_norm": 1.2311506919796311, + "learning_rate": 5.648371525813575e-06, + "loss": 0.6429, + "step": 6046 + }, + { + "epoch": 0.7429659663349306, + "grad_norm": 1.3117150448098116, + "learning_rate": 5.6433423873189184e-06, + "loss": 0.5261, + "step": 6047 + }, + { + "epoch": 0.7430888315517877, + "grad_norm": 1.3101177328395803, + "learning_rate": 5.638314969920749e-06, + "loss": 0.5316, + "step": 6048 + }, + { + "epoch": 0.7432116967686448, + "grad_norm": 1.7512872937941992, + "learning_rate": 5.633289274543835e-06, + "loss": 0.629, + "step": 6049 + }, + { + "epoch": 0.7433345619855019, + "grad_norm": 1.093438423117958, + "learning_rate": 5.628265302112607e-06, + "loss": 0.5206, + "step": 6050 + }, + { + "epoch": 0.743457427202359, + "grad_norm": 1.2703556848558022, + "learning_rate": 5.623243053551199e-06, + "loss": 0.6004, + "step": 6051 + }, + { + "epoch": 0.7435802924192161, + "grad_norm": 1.0767379304353755, + "learning_rate": 5.618222529783428e-06, + "loss": 0.5467, + "step": 6052 + }, + { + "epoch": 0.7437031576360732, + "grad_norm": 1.090686818516809, + "learning_rate": 5.613203731732772e-06, + "loss": 0.5954, + "step": 6053 + }, + { + "epoch": 0.7438260228529303, + "grad_norm": 1.345758458008963, + "learning_rate": 5.608186660322421e-06, + "loss": 0.6289, + "step": 6054 + }, + { + "epoch": 0.7439488880697874, + "grad_norm": 1.2378387278421428, + "learning_rate": 5.603171316475213e-06, + "loss": 0.6071, + "step": 6055 + }, + { + "epoch": 0.7440717532866445, + "grad_norm": 1.407903280891253, + "learning_rate": 5.598157701113714e-06, + "loss": 0.5653, + "step": 6056 + }, + { + "epoch": 0.7441946185035017, + "grad_norm": 1.2773942282658086, + "learning_rate": 5.593145815160127e-06, + "loss": 0.5962, + "step": 6057 + }, + { + "epoch": 0.7443174837203588, + "grad_norm": 1.0284426236178532, + "learning_rate": 5.588135659536366e-06, + "loss": 0.6871, + "step": 6058 + }, + 
{ + "epoch": 0.7444403489372159, + "grad_norm": 1.2113082018050378, + "learning_rate": 5.583127235164003e-06, + "loss": 0.581, + "step": 6059 + }, + { + "epoch": 0.744563214154073, + "grad_norm": 1.3996241452045892, + "learning_rate": 5.578120542964324e-06, + "loss": 0.6388, + "step": 6060 + }, + { + "epoch": 0.7446860793709301, + "grad_norm": 1.448496226776883, + "learning_rate": 5.573115583858262e-06, + "loss": 0.582, + "step": 6061 + }, + { + "epoch": 0.7448089445877872, + "grad_norm": 1.3096975882700905, + "learning_rate": 5.568112358766461e-06, + "loss": 0.5868, + "step": 6062 + }, + { + "epoch": 0.7449318098046444, + "grad_norm": 1.2467837349311237, + "learning_rate": 5.563110868609215e-06, + "loss": 0.6521, + "step": 6063 + }, + { + "epoch": 0.7450546750215015, + "grad_norm": 1.1651963975809811, + "learning_rate": 5.5581111143065265e-06, + "loss": 0.5628, + "step": 6064 + }, + { + "epoch": 0.7451775402383585, + "grad_norm": 1.3373911580584463, + "learning_rate": 5.55311309677807e-06, + "loss": 0.7413, + "step": 6065 + }, + { + "epoch": 0.7453004054552156, + "grad_norm": 1.0156222455888035, + "learning_rate": 5.548116816943191e-06, + "loss": 0.51, + "step": 6066 + }, + { + "epoch": 0.7454232706720727, + "grad_norm": 1.0477607363752004, + "learning_rate": 5.543122275720922e-06, + "loss": 0.496, + "step": 6067 + }, + { + "epoch": 0.7455461358889298, + "grad_norm": 1.107352628700259, + "learning_rate": 5.538129474029984e-06, + "loss": 0.6389, + "step": 6068 + }, + { + "epoch": 0.7456690011057869, + "grad_norm": 1.3697580022233429, + "learning_rate": 5.533138412788771e-06, + "loss": 0.5557, + "step": 6069 + }, + { + "epoch": 0.745791866322644, + "grad_norm": 1.4529111356403113, + "learning_rate": 5.528149092915346e-06, + "loss": 0.6221, + "step": 6070 + }, + { + "epoch": 0.7459147315395012, + "grad_norm": 1.2491690982821884, + "learning_rate": 5.523161515327469e-06, + "loss": 0.6295, + "step": 6071 + }, + { + "epoch": 0.7460375967563583, + "grad_norm": 1.2559167473495108, + "learning_rate": 5.518175680942577e-06, + "loss": 0.4856, + "step": 6072 + }, + { + "epoch": 0.7461604619732154, + "grad_norm": 1.3168904145230191, + "learning_rate": 5.513191590677772e-06, + "loss": 0.5608, + "step": 6073 + }, + { + "epoch": 0.7462833271900725, + "grad_norm": 1.3354579303629706, + "learning_rate": 5.508209245449849e-06, + "loss": 0.5437, + "step": 6074 + }, + { + "epoch": 0.7464061924069296, + "grad_norm": 1.1841902713986125, + "learning_rate": 5.503228646175278e-06, + "loss": 0.6426, + "step": 6075 + }, + { + "epoch": 0.7465290576237867, + "grad_norm": 1.2754241581672172, + "learning_rate": 5.498249793770216e-06, + "loss": 0.6013, + "step": 6076 + }, + { + "epoch": 0.7466519228406439, + "grad_norm": 0.9659092244574345, + "learning_rate": 5.493272689150478e-06, + "loss": 0.538, + "step": 6077 + }, + { + "epoch": 0.746774788057501, + "grad_norm": 1.252808086029255, + "learning_rate": 5.4882973332315746e-06, + "loss": 0.6307, + "step": 6078 + }, + { + "epoch": 0.746897653274358, + "grad_norm": 1.2146466694336877, + "learning_rate": 5.4833237269286915e-06, + "loss": 0.4044, + "step": 6079 + }, + { + "epoch": 0.7470205184912151, + "grad_norm": 1.242132717296946, + "learning_rate": 5.478351871156696e-06, + "loss": 0.6377, + "step": 6080 + }, + { + "epoch": 0.7471433837080722, + "grad_norm": 1.4479828061014228, + "learning_rate": 5.473381766830119e-06, + "loss": 0.4967, + "step": 6081 + }, + { + "epoch": 0.7472662489249293, + "grad_norm": 1.142134231146321, + "learning_rate": 5.468413414863184e-06, + 
"loss": 0.8034, + "step": 6082 + }, + { + "epoch": 0.7473891141417864, + "grad_norm": 1.259165955092884, + "learning_rate": 5.463446816169792e-06, + "loss": 0.5917, + "step": 6083 + }, + { + "epoch": 0.7475119793586436, + "grad_norm": 1.2563544795477573, + "learning_rate": 5.458481971663505e-06, + "loss": 0.5482, + "step": 6084 + }, + { + "epoch": 0.7476348445755007, + "grad_norm": 1.3367601963924942, + "learning_rate": 5.453518882257586e-06, + "loss": 0.6902, + "step": 6085 + }, + { + "epoch": 0.7477577097923578, + "grad_norm": 1.2211660584750295, + "learning_rate": 5.448557548864948e-06, + "loss": 0.5078, + "step": 6086 + }, + { + "epoch": 0.7478805750092149, + "grad_norm": 1.4390848227927397, + "learning_rate": 5.4435979723982145e-06, + "loss": 0.6475, + "step": 6087 + }, + { + "epoch": 0.748003440226072, + "grad_norm": 1.3301476730524902, + "learning_rate": 5.438640153769654e-06, + "loss": 0.6309, + "step": 6088 + }, + { + "epoch": 0.7481263054429291, + "grad_norm": 1.3365094298724436, + "learning_rate": 5.433684093891231e-06, + "loss": 0.5787, + "step": 6089 + }, + { + "epoch": 0.7482491706597862, + "grad_norm": 1.0099168728841101, + "learning_rate": 5.428729793674582e-06, + "loss": 0.6494, + "step": 6090 + }, + { + "epoch": 0.7483720358766434, + "grad_norm": 1.2636693648761934, + "learning_rate": 5.423777254031013e-06, + "loss": 0.5569, + "step": 6091 + }, + { + "epoch": 0.7484949010935005, + "grad_norm": 1.625175580808489, + "learning_rate": 5.4188264758715165e-06, + "loss": 0.6215, + "step": 6092 + }, + { + "epoch": 0.7486177663103576, + "grad_norm": 1.3529318446552734, + "learning_rate": 5.4138774601067456e-06, + "loss": 0.5353, + "step": 6093 + }, + { + "epoch": 0.7487406315272146, + "grad_norm": 1.4421611053600714, + "learning_rate": 5.408930207647057e-06, + "loss": 0.6331, + "step": 6094 + }, + { + "epoch": 0.7488634967440717, + "grad_norm": 1.4879297539763068, + "learning_rate": 5.403984719402452e-06, + "loss": 0.6621, + "step": 6095 + }, + { + "epoch": 0.7489863619609288, + "grad_norm": 1.1211069764683337, + "learning_rate": 5.399040996282631e-06, + "loss": 0.6214, + "step": 6096 + }, + { + "epoch": 0.7491092271777859, + "grad_norm": 0.9889354233537592, + "learning_rate": 5.394099039196947e-06, + "loss": 0.6027, + "step": 6097 + }, + { + "epoch": 0.7492320923946431, + "grad_norm": 1.3037438374079497, + "learning_rate": 5.38915884905445e-06, + "loss": 0.5803, + "step": 6098 + }, + { + "epoch": 0.7493549576115002, + "grad_norm": 1.2825431032011878, + "learning_rate": 5.384220426763854e-06, + "loss": 0.6134, + "step": 6099 + }, + { + "epoch": 0.7494778228283573, + "grad_norm": 1.316272283376697, + "learning_rate": 5.379283773233556e-06, + "loss": 0.6671, + "step": 6100 + }, + { + "epoch": 0.7496006880452144, + "grad_norm": 1.0121925444146407, + "learning_rate": 5.374348889371608e-06, + "loss": 0.5582, + "step": 6101 + }, + { + "epoch": 0.7497235532620715, + "grad_norm": 1.1676704592210234, + "learning_rate": 5.369415776085759e-06, + "loss": 0.5957, + "step": 6102 + }, + { + "epoch": 0.7498464184789286, + "grad_norm": 2.3826005274307875, + "learning_rate": 5.364484434283427e-06, + "loss": 0.7237, + "step": 6103 + }, + { + "epoch": 0.7499692836957857, + "grad_norm": 1.26556584812381, + "learning_rate": 5.3595548648716884e-06, + "loss": 0.6356, + "step": 6104 + }, + { + "epoch": 0.7500921489126429, + "grad_norm": 1.2742291296398531, + "learning_rate": 5.354627068757311e-06, + "loss": 0.4672, + "step": 6105 + }, + { + "epoch": 0.7502150141295, + "grad_norm": 1.529380974434644, + 
"learning_rate": 5.349701046846734e-06, + "loss": 0.657, + "step": 6106 + }, + { + "epoch": 0.7503378793463571, + "grad_norm": 1.0173141226317903, + "learning_rate": 5.344776800046068e-06, + "loss": 0.5586, + "step": 6107 + }, + { + "epoch": 0.7504607445632141, + "grad_norm": 1.2751864603319956, + "learning_rate": 5.33985432926109e-06, + "loss": 0.6142, + "step": 6108 + }, + { + "epoch": 0.7505836097800712, + "grad_norm": 1.0585410319364028, + "learning_rate": 5.334933635397261e-06, + "loss": 0.639, + "step": 6109 + }, + { + "epoch": 0.7507064749969283, + "grad_norm": 1.1691734362489559, + "learning_rate": 5.330014719359712e-06, + "loss": 0.6146, + "step": 6110 + }, + { + "epoch": 0.7508293402137854, + "grad_norm": 1.196801053660907, + "learning_rate": 5.325097582053239e-06, + "loss": 0.5665, + "step": 6111 + }, + { + "epoch": 0.7509522054306426, + "grad_norm": 1.054031998669263, + "learning_rate": 5.320182224382322e-06, + "loss": 0.6945, + "step": 6112 + }, + { + "epoch": 0.7510750706474997, + "grad_norm": 1.1985353137494985, + "learning_rate": 5.315268647251109e-06, + "loss": 0.5241, + "step": 6113 + }, + { + "epoch": 0.7511979358643568, + "grad_norm": 1.4198611645549337, + "learning_rate": 5.310356851563427e-06, + "loss": 0.5883, + "step": 6114 + }, + { + "epoch": 0.7513208010812139, + "grad_norm": 1.4801667084919736, + "learning_rate": 5.305446838222757e-06, + "loss": 0.5917, + "step": 6115 + }, + { + "epoch": 0.751443666298071, + "grad_norm": 1.657229424621451, + "learning_rate": 5.300538608132269e-06, + "loss": 0.5299, + "step": 6116 + }, + { + "epoch": 0.7515665315149281, + "grad_norm": 1.1982248542615805, + "learning_rate": 5.295632162194806e-06, + "loss": 0.5749, + "step": 6117 + }, + { + "epoch": 0.7516893967317853, + "grad_norm": 1.235749036147684, + "learning_rate": 5.290727501312867e-06, + "loss": 0.4401, + "step": 6118 + }, + { + "epoch": 0.7518122619486424, + "grad_norm": 1.0895257425129083, + "learning_rate": 5.285824626388641e-06, + "loss": 0.6312, + "step": 6119 + }, + { + "epoch": 0.7519351271654995, + "grad_norm": 1.3182467364675798, + "learning_rate": 5.280923538323967e-06, + "loss": 0.64, + "step": 6120 + }, + { + "epoch": 0.7520579923823566, + "grad_norm": 1.2106496567939957, + "learning_rate": 5.276024238020389e-06, + "loss": 0.4504, + "step": 6121 + }, + { + "epoch": 0.7521808575992137, + "grad_norm": 1.1865498924379072, + "learning_rate": 5.2711267263790845e-06, + "loss": 0.5681, + "step": 6122 + }, + { + "epoch": 0.7523037228160707, + "grad_norm": 1.2874707682428514, + "learning_rate": 5.2662310043009295e-06, + "loss": 0.5759, + "step": 6123 + }, + { + "epoch": 0.7524265880329278, + "grad_norm": 1.3517991895554586, + "learning_rate": 5.2613370726864445e-06, + "loss": 0.5021, + "step": 6124 + }, + { + "epoch": 0.752549453249785, + "grad_norm": 1.2013241688223106, + "learning_rate": 5.256444932435859e-06, + "loss": 0.6487, + "step": 6125 + }, + { + "epoch": 0.7526723184666421, + "grad_norm": 1.0977314099593183, + "learning_rate": 5.251554584449034e-06, + "loss": 0.6188, + "step": 6126 + }, + { + "epoch": 0.7527951836834992, + "grad_norm": 1.5069596800774885, + "learning_rate": 5.246666029625527e-06, + "loss": 0.5148, + "step": 6127 + }, + { + "epoch": 0.7529180489003563, + "grad_norm": 1.2193935419740007, + "learning_rate": 5.241779268864546e-06, + "loss": 0.5836, + "step": 6128 + }, + { + "epoch": 0.7530409141172134, + "grad_norm": 1.3105081364373665, + "learning_rate": 5.2368943030649835e-06, + "loss": 0.6254, + "step": 6129 + }, + { + "epoch": 
0.7531637793340705, + "grad_norm": 1.2289434304229043, + "learning_rate": 5.2320111331254054e-06, + "loss": 0.5794, + "step": 6130 + }, + { + "epoch": 0.7532866445509276, + "grad_norm": 1.184804193250878, + "learning_rate": 5.227129759944024e-06, + "loss": 0.5688, + "step": 6131 + }, + { + "epoch": 0.7534095097677848, + "grad_norm": 1.3139461522279745, + "learning_rate": 5.2222501844187465e-06, + "loss": 0.549, + "step": 6132 + }, + { + "epoch": 0.7535323749846419, + "grad_norm": 1.4339102776878776, + "learning_rate": 5.217372407447135e-06, + "loss": 0.5826, + "step": 6133 + }, + { + "epoch": 0.753655240201499, + "grad_norm": 1.0487029947852495, + "learning_rate": 5.212496429926432e-06, + "loss": 0.484, + "step": 6134 + }, + { + "epoch": 0.7537781054183561, + "grad_norm": 1.4212711989058104, + "learning_rate": 5.2076222527535296e-06, + "loss": 0.679, + "step": 6135 + }, + { + "epoch": 0.7539009706352132, + "grad_norm": 1.2610673705348483, + "learning_rate": 5.202749876825011e-06, + "loss": 0.5446, + "step": 6136 + }, + { + "epoch": 0.7540238358520703, + "grad_norm": 1.3027082000811856, + "learning_rate": 5.197879303037119e-06, + "loss": 0.6387, + "step": 6137 + }, + { + "epoch": 0.7541467010689273, + "grad_norm": 1.2118673595606397, + "learning_rate": 5.193010532285755e-06, + "loss": 0.5791, + "step": 6138 + }, + { + "epoch": 0.7542695662857845, + "grad_norm": 1.0593746234212837, + "learning_rate": 5.188143565466503e-06, + "loss": 0.4491, + "step": 6139 + }, + { + "epoch": 0.7543924315026416, + "grad_norm": 1.3584070051575179, + "learning_rate": 5.183278403474611e-06, + "loss": 0.7176, + "step": 6140 + }, + { + "epoch": 0.7545152967194987, + "grad_norm": 1.499931011260162, + "learning_rate": 5.1784150472049975e-06, + "loss": 0.6363, + "step": 6141 + }, + { + "epoch": 0.7546381619363558, + "grad_norm": 1.5272644928820764, + "learning_rate": 5.173553497552235e-06, + "loss": 0.6509, + "step": 6142 + }, + { + "epoch": 0.7547610271532129, + "grad_norm": 1.4759613239165421, + "learning_rate": 5.168693755410581e-06, + "loss": 0.4918, + "step": 6143 + }, + { + "epoch": 0.75488389237007, + "grad_norm": 1.228810580076364, + "learning_rate": 5.163835821673952e-06, + "loss": 0.5377, + "step": 6144 + }, + { + "epoch": 0.7550067575869271, + "grad_norm": 1.1113373284193049, + "learning_rate": 5.158979697235938e-06, + "loss": 0.6257, + "step": 6145 + }, + { + "epoch": 0.7551296228037843, + "grad_norm": 1.105737129222101, + "learning_rate": 5.154125382989783e-06, + "loss": 0.6495, + "step": 6146 + }, + { + "epoch": 0.7552524880206414, + "grad_norm": 1.118900294928338, + "learning_rate": 5.149272879828411e-06, + "loss": 0.5114, + "step": 6147 + }, + { + "epoch": 0.7553753532374985, + "grad_norm": 1.1864789048762436, + "learning_rate": 5.144422188644414e-06, + "loss": 0.6526, + "step": 6148 + }, + { + "epoch": 0.7554982184543556, + "grad_norm": 1.1015799944886335, + "learning_rate": 5.139573310330035e-06, + "loss": 0.5812, + "step": 6149 + }, + { + "epoch": 0.7556210836712127, + "grad_norm": 1.6425011729817087, + "learning_rate": 5.134726245777202e-06, + "loss": 0.5987, + "step": 6150 + }, + { + "epoch": 0.7557439488880698, + "grad_norm": 1.3189035533624787, + "learning_rate": 5.1298809958774884e-06, + "loss": 0.5212, + "step": 6151 + }, + { + "epoch": 0.7558668141049268, + "grad_norm": 1.0176286382053101, + "learning_rate": 5.125037561522166e-06, + "loss": 0.59, + "step": 6152 + }, + { + "epoch": 0.755989679321784, + "grad_norm": 1.407559640736647, + "learning_rate": 5.120195943602138e-06, + "loss": 0.6334, 
+ "step": 6153 + }, + { + "epoch": 0.7561125445386411, + "grad_norm": 1.2585253669721768, + "learning_rate": 5.115356143007993e-06, + "loss": 0.5648, + "step": 6154 + }, + { + "epoch": 0.7562354097554982, + "grad_norm": 1.121747527505758, + "learning_rate": 5.110518160629987e-06, + "loss": 0.5633, + "step": 6155 + }, + { + "epoch": 0.7563582749723553, + "grad_norm": 0.9225109886306078, + "learning_rate": 5.105681997358023e-06, + "loss": 0.6463, + "step": 6156 + }, + { + "epoch": 0.7564811401892124, + "grad_norm": 1.1917777836092838, + "learning_rate": 5.100847654081695e-06, + "loss": 0.585, + "step": 6157 + }, + { + "epoch": 0.7566040054060695, + "grad_norm": 1.0895499549305776, + "learning_rate": 5.096015131690233e-06, + "loss": 0.6331, + "step": 6158 + }, + { + "epoch": 0.7567268706229267, + "grad_norm": 0.901903859344225, + "learning_rate": 5.091184431072567e-06, + "loss": 0.5574, + "step": 6159 + }, + { + "epoch": 0.7568497358397838, + "grad_norm": 1.4040315849000065, + "learning_rate": 5.086355553117259e-06, + "loss": 0.6491, + "step": 6160 + }, + { + "epoch": 0.7569726010566409, + "grad_norm": 1.145943015333412, + "learning_rate": 5.08152849871256e-06, + "loss": 0.4806, + "step": 6161 + }, + { + "epoch": 0.757095466273498, + "grad_norm": 1.2144487851649939, + "learning_rate": 5.07670326874636e-06, + "loss": 0.5783, + "step": 6162 + }, + { + "epoch": 0.7572183314903551, + "grad_norm": 1.0630192076072107, + "learning_rate": 5.07187986410625e-06, + "loss": 0.627, + "step": 6163 + }, + { + "epoch": 0.7573411967072122, + "grad_norm": 1.441261929979935, + "learning_rate": 5.067058285679448e-06, + "loss": 0.5747, + "step": 6164 + }, + { + "epoch": 0.7574640619240693, + "grad_norm": 1.2627915262666378, + "learning_rate": 5.06223853435286e-06, + "loss": 0.5178, + "step": 6165 + }, + { + "epoch": 0.7575869271409265, + "grad_norm": 1.2279999824632948, + "learning_rate": 5.057420611013041e-06, + "loss": 0.6524, + "step": 6166 + }, + { + "epoch": 0.7577097923577835, + "grad_norm": 1.268294591501964, + "learning_rate": 5.052604516546221e-06, + "loss": 0.5171, + "step": 6167 + }, + { + "epoch": 0.7578326575746406, + "grad_norm": 1.544119733374258, + "learning_rate": 5.047790251838293e-06, + "loss": 0.6296, + "step": 6168 + }, + { + "epoch": 0.7579555227914977, + "grad_norm": 1.3334093714663329, + "learning_rate": 5.042977817774802e-06, + "loss": 0.5049, + "step": 6169 + }, + { + "epoch": 0.7580783880083548, + "grad_norm": 1.387452131540937, + "learning_rate": 5.038167215240967e-06, + "loss": 0.5679, + "step": 6170 + }, + { + "epoch": 0.7582012532252119, + "grad_norm": 1.5397749387987048, + "learning_rate": 5.033358445121669e-06, + "loss": 0.5457, + "step": 6171 + }, + { + "epoch": 0.758324118442069, + "grad_norm": 1.1448887450597311, + "learning_rate": 5.028551508301453e-06, + "loss": 0.5245, + "step": 6172 + }, + { + "epoch": 0.7584469836589262, + "grad_norm": 0.9206369068121089, + "learning_rate": 5.0237464056645155e-06, + "loss": 0.577, + "step": 6173 + }, + { + "epoch": 0.7585698488757833, + "grad_norm": 1.1482537768766417, + "learning_rate": 5.0189431380947295e-06, + "loss": 0.5781, + "step": 6174 + }, + { + "epoch": 0.7586927140926404, + "grad_norm": 1.0220101529823937, + "learning_rate": 5.014141706475626e-06, + "loss": 0.5492, + "step": 6175 + }, + { + "epoch": 0.7588155793094975, + "grad_norm": 1.3622774311055634, + "learning_rate": 5.009342111690393e-06, + "loss": 0.5655, + "step": 6176 + }, + { + "epoch": 0.7589384445263546, + "grad_norm": 1.0652049592489619, + "learning_rate": 
5.0045443546218855e-06, + "loss": 0.6516, + "step": 6177 + }, + { + "epoch": 0.7590613097432117, + "grad_norm": 1.0541472095861057, + "learning_rate": 4.999748436152621e-06, + "loss": 0.5034, + "step": 6178 + }, + { + "epoch": 0.7591841749600688, + "grad_norm": 1.3687837913679113, + "learning_rate": 4.9949543571647834e-06, + "loss": 0.4931, + "step": 6179 + }, + { + "epoch": 0.759307040176926, + "grad_norm": 1.2365334204426892, + "learning_rate": 4.9901621185402005e-06, + "loss": 0.5504, + "step": 6180 + }, + { + "epoch": 0.759429905393783, + "grad_norm": 1.1483223381318586, + "learning_rate": 4.985371721160381e-06, + "loss": 0.5601, + "step": 6181 + }, + { + "epoch": 0.7595527706106401, + "grad_norm": 1.268905294065355, + "learning_rate": 4.980583165906486e-06, + "loss": 0.6327, + "step": 6182 + }, + { + "epoch": 0.7596756358274972, + "grad_norm": 1.126703186176789, + "learning_rate": 4.9757964536593444e-06, + "loss": 0.5699, + "step": 6183 + }, + { + "epoch": 0.7597985010443543, + "grad_norm": 1.1687955472833225, + "learning_rate": 4.971011585299431e-06, + "loss": 0.5893, + "step": 6184 + }, + { + "epoch": 0.7599213662612114, + "grad_norm": 1.7013007344719617, + "learning_rate": 4.966228561706895e-06, + "loss": 0.6978, + "step": 6185 + }, + { + "epoch": 0.7600442314780685, + "grad_norm": 1.282032821629575, + "learning_rate": 4.9614473837615505e-06, + "loss": 0.5414, + "step": 6186 + }, + { + "epoch": 0.7601670966949257, + "grad_norm": 1.1737288099912215, + "learning_rate": 4.956668052342852e-06, + "loss": 0.5107, + "step": 6187 + }, + { + "epoch": 0.7602899619117828, + "grad_norm": 1.081648149539416, + "learning_rate": 4.951890568329937e-06, + "loss": 0.5472, + "step": 6188 + }, + { + "epoch": 0.7604128271286399, + "grad_norm": 1.2943207177251297, + "learning_rate": 4.947114932601577e-06, + "loss": 0.5688, + "step": 6189 + }, + { + "epoch": 0.760535692345497, + "grad_norm": 1.1216193742291742, + "learning_rate": 4.94234114603624e-06, + "loss": 0.5954, + "step": 6190 + }, + { + "epoch": 0.7606585575623541, + "grad_norm": 1.136562457481189, + "learning_rate": 4.937569209512019e-06, + "loss": 0.6068, + "step": 6191 + }, + { + "epoch": 0.7607814227792112, + "grad_norm": 1.4005803900789797, + "learning_rate": 4.9327991239066885e-06, + "loss": 0.6366, + "step": 6192 + }, + { + "epoch": 0.7609042879960684, + "grad_norm": 1.109078145593001, + "learning_rate": 4.928030890097666e-06, + "loss": 0.4931, + "step": 6193 + }, + { + "epoch": 0.7610271532129255, + "grad_norm": 1.3206458846754716, + "learning_rate": 4.923264508962044e-06, + "loss": 0.7064, + "step": 6194 + }, + { + "epoch": 0.7611500184297826, + "grad_norm": 1.199083028037502, + "learning_rate": 4.91849998137657e-06, + "loss": 0.6143, + "step": 6195 + }, + { + "epoch": 0.7612728836466396, + "grad_norm": 1.1561876973244052, + "learning_rate": 4.9137373082176336e-06, + "loss": 0.7175, + "step": 6196 + }, + { + "epoch": 0.7613957488634967, + "grad_norm": 1.236131186143729, + "learning_rate": 4.908976490361316e-06, + "loss": 0.6585, + "step": 6197 + }, + { + "epoch": 0.7615186140803538, + "grad_norm": 1.3188430926487582, + "learning_rate": 4.904217528683327e-06, + "loss": 0.6059, + "step": 6198 + }, + { + "epoch": 0.7616414792972109, + "grad_norm": 1.2367378743342927, + "learning_rate": 4.899460424059056e-06, + "loss": 0.5243, + "step": 6199 + }, + { + "epoch": 0.761764344514068, + "grad_norm": 1.4464975492447327, + "learning_rate": 4.894705177363523e-06, + "loss": 0.6234, + "step": 6200 + }, + { + "epoch": 0.7618872097309252, + "grad_norm": 
1.3710133008679388, + "learning_rate": 4.88995178947145e-06, + "loss": 0.5522, + "step": 6201 + }, + { + "epoch": 0.7620100749477823, + "grad_norm": 1.5029693734151188, + "learning_rate": 4.885200261257172e-06, + "loss": 0.5972, + "step": 6202 + }, + { + "epoch": 0.7621329401646394, + "grad_norm": 1.3219938948833083, + "learning_rate": 4.880450593594717e-06, + "loss": 0.615, + "step": 6203 + }, + { + "epoch": 0.7622558053814965, + "grad_norm": 1.2337912491301737, + "learning_rate": 4.87570278735774e-06, + "loss": 0.5287, + "step": 6204 + }, + { + "epoch": 0.7623786705983536, + "grad_norm": 1.175639188728221, + "learning_rate": 4.870956843419579e-06, + "loss": 0.5069, + "step": 6205 + }, + { + "epoch": 0.7625015358152107, + "grad_norm": 1.2774523654752785, + "learning_rate": 4.866212762653221e-06, + "loss": 0.5115, + "step": 6206 + }, + { + "epoch": 0.7626244010320679, + "grad_norm": 1.400148555438113, + "learning_rate": 4.861470545931302e-06, + "loss": 0.5766, + "step": 6207 + }, + { + "epoch": 0.762747266248925, + "grad_norm": 1.0746691885530342, + "learning_rate": 4.856730194126124e-06, + "loss": 0.5796, + "step": 6208 + }, + { + "epoch": 0.7628701314657821, + "grad_norm": 1.3226997939066472, + "learning_rate": 4.851991708109646e-06, + "loss": 0.5499, + "step": 6209 + }, + { + "epoch": 0.7629929966826391, + "grad_norm": 1.3808453118503121, + "learning_rate": 4.8472550887534865e-06, + "loss": 0.5127, + "step": 6210 + }, + { + "epoch": 0.7631158618994962, + "grad_norm": 1.2170373017741853, + "learning_rate": 4.842520336928904e-06, + "loss": 0.7245, + "step": 6211 + }, + { + "epoch": 0.7632387271163533, + "grad_norm": 1.179310433568015, + "learning_rate": 4.837787453506833e-06, + "loss": 0.6094, + "step": 6212 + }, + { + "epoch": 0.7633615923332104, + "grad_norm": 1.2921501332671048, + "learning_rate": 4.83305643935786e-06, + "loss": 0.598, + "step": 6213 + }, + { + "epoch": 0.7634844575500676, + "grad_norm": 1.1203097315269281, + "learning_rate": 4.828327295352217e-06, + "loss": 0.5371, + "step": 6214 + }, + { + "epoch": 0.7636073227669247, + "grad_norm": 1.2654813508610638, + "learning_rate": 4.8236000223598045e-06, + "loss": 0.4966, + "step": 6215 + }, + { + "epoch": 0.7637301879837818, + "grad_norm": 1.1691843620722076, + "learning_rate": 4.8188746212501634e-06, + "loss": 0.5401, + "step": 6216 + }, + { + "epoch": 0.7638530532006389, + "grad_norm": 1.4363930082260319, + "learning_rate": 4.814151092892518e-06, + "loss": 0.6883, + "step": 6217 + }, + { + "epoch": 0.763975918417496, + "grad_norm": 1.069100891001439, + "learning_rate": 4.809429438155717e-06, + "loss": 0.5987, + "step": 6218 + }, + { + "epoch": 0.7640987836343531, + "grad_norm": 1.234878531666306, + "learning_rate": 4.804709657908283e-06, + "loss": 0.6507, + "step": 6219 + }, + { + "epoch": 0.7642216488512102, + "grad_norm": 1.4696871933726865, + "learning_rate": 4.799991753018393e-06, + "loss": 0.6745, + "step": 6220 + }, + { + "epoch": 0.7643445140680674, + "grad_norm": 1.1252327997901903, + "learning_rate": 4.795275724353867e-06, + "loss": 0.6441, + "step": 6221 + }, + { + "epoch": 0.7644673792849245, + "grad_norm": 1.4135921864366447, + "learning_rate": 4.790561572782192e-06, + "loss": 0.68, + "step": 6222 + }, + { + "epoch": 0.7645902445017816, + "grad_norm": 1.3494369101663166, + "learning_rate": 4.785849299170502e-06, + "loss": 0.7376, + "step": 6223 + }, + { + "epoch": 0.7647131097186387, + "grad_norm": 1.175910356601133, + "learning_rate": 4.7811389043856e-06, + "loss": 0.5506, + "step": 6224 + }, + { + "epoch": 
0.7648359749354957, + "grad_norm": 1.3267349188180304, + "learning_rate": 4.776430389293919e-06, + "loss": 0.5065, + "step": 6225 + }, + { + "epoch": 0.7649588401523528, + "grad_norm": 1.3451028909304565, + "learning_rate": 4.77172375476157e-06, + "loss": 0.623, + "step": 6226 + }, + { + "epoch": 0.7650817053692099, + "grad_norm": 1.183749142953873, + "learning_rate": 4.767019001654295e-06, + "loss": 0.6685, + "step": 6227 + }, + { + "epoch": 0.7652045705860671, + "grad_norm": 1.1141377490944588, + "learning_rate": 4.762316130837522e-06, + "loss": 0.5884, + "step": 6228 + }, + { + "epoch": 0.7653274358029242, + "grad_norm": 1.2140686364616433, + "learning_rate": 4.757615143176296e-06, + "loss": 0.6978, + "step": 6229 + }, + { + "epoch": 0.7654503010197813, + "grad_norm": 1.2692802896492696, + "learning_rate": 4.752916039535345e-06, + "loss": 0.4974, + "step": 6230 + }, + { + "epoch": 0.7655731662366384, + "grad_norm": 1.1548387817101293, + "learning_rate": 4.74821882077903e-06, + "loss": 0.4873, + "step": 6231 + }, + { + "epoch": 0.7656960314534955, + "grad_norm": 1.2511501276402661, + "learning_rate": 4.743523487771378e-06, + "loss": 0.4687, + "step": 6232 + }, + { + "epoch": 0.7658188966703526, + "grad_norm": 1.2041355973355108, + "learning_rate": 4.73883004137607e-06, + "loss": 0.4655, + "step": 6233 + }, + { + "epoch": 0.7659417618872097, + "grad_norm": 1.2371555197407673, + "learning_rate": 4.7341384824564235e-06, + "loss": 0.6041, + "step": 6234 + }, + { + "epoch": 0.7660646271040669, + "grad_norm": 1.2287600718545706, + "learning_rate": 4.729448811875428e-06, + "loss": 0.5772, + "step": 6235 + }, + { + "epoch": 0.766187492320924, + "grad_norm": 1.2173094081965368, + "learning_rate": 4.724761030495716e-06, + "loss": 0.6274, + "step": 6236 + }, + { + "epoch": 0.7663103575377811, + "grad_norm": 1.5665707781841818, + "learning_rate": 4.72007513917958e-06, + "loss": 0.5051, + "step": 6237 + }, + { + "epoch": 0.7664332227546382, + "grad_norm": 1.2592965735061168, + "learning_rate": 4.71539113878895e-06, + "loss": 0.5514, + "step": 6238 + }, + { + "epoch": 0.7665560879714952, + "grad_norm": 1.2057976180370376, + "learning_rate": 4.710709030185422e-06, + "loss": 0.505, + "step": 6239 + }, + { + "epoch": 0.7666789531883523, + "grad_norm": 0.9619849404331129, + "learning_rate": 4.706028814230245e-06, + "loss": 0.5444, + "step": 6240 + }, + { + "epoch": 0.7668018184052094, + "grad_norm": 1.0867129792128223, + "learning_rate": 4.701350491784302e-06, + "loss": 0.5499, + "step": 6241 + }, + { + "epoch": 0.7669246836220666, + "grad_norm": 1.0849058589879117, + "learning_rate": 4.696674063708148e-06, + "loss": 0.6069, + "step": 6242 + }, + { + "epoch": 0.7670475488389237, + "grad_norm": 1.213358131736252, + "learning_rate": 4.691999530861981e-06, + "loss": 0.5266, + "step": 6243 + }, + { + "epoch": 0.7671704140557808, + "grad_norm": 1.7185450092412362, + "learning_rate": 4.687326894105657e-06, + "loss": 0.688, + "step": 6244 + }, + { + "epoch": 0.7672932792726379, + "grad_norm": 1.0864161310553995, + "learning_rate": 4.682656154298662e-06, + "loss": 0.6191, + "step": 6245 + }, + { + "epoch": 0.767416144489495, + "grad_norm": 1.2538813087375211, + "learning_rate": 4.67798731230016e-06, + "loss": 0.553, + "step": 6246 + }, + { + "epoch": 0.7675390097063521, + "grad_norm": 1.2481555904935009, + "learning_rate": 4.673320368968951e-06, + "loss": 0.6408, + "step": 6247 + }, + { + "epoch": 0.7676618749232093, + "grad_norm": 1.2628510458930893, + "learning_rate": 4.668655325163493e-06, + "loss": 0.5718, + 
"step": 6248 + }, + { + "epoch": 0.7677847401400664, + "grad_norm": 1.4523061637155512, + "learning_rate": 4.663992181741883e-06, + "loss": 0.4845, + "step": 6249 + }, + { + "epoch": 0.7679076053569235, + "grad_norm": 1.0683473972235542, + "learning_rate": 4.659330939561879e-06, + "loss": 0.5216, + "step": 6250 + }, + { + "epoch": 0.7680304705737806, + "grad_norm": 1.0445964178561986, + "learning_rate": 4.654671599480893e-06, + "loss": 0.5919, + "step": 6251 + }, + { + "epoch": 0.7681533357906377, + "grad_norm": 1.1016632752617819, + "learning_rate": 4.650014162355969e-06, + "loss": 0.5199, + "step": 6252 + }, + { + "epoch": 0.7682762010074948, + "grad_norm": 1.5814117651379662, + "learning_rate": 4.6453586290438214e-06, + "loss": 0.6055, + "step": 6253 + }, + { + "epoch": 0.7683990662243518, + "grad_norm": 1.133664177329624, + "learning_rate": 4.640705000400795e-06, + "loss": 0.6018, + "step": 6254 + }, + { + "epoch": 0.768521931441209, + "grad_norm": 1.0327133270464934, + "learning_rate": 4.636053277282909e-06, + "loss": 0.6042, + "step": 6255 + }, + { + "epoch": 0.7686447966580661, + "grad_norm": 1.1111313158176601, + "learning_rate": 4.631403460545806e-06, + "loss": 0.4757, + "step": 6256 + }, + { + "epoch": 0.7687676618749232, + "grad_norm": 1.0784030098250352, + "learning_rate": 4.626755551044798e-06, + "loss": 0.5982, + "step": 6257 + }, + { + "epoch": 0.7688905270917803, + "grad_norm": 1.152555130728522, + "learning_rate": 4.622109549634829e-06, + "loss": 0.503, + "step": 6258 + }, + { + "epoch": 0.7690133923086374, + "grad_norm": 1.141108486730237, + "learning_rate": 4.617465457170504e-06, + "loss": 0.5294, + "step": 6259 + }, + { + "epoch": 0.7691362575254945, + "grad_norm": 1.1776200238177257, + "learning_rate": 4.6128232745060815e-06, + "loss": 0.5346, + "step": 6260 + }, + { + "epoch": 0.7692591227423516, + "grad_norm": 1.0542351619729198, + "learning_rate": 4.608183002495445e-06, + "loss": 0.5826, + "step": 6261 + }, + { + "epoch": 0.7693819879592088, + "grad_norm": 1.1949258155079843, + "learning_rate": 4.603544641992161e-06, + "loss": 0.5741, + "step": 6262 + }, + { + "epoch": 0.7695048531760659, + "grad_norm": 1.090343224994622, + "learning_rate": 4.598908193849412e-06, + "loss": 0.517, + "step": 6263 + }, + { + "epoch": 0.769627718392923, + "grad_norm": 1.3212678407968577, + "learning_rate": 4.594273658920052e-06, + "loss": 0.4831, + "step": 6264 + }, + { + "epoch": 0.7697505836097801, + "grad_norm": 1.297364160759807, + "learning_rate": 4.58964103805656e-06, + "loss": 0.5932, + "step": 6265 + }, + { + "epoch": 0.7698734488266372, + "grad_norm": 1.2194681166402321, + "learning_rate": 4.585010332111093e-06, + "loss": 0.518, + "step": 6266 + }, + { + "epoch": 0.7699963140434943, + "grad_norm": 1.3243342586449698, + "learning_rate": 4.580381541935429e-06, + "loss": 0.6362, + "step": 6267 + }, + { + "epoch": 0.7701191792603514, + "grad_norm": 1.4170735812436925, + "learning_rate": 4.575754668381011e-06, + "loss": 0.5805, + "step": 6268 + }, + { + "epoch": 0.7702420444772085, + "grad_norm": 1.0651749335276803, + "learning_rate": 4.571129712298913e-06, + "loss": 0.5212, + "step": 6269 + }, + { + "epoch": 0.7703649096940656, + "grad_norm": 1.707586176196874, + "learning_rate": 4.5665066745398705e-06, + "loss": 0.5746, + "step": 6270 + }, + { + "epoch": 0.7704877749109227, + "grad_norm": 1.1164525809246417, + "learning_rate": 4.561885555954269e-06, + "loss": 0.5952, + "step": 6271 + }, + { + "epoch": 0.7706106401277798, + "grad_norm": 1.2462006244562889, + "learning_rate": 
4.557266357392119e-06, + "loss": 0.5665, + "step": 6272 + }, + { + "epoch": 0.7707335053446369, + "grad_norm": 1.2779997039016302, + "learning_rate": 4.552649079703099e-06, + "loss": 0.584, + "step": 6273 + }, + { + "epoch": 0.770856370561494, + "grad_norm": 1.4478910558602702, + "learning_rate": 4.548033723736527e-06, + "loss": 0.5768, + "step": 6274 + }, + { + "epoch": 0.7709792357783511, + "grad_norm": 1.4396267369227167, + "learning_rate": 4.543420290341374e-06, + "loss": 0.6062, + "step": 6275 + }, + { + "epoch": 0.7711021009952083, + "grad_norm": 1.1691108865823459, + "learning_rate": 4.538808780366239e-06, + "loss": 0.5635, + "step": 6276 + }, + { + "epoch": 0.7712249662120654, + "grad_norm": 1.7962300711292578, + "learning_rate": 4.534199194659387e-06, + "loss": 0.6638, + "step": 6277 + }, + { + "epoch": 0.7713478314289225, + "grad_norm": 1.115311339490574, + "learning_rate": 4.5295915340687255e-06, + "loss": 0.7471, + "step": 6278 + }, + { + "epoch": 0.7714706966457796, + "grad_norm": 1.2300007070444725, + "learning_rate": 4.524985799441792e-06, + "loss": 0.6419, + "step": 6279 + }, + { + "epoch": 0.7715935618626367, + "grad_norm": 1.5551230809595364, + "learning_rate": 4.520381991625794e-06, + "loss": 0.6703, + "step": 6280 + }, + { + "epoch": 0.7717164270794938, + "grad_norm": 1.5281624205583717, + "learning_rate": 4.515780111467555e-06, + "loss": 0.6306, + "step": 6281 + }, + { + "epoch": 0.771839292296351, + "grad_norm": 1.321248570268165, + "learning_rate": 4.511180159813582e-06, + "loss": 0.5789, + "step": 6282 + }, + { + "epoch": 0.771962157513208, + "grad_norm": 1.2776144186638512, + "learning_rate": 4.506582137509992e-06, + "loss": 0.62, + "step": 6283 + }, + { + "epoch": 0.7720850227300651, + "grad_norm": 1.1343581520592374, + "learning_rate": 4.501986045402565e-06, + "loss": 0.5413, + "step": 6284 + }, + { + "epoch": 0.7722078879469222, + "grad_norm": 1.2800598931571545, + "learning_rate": 4.497391884336722e-06, + "loss": 0.6004, + "step": 6285 + }, + { + "epoch": 0.7723307531637793, + "grad_norm": 1.228349872726117, + "learning_rate": 4.492799655157538e-06, + "loss": 0.6428, + "step": 6286 + }, + { + "epoch": 0.7724536183806364, + "grad_norm": 1.0588536903270813, + "learning_rate": 4.488209358709708e-06, + "loss": 0.4612, + "step": 6287 + }, + { + "epoch": 0.7725764835974935, + "grad_norm": 1.3225315427993296, + "learning_rate": 4.483620995837597e-06, + "loss": 0.623, + "step": 6288 + }, + { + "epoch": 0.7726993488143507, + "grad_norm": 1.3575675162472496, + "learning_rate": 4.4790345673852055e-06, + "loss": 0.5715, + "step": 6289 + }, + { + "epoch": 0.7728222140312078, + "grad_norm": 1.3992061602907506, + "learning_rate": 4.474450074196171e-06, + "loss": 0.5579, + "step": 6290 + }, + { + "epoch": 0.7729450792480649, + "grad_norm": 1.5290432796897608, + "learning_rate": 4.4698675171137895e-06, + "loss": 0.6415, + "step": 6291 + }, + { + "epoch": 0.773067944464922, + "grad_norm": 1.3211549144070112, + "learning_rate": 4.465286896980979e-06, + "loss": 0.5601, + "step": 6292 + }, + { + "epoch": 0.7731908096817791, + "grad_norm": 1.2204652421168538, + "learning_rate": 4.460708214640331e-06, + "loss": 0.5406, + "step": 6293 + }, + { + "epoch": 0.7733136748986362, + "grad_norm": 1.475594296265917, + "learning_rate": 4.456131470934053e-06, + "loss": 0.623, + "step": 6294 + }, + { + "epoch": 0.7734365401154933, + "grad_norm": 1.4330346514116319, + "learning_rate": 4.451556666704018e-06, + "loss": 0.6226, + "step": 6295 + }, + { + "epoch": 0.7735594053323505, + "grad_norm": 
1.3036524714302877, + "learning_rate": 4.44698380279172e-06, + "loss": 0.6437, + "step": 6296 + }, + { + "epoch": 0.7736822705492076, + "grad_norm": 1.2009381532581267, + "learning_rate": 4.442412880038312e-06, + "loss": 0.632, + "step": 6297 + }, + { + "epoch": 0.7738051357660646, + "grad_norm": 1.1721925096919241, + "learning_rate": 4.437843899284592e-06, + "loss": 0.6381, + "step": 6298 + }, + { + "epoch": 0.7739280009829217, + "grad_norm": 0.8816396309665469, + "learning_rate": 4.433276861370984e-06, + "loss": 0.5699, + "step": 6299 + }, + { + "epoch": 0.7740508661997788, + "grad_norm": 1.096711218461627, + "learning_rate": 4.428711767137568e-06, + "loss": 0.5081, + "step": 6300 + }, + { + "epoch": 0.7741737314166359, + "grad_norm": 1.187224521518917, + "learning_rate": 4.424148617424066e-06, + "loss": 0.6486, + "step": 6301 + }, + { + "epoch": 0.774296596633493, + "grad_norm": 1.3437645508284541, + "learning_rate": 4.4195874130698455e-06, + "loss": 0.5115, + "step": 6302 + }, + { + "epoch": 0.7744194618503502, + "grad_norm": 1.2812004063804139, + "learning_rate": 4.415028154913892e-06, + "loss": 0.607, + "step": 6303 + }, + { + "epoch": 0.7745423270672073, + "grad_norm": 1.2460332617770993, + "learning_rate": 4.410470843794876e-06, + "loss": 0.5748, + "step": 6304 + }, + { + "epoch": 0.7746651922840644, + "grad_norm": 1.161534117412885, + "learning_rate": 4.405915480551065e-06, + "loss": 0.5862, + "step": 6305 + }, + { + "epoch": 0.7747880575009215, + "grad_norm": 1.169254750086548, + "learning_rate": 4.401362066020402e-06, + "loss": 0.6463, + "step": 6306 + }, + { + "epoch": 0.7749109227177786, + "grad_norm": 1.2416404815848552, + "learning_rate": 4.396810601040448e-06, + "loss": 0.4814, + "step": 6307 + }, + { + "epoch": 0.7750337879346357, + "grad_norm": 1.2811906028072304, + "learning_rate": 4.39226108644842e-06, + "loss": 0.6075, + "step": 6308 + }, + { + "epoch": 0.7751566531514928, + "grad_norm": 1.367855909537754, + "learning_rate": 4.387713523081176e-06, + "loss": 0.5333, + "step": 6309 + }, + { + "epoch": 0.77527951836835, + "grad_norm": 1.1571307652975766, + "learning_rate": 4.383167911775201e-06, + "loss": 0.6512, + "step": 6310 + }, + { + "epoch": 0.7754023835852071, + "grad_norm": 1.2752795849758816, + "learning_rate": 4.378624253366636e-06, + "loss": 0.5576, + "step": 6311 + }, + { + "epoch": 0.7755252488020641, + "grad_norm": 1.1593394072036474, + "learning_rate": 4.3740825486912585e-06, + "loss": 0.567, + "step": 6312 + }, + { + "epoch": 0.7756481140189212, + "grad_norm": 1.388428378866682, + "learning_rate": 4.36954279858449e-06, + "loss": 0.564, + "step": 6313 + }, + { + "epoch": 0.7757709792357783, + "grad_norm": 1.1445288971920875, + "learning_rate": 4.365005003881377e-06, + "loss": 0.476, + "step": 6314 + }, + { + "epoch": 0.7758938444526354, + "grad_norm": 1.2019430415626244, + "learning_rate": 4.360469165416623e-06, + "loss": 0.642, + "step": 6315 + }, + { + "epoch": 0.7760167096694925, + "grad_norm": 1.4794268971423072, + "learning_rate": 4.355935284024571e-06, + "loss": 0.4888, + "step": 6316 + }, + { + "epoch": 0.7761395748863497, + "grad_norm": 1.176335318534209, + "learning_rate": 4.35140336053919e-06, + "loss": 0.6049, + "step": 6317 + }, + { + "epoch": 0.7762624401032068, + "grad_norm": 1.494392420033212, + "learning_rate": 4.346873395794107e-06, + "loss": 0.6341, + "step": 6318 + }, + { + "epoch": 0.7763853053200639, + "grad_norm": 1.1841494859069324, + "learning_rate": 4.342345390622564e-06, + "loss": 0.4877, + "step": 6319 + }, + { + "epoch": 
0.776508170536921, + "grad_norm": 1.3489284927829526, + "learning_rate": 4.33781934585748e-06, + "loss": 0.5334, + "step": 6320 + }, + { + "epoch": 0.7766310357537781, + "grad_norm": 1.4090251268799716, + "learning_rate": 4.333295262331375e-06, + "loss": 0.5623, + "step": 6321 + }, + { + "epoch": 0.7767539009706352, + "grad_norm": 1.1891350081756187, + "learning_rate": 4.328773140876436e-06, + "loss": 0.5196, + "step": 6322 + }, + { + "epoch": 0.7768767661874924, + "grad_norm": 1.4505951136339352, + "learning_rate": 4.324252982324465e-06, + "loss": 0.5824, + "step": 6323 + }, + { + "epoch": 0.7769996314043495, + "grad_norm": 1.338649143849978, + "learning_rate": 4.3197347875069285e-06, + "loss": 0.5553, + "step": 6324 + }, + { + "epoch": 0.7771224966212066, + "grad_norm": 1.1185256708797666, + "learning_rate": 4.315218557254912e-06, + "loss": 0.5589, + "step": 6325 + }, + { + "epoch": 0.7772453618380637, + "grad_norm": 1.1474253076566034, + "learning_rate": 4.310704292399147e-06, + "loss": 0.5581, + "step": 6326 + }, + { + "epoch": 0.7773682270549207, + "grad_norm": 1.089106132635584, + "learning_rate": 4.306191993770011e-06, + "loss": 0.5871, + "step": 6327 + }, + { + "epoch": 0.7774910922717778, + "grad_norm": 1.1188221277347377, + "learning_rate": 4.3016816621975006e-06, + "loss": 0.5219, + "step": 6328 + }, + { + "epoch": 0.7776139574886349, + "grad_norm": 1.117543283510135, + "learning_rate": 4.297173298511273e-06, + "loss": 0.5379, + "step": 6329 + }, + { + "epoch": 0.777736822705492, + "grad_norm": 1.3813160165405014, + "learning_rate": 4.292666903540597e-06, + "loss": 0.6394, + "step": 6330 + }, + { + "epoch": 0.7778596879223492, + "grad_norm": 1.0893415296594056, + "learning_rate": 4.288162478114413e-06, + "loss": 0.5567, + "step": 6331 + }, + { + "epoch": 0.7779825531392063, + "grad_norm": 0.8923941716615689, + "learning_rate": 4.283660023061268e-06, + "loss": 0.4933, + "step": 6332 + }, + { + "epoch": 0.7781054183560634, + "grad_norm": 1.1712365771903017, + "learning_rate": 4.27915953920937e-06, + "loss": 0.4574, + "step": 6333 + }, + { + "epoch": 0.7782282835729205, + "grad_norm": 1.2451441924504392, + "learning_rate": 4.274661027386542e-06, + "loss": 0.5874, + "step": 6334 + }, + { + "epoch": 0.7783511487897776, + "grad_norm": 0.9506620956511708, + "learning_rate": 4.270164488420262e-06, + "loss": 0.6527, + "step": 6335 + }, + { + "epoch": 0.7784740140066347, + "grad_norm": 1.1217161997432044, + "learning_rate": 4.265669923137642e-06, + "loss": 0.5478, + "step": 6336 + }, + { + "epoch": 0.7785968792234919, + "grad_norm": 1.2982192293674244, + "learning_rate": 4.261177332365422e-06, + "loss": 0.6504, + "step": 6337 + }, + { + "epoch": 0.778719744440349, + "grad_norm": 1.2673951282099476, + "learning_rate": 4.256686716929989e-06, + "loss": 0.4959, + "step": 6338 + }, + { + "epoch": 0.7788426096572061, + "grad_norm": 1.8439238103805369, + "learning_rate": 4.25219807765736e-06, + "loss": 0.6752, + "step": 6339 + }, + { + "epoch": 0.7789654748740632, + "grad_norm": 1.0034257929036368, + "learning_rate": 4.247711415373198e-06, + "loss": 0.5738, + "step": 6340 + }, + { + "epoch": 0.7790883400909202, + "grad_norm": 1.282751498064715, + "learning_rate": 4.243226730902785e-06, + "loss": 0.6502, + "step": 6341 + }, + { + "epoch": 0.7792112053077773, + "grad_norm": 1.1988018394083266, + "learning_rate": 4.238744025071055e-06, + "loss": 0.5843, + "step": 6342 + }, + { + "epoch": 0.7793340705246344, + "grad_norm": 1.1466223617884441, + "learning_rate": 4.234263298702576e-06, + "loss": 
0.6241, + "step": 6343 + }, + { + "epoch": 0.7794569357414916, + "grad_norm": 1.3020636761124067, + "learning_rate": 4.229784552621541e-06, + "loss": 0.6531, + "step": 6344 + }, + { + "epoch": 0.7795798009583487, + "grad_norm": 1.195616737250621, + "learning_rate": 4.2253077876517914e-06, + "loss": 0.5644, + "step": 6345 + }, + { + "epoch": 0.7797026661752058, + "grad_norm": 1.191497457784127, + "learning_rate": 4.220833004616796e-06, + "loss": 0.5805, + "step": 6346 + }, + { + "epoch": 0.7798255313920629, + "grad_norm": 1.0790726158480264, + "learning_rate": 4.2163602043396696e-06, + "loss": 0.63, + "step": 6347 + }, + { + "epoch": 0.77994839660892, + "grad_norm": 1.2802460160323088, + "learning_rate": 4.211889387643145e-06, + "loss": 0.6217, + "step": 6348 + }, + { + "epoch": 0.7800712618257771, + "grad_norm": 1.102591487733063, + "learning_rate": 4.207420555349603e-06, + "loss": 0.5816, + "step": 6349 + }, + { + "epoch": 0.7801941270426342, + "grad_norm": 1.4522210464894152, + "learning_rate": 4.202953708281059e-06, + "loss": 0.599, + "step": 6350 + }, + { + "epoch": 0.7803169922594914, + "grad_norm": 1.4264709161255151, + "learning_rate": 4.198488847259163e-06, + "loss": 0.6184, + "step": 6351 + }, + { + "epoch": 0.7804398574763485, + "grad_norm": 1.063731599709147, + "learning_rate": 4.19402597310519e-06, + "loss": 0.4693, + "step": 6352 + }, + { + "epoch": 0.7805627226932056, + "grad_norm": 1.0267937233786617, + "learning_rate": 4.189565086640057e-06, + "loss": 0.5871, + "step": 6353 + }, + { + "epoch": 0.7806855879100627, + "grad_norm": 1.3485291621133302, + "learning_rate": 4.185106188684325e-06, + "loss": 0.5714, + "step": 6354 + }, + { + "epoch": 0.7808084531269198, + "grad_norm": 1.179742233015615, + "learning_rate": 4.180649280058168e-06, + "loss": 0.551, + "step": 6355 + }, + { + "epoch": 0.7809313183437768, + "grad_norm": 2.4295551956194257, + "learning_rate": 4.176194361581414e-06, + "loss": 0.7863, + "step": 6356 + }, + { + "epoch": 0.7810541835606339, + "grad_norm": 1.1464936677641009, + "learning_rate": 4.1717414340735025e-06, + "loss": 0.4728, + "step": 6357 + }, + { + "epoch": 0.7811770487774911, + "grad_norm": 1.6943456124160596, + "learning_rate": 4.167290498353541e-06, + "loss": 0.6278, + "step": 6358 + }, + { + "epoch": 0.7812999139943482, + "grad_norm": 1.482910948166948, + "learning_rate": 4.162841555240234e-06, + "loss": 0.472, + "step": 6359 + }, + { + "epoch": 0.7814227792112053, + "grad_norm": 1.2982297333127846, + "learning_rate": 4.158394605551946e-06, + "loss": 0.6667, + "step": 6360 + }, + { + "epoch": 0.7815456444280624, + "grad_norm": 1.0813169674145438, + "learning_rate": 4.153949650106658e-06, + "loss": 0.5995, + "step": 6361 + }, + { + "epoch": 0.7816685096449195, + "grad_norm": 1.1032079273278157, + "learning_rate": 4.149506689721989e-06, + "loss": 0.5762, + "step": 6362 + }, + { + "epoch": 0.7817913748617766, + "grad_norm": 1.267656571558536, + "learning_rate": 4.1450657252152035e-06, + "loss": 0.5645, + "step": 6363 + }, + { + "epoch": 0.7819142400786337, + "grad_norm": 1.229325390236225, + "learning_rate": 4.140626757403176e-06, + "loss": 0.4978, + "step": 6364 + }, + { + "epoch": 0.7820371052954909, + "grad_norm": 1.2457260499198592, + "learning_rate": 4.1361897871024315e-06, + "loss": 0.5622, + "step": 6365 + }, + { + "epoch": 0.782159970512348, + "grad_norm": 1.2108514941095834, + "learning_rate": 4.13175481512912e-06, + "loss": 0.5225, + "step": 6366 + }, + { + "epoch": 0.7822828357292051, + "grad_norm": 1.1185234792134309, + "learning_rate": 
4.127321842299034e-06, + "loss": 0.6205, + "step": 6367 + }, + { + "epoch": 0.7824057009460622, + "grad_norm": 1.309468851924351, + "learning_rate": 4.122890869427572e-06, + "loss": 0.5279, + "step": 6368 + }, + { + "epoch": 0.7825285661629193, + "grad_norm": 1.3403369336864561, + "learning_rate": 4.118461897329804e-06, + "loss": 0.5178, + "step": 6369 + }, + { + "epoch": 0.7826514313797763, + "grad_norm": 1.1035733711454436, + "learning_rate": 4.114034926820396e-06, + "loss": 0.5318, + "step": 6370 + }, + { + "epoch": 0.7827742965966334, + "grad_norm": 1.3898582923800196, + "learning_rate": 4.10960995871367e-06, + "loss": 0.5926, + "step": 6371 + }, + { + "epoch": 0.7828971618134906, + "grad_norm": 1.173248597209357, + "learning_rate": 4.10518699382356e-06, + "loss": 0.6342, + "step": 6372 + }, + { + "epoch": 0.7830200270303477, + "grad_norm": 1.1123426866405737, + "learning_rate": 4.1007660329636484e-06, + "loss": 0.5247, + "step": 6373 + }, + { + "epoch": 0.7831428922472048, + "grad_norm": 1.031864611776161, + "learning_rate": 4.096347076947145e-06, + "loss": 0.5623, + "step": 6374 + }, + { + "epoch": 0.7832657574640619, + "grad_norm": 1.1399836391964697, + "learning_rate": 4.091930126586879e-06, + "loss": 0.5731, + "step": 6375 + }, + { + "epoch": 0.783388622680919, + "grad_norm": 1.0851796072308377, + "learning_rate": 4.087515182695326e-06, + "loss": 0.5729, + "step": 6376 + }, + { + "epoch": 0.7835114878977761, + "grad_norm": 1.2949402561003092, + "learning_rate": 4.083102246084584e-06, + "loss": 0.6382, + "step": 6377 + }, + { + "epoch": 0.7836343531146333, + "grad_norm": 1.2171780648030777, + "learning_rate": 4.078691317566392e-06, + "loss": 0.6878, + "step": 6378 + }, + { + "epoch": 0.7837572183314904, + "grad_norm": 1.1558877417037818, + "learning_rate": 4.074282397952097e-06, + "loss": 0.6211, + "step": 6379 + }, + { + "epoch": 0.7838800835483475, + "grad_norm": 1.0627479289185289, + "learning_rate": 4.069875488052702e-06, + "loss": 0.5469, + "step": 6380 + }, + { + "epoch": 0.7840029487652046, + "grad_norm": 1.1780304198041518, + "learning_rate": 4.06547058867883e-06, + "loss": 0.5747, + "step": 6381 + }, + { + "epoch": 0.7841258139820617, + "grad_norm": 1.1747232771691627, + "learning_rate": 4.061067700640726e-06, + "loss": 0.5794, + "step": 6382 + }, + { + "epoch": 0.7842486791989188, + "grad_norm": 1.2538153459748345, + "learning_rate": 4.056666824748282e-06, + "loss": 0.5801, + "step": 6383 + }, + { + "epoch": 0.784371544415776, + "grad_norm": 1.153206407016388, + "learning_rate": 4.052267961810995e-06, + "loss": 0.6959, + "step": 6384 + }, + { + "epoch": 0.784494409632633, + "grad_norm": 1.203354041344826, + "learning_rate": 4.047871112638029e-06, + "loss": 0.5536, + "step": 6385 + }, + { + "epoch": 0.7846172748494901, + "grad_norm": 1.0981025476444854, + "learning_rate": 4.043476278038139e-06, + "loss": 0.5449, + "step": 6386 + }, + { + "epoch": 0.7847401400663472, + "grad_norm": 1.179212524859334, + "learning_rate": 4.039083458819736e-06, + "loss": 0.5676, + "step": 6387 + }, + { + "epoch": 0.7848630052832043, + "grad_norm": 1.2657799036720427, + "learning_rate": 4.034692655790839e-06, + "loss": 0.5442, + "step": 6388 + }, + { + "epoch": 0.7849858705000614, + "grad_norm": 1.3260802507243543, + "learning_rate": 4.030303869759124e-06, + "loss": 0.6277, + "step": 6389 + }, + { + "epoch": 0.7851087357169185, + "grad_norm": 1.0487944038845685, + "learning_rate": 4.025917101531866e-06, + "loss": 0.5743, + "step": 6390 + }, + { + "epoch": 0.7852316009337756, + "grad_norm": 
1.0636905915856605, + "learning_rate": 4.0215323519159896e-06, + "loss": 0.4998, + "step": 6391 + }, + { + "epoch": 0.7853544661506328, + "grad_norm": 1.4193590246917769, + "learning_rate": 4.017149621718043e-06, + "loss": 0.5759, + "step": 6392 + }, + { + "epoch": 0.7854773313674899, + "grad_norm": 1.368373316466973, + "learning_rate": 4.012768911744192e-06, + "loss": 0.548, + "step": 6393 + }, + { + "epoch": 0.785600196584347, + "grad_norm": 1.067448448843834, + "learning_rate": 4.0083902228002495e-06, + "loss": 0.5173, + "step": 6394 + }, + { + "epoch": 0.7857230618012041, + "grad_norm": 1.0718766898679024, + "learning_rate": 4.004013555691633e-06, + "loss": 0.5448, + "step": 6395 + }, + { + "epoch": 0.7858459270180612, + "grad_norm": 1.0818973722182088, + "learning_rate": 3.999638911223422e-06, + "loss": 0.671, + "step": 6396 + }, + { + "epoch": 0.7859687922349183, + "grad_norm": 1.1303400345549652, + "learning_rate": 3.9952662902002886e-06, + "loss": 0.599, + "step": 6397 + }, + { + "epoch": 0.7860916574517754, + "grad_norm": 1.1877748364497664, + "learning_rate": 3.990895693426557e-06, + "loss": 0.704, + "step": 6398 + }, + { + "epoch": 0.7862145226686326, + "grad_norm": 1.3437048796530693, + "learning_rate": 3.98652712170616e-06, + "loss": 0.5602, + "step": 6399 + }, + { + "epoch": 0.7863373878854896, + "grad_norm": 1.1595183508342337, + "learning_rate": 3.982160575842675e-06, + "loss": 0.581, + "step": 6400 + }, + { + "epoch": 0.7864602531023467, + "grad_norm": 1.287196411361099, + "learning_rate": 3.977796056639304e-06, + "loss": 0.4728, + "step": 6401 + }, + { + "epoch": 0.7865831183192038, + "grad_norm": 1.3882436086320296, + "learning_rate": 3.973433564898863e-06, + "loss": 0.6688, + "step": 6402 + }, + { + "epoch": 0.7867059835360609, + "grad_norm": 0.9757612698364961, + "learning_rate": 3.9690731014238066e-06, + "loss": 0.573, + "step": 6403 + }, + { + "epoch": 0.786828848752918, + "grad_norm": 1.1751548173421602, + "learning_rate": 3.964714667016216e-06, + "loss": 0.5683, + "step": 6404 + }, + { + "epoch": 0.7869517139697751, + "grad_norm": 1.1070720775167464, + "learning_rate": 3.960358262477801e-06, + "loss": 0.6687, + "step": 6405 + }, + { + "epoch": 0.7870745791866323, + "grad_norm": 1.6455586454867985, + "learning_rate": 3.956003888609883e-06, + "loss": 0.5319, + "step": 6406 + }, + { + "epoch": 0.7871974444034894, + "grad_norm": 1.0312317327339884, + "learning_rate": 3.951651546213428e-06, + "loss": 0.6023, + "step": 6407 + }, + { + "epoch": 0.7873203096203465, + "grad_norm": 1.4600606412307755, + "learning_rate": 3.94730123608902e-06, + "loss": 0.4955, + "step": 6408 + }, + { + "epoch": 0.7874431748372036, + "grad_norm": 1.4330159089196322, + "learning_rate": 3.942952959036874e-06, + "loss": 0.6677, + "step": 6409 + }, + { + "epoch": 0.7875660400540607, + "grad_norm": 1.4061565404207061, + "learning_rate": 3.938606715856821e-06, + "loss": 0.5714, + "step": 6410 + }, + { + "epoch": 0.7876889052709178, + "grad_norm": 1.312156009631735, + "learning_rate": 3.934262507348325e-06, + "loss": 0.6172, + "step": 6411 + }, + { + "epoch": 0.787811770487775, + "grad_norm": 1.2080318682454945, + "learning_rate": 3.929920334310481e-06, + "loss": 0.6, + "step": 6412 + }, + { + "epoch": 0.7879346357046321, + "grad_norm": 1.2783011805927, + "learning_rate": 3.925580197541996e-06, + "loss": 0.522, + "step": 6413 + }, + { + "epoch": 0.7880575009214891, + "grad_norm": 1.1007626578127319, + "learning_rate": 3.921242097841214e-06, + "loss": 0.6709, + "step": 6414 + }, + { + "epoch": 
0.7881803661383462, + "grad_norm": 1.2565768194215137, + "learning_rate": 3.916906036006101e-06, + "loss": 0.572, + "step": 6415 + }, + { + "epoch": 0.7883032313552033, + "grad_norm": 1.598188579422172, + "learning_rate": 3.912572012834248e-06, + "loss": 0.6368, + "step": 6416 + }, + { + "epoch": 0.7884260965720604, + "grad_norm": 1.10086950677803, + "learning_rate": 3.908240029122865e-06, + "loss": 0.6444, + "step": 6417 + }, + { + "epoch": 0.7885489617889175, + "grad_norm": 1.2413225643550114, + "learning_rate": 3.903910085668798e-06, + "loss": 0.6921, + "step": 6418 + }, + { + "epoch": 0.7886718270057747, + "grad_norm": 1.2069566058805543, + "learning_rate": 3.899582183268512e-06, + "loss": 0.7292, + "step": 6419 + }, + { + "epoch": 0.7887946922226318, + "grad_norm": 1.1687523206241253, + "learning_rate": 3.895256322718091e-06, + "loss": 0.5883, + "step": 6420 + }, + { + "epoch": 0.7889175574394889, + "grad_norm": 1.299960982080885, + "learning_rate": 3.890932504813258e-06, + "loss": 0.5473, + "step": 6421 + }, + { + "epoch": 0.789040422656346, + "grad_norm": 1.1088226563377537, + "learning_rate": 3.886610730349337e-06, + "loss": 0.4827, + "step": 6422 + }, + { + "epoch": 0.7891632878732031, + "grad_norm": 1.2337964141967341, + "learning_rate": 3.882291000121308e-06, + "loss": 0.5231, + "step": 6423 + }, + { + "epoch": 0.7892861530900602, + "grad_norm": 1.3270369468301646, + "learning_rate": 3.877973314923744e-06, + "loss": 0.4989, + "step": 6424 + }, + { + "epoch": 0.7894090183069173, + "grad_norm": 1.294735577890553, + "learning_rate": 3.873657675550864e-06, + "loss": 0.5597, + "step": 6425 + }, + { + "epoch": 0.7895318835237745, + "grad_norm": 1.2623850877297624, + "learning_rate": 3.869344082796489e-06, + "loss": 0.6133, + "step": 6426 + }, + { + "epoch": 0.7896547487406316, + "grad_norm": 1.1894550431539992, + "learning_rate": 3.8650325374540935e-06, + "loss": 0.5833, + "step": 6427 + }, + { + "epoch": 0.7897776139574887, + "grad_norm": 1.2423876386517976, + "learning_rate": 3.860723040316747e-06, + "loss": 0.4913, + "step": 6428 + }, + { + "epoch": 0.7899004791743457, + "grad_norm": 1.3239590937875159, + "learning_rate": 3.8564155921771585e-06, + "loss": 0.6653, + "step": 6429 + }, + { + "epoch": 0.7900233443912028, + "grad_norm": 1.3466981035405838, + "learning_rate": 3.852110193827651e-06, + "loss": 0.5313, + "step": 6430 + }, + { + "epoch": 0.7901462096080599, + "grad_norm": 1.1524080876702048, + "learning_rate": 3.847806846060175e-06, + "loss": 0.6373, + "step": 6431 + }, + { + "epoch": 0.790269074824917, + "grad_norm": 1.0949654452342286, + "learning_rate": 3.843505549666311e-06, + "loss": 0.5441, + "step": 6432 + }, + { + "epoch": 0.7903919400417742, + "grad_norm": 1.162059359962414, + "learning_rate": 3.839206305437239e-06, + "loss": 0.4976, + "step": 6433 + }, + { + "epoch": 0.7905148052586313, + "grad_norm": 1.2171041007481742, + "learning_rate": 3.834909114163797e-06, + "loss": 0.619, + "step": 6434 + }, + { + "epoch": 0.7906376704754884, + "grad_norm": 1.31162893276971, + "learning_rate": 3.830613976636408e-06, + "loss": 0.6234, + "step": 6435 + }, + { + "epoch": 0.7907605356923455, + "grad_norm": 1.1807717292303859, + "learning_rate": 3.826320893645149e-06, + "loss": 0.5433, + "step": 6436 + }, + { + "epoch": 0.7908834009092026, + "grad_norm": 2.853773176406448, + "learning_rate": 3.822029865979693e-06, + "loss": 0.7286, + "step": 6437 + }, + { + "epoch": 0.7910062661260597, + "grad_norm": 1.1253641783960042, + "learning_rate": 3.817740894429352e-06, + "loss": 0.5828, 
+ "step": 6438 + }, + { + "epoch": 0.7911291313429168, + "grad_norm": 1.184639768440339, + "learning_rate": 3.8134539797830557e-06, + "loss": 0.6848, + "step": 6439 + }, + { + "epoch": 0.791251996559774, + "grad_norm": 1.2819660339827694, + "learning_rate": 3.8091691228293515e-06, + "loss": 0.649, + "step": 6440 + }, + { + "epoch": 0.7913748617766311, + "grad_norm": 1.002655236324231, + "learning_rate": 3.804886324356409e-06, + "loss": 0.4944, + "step": 6441 + }, + { + "epoch": 0.7914977269934882, + "grad_norm": 1.5206286696622926, + "learning_rate": 3.8006055851520262e-06, + "loss": 0.5149, + "step": 6442 + }, + { + "epoch": 0.7916205922103452, + "grad_norm": 1.2708695289601533, + "learning_rate": 3.796326906003619e-06, + "loss": 0.508, + "step": 6443 + }, + { + "epoch": 0.7917434574272023, + "grad_norm": 1.1695576949414879, + "learning_rate": 3.792050287698216e-06, + "loss": 0.5756, + "step": 6444 + }, + { + "epoch": 0.7918663226440594, + "grad_norm": 1.3830810609824422, + "learning_rate": 3.7877757310224753e-06, + "loss": 0.5615, + "step": 6445 + }, + { + "epoch": 0.7919891878609165, + "grad_norm": 1.5352070262979256, + "learning_rate": 3.783503236762674e-06, + "loss": 0.607, + "step": 6446 + }, + { + "epoch": 0.7921120530777737, + "grad_norm": 1.650364656551014, + "learning_rate": 3.7792328057047175e-06, + "loss": 0.6522, + "step": 6447 + }, + { + "epoch": 0.7922349182946308, + "grad_norm": 1.1134995808141652, + "learning_rate": 3.774964438634112e-06, + "loss": 0.4638, + "step": 6448 + }, + { + "epoch": 0.7923577835114879, + "grad_norm": 1.1743103533394614, + "learning_rate": 3.7706981363359995e-06, + "loss": 0.6567, + "step": 6449 + }, + { + "epoch": 0.792480648728345, + "grad_norm": 1.2365906247199938, + "learning_rate": 3.766433899595147e-06, + "loss": 0.554, + "step": 6450 + }, + { + "epoch": 0.7926035139452021, + "grad_norm": 1.4940534148559228, + "learning_rate": 3.762171729195921e-06, + "loss": 0.4774, + "step": 6451 + }, + { + "epoch": 0.7927263791620592, + "grad_norm": 1.3142908503713426, + "learning_rate": 3.757911625922325e-06, + "loss": 0.5259, + "step": 6452 + }, + { + "epoch": 0.7928492443789164, + "grad_norm": 1.0788817864967324, + "learning_rate": 3.7536535905579785e-06, + "loss": 0.755, + "step": 6453 + }, + { + "epoch": 0.7929721095957735, + "grad_norm": 1.0395672633324722, + "learning_rate": 3.7493976238861223e-06, + "loss": 0.5966, + "step": 6454 + }, + { + "epoch": 0.7930949748126306, + "grad_norm": 1.2735877734770218, + "learning_rate": 3.745143726689607e-06, + "loss": 0.6048, + "step": 6455 + }, + { + "epoch": 0.7932178400294877, + "grad_norm": 1.0759749326843477, + "learning_rate": 3.7408918997509125e-06, + "loss": 0.5823, + "step": 6456 + }, + { + "epoch": 0.7933407052463448, + "grad_norm": 1.299802548455004, + "learning_rate": 3.73664214385214e-06, + "loss": 0.5973, + "step": 6457 + }, + { + "epoch": 0.7934635704632018, + "grad_norm": 1.0749359097794697, + "learning_rate": 3.732394459774996e-06, + "loss": 0.5226, + "step": 6458 + }, + { + "epoch": 0.7935864356800589, + "grad_norm": 1.1163897907588094, + "learning_rate": 3.728148848300821e-06, + "loss": 0.566, + "step": 6459 + }, + { + "epoch": 0.793709300896916, + "grad_norm": 1.2248027817620533, + "learning_rate": 3.7239053102105568e-06, + "loss": 0.5907, + "step": 6460 + }, + { + "epoch": 0.7938321661137732, + "grad_norm": 1.2392874280595396, + "learning_rate": 3.7196638462847916e-06, + "loss": 0.5529, + "step": 6461 + }, + { + "epoch": 0.7939550313306303, + "grad_norm": 1.0669637873349986, + 
"learning_rate": 3.715424457303702e-06, + "loss": 0.6596, + "step": 6462 + }, + { + "epoch": 0.7940778965474874, + "grad_norm": 1.279486137450477, + "learning_rate": 3.7111871440471036e-06, + "loss": 0.5699, + "step": 6463 + }, + { + "epoch": 0.7942007617643445, + "grad_norm": 1.0776713305288672, + "learning_rate": 3.7069519072944168e-06, + "loss": 0.5934, + "step": 6464 + }, + { + "epoch": 0.7943236269812016, + "grad_norm": 1.3487538206012244, + "learning_rate": 3.702718747824688e-06, + "loss": 0.6345, + "step": 6465 + }, + { + "epoch": 0.7944464921980587, + "grad_norm": 1.1561151168377748, + "learning_rate": 3.6984876664165845e-06, + "loss": 0.635, + "step": 6466 + }, + { + "epoch": 0.7945693574149159, + "grad_norm": 1.1255545324116007, + "learning_rate": 3.6942586638483768e-06, + "loss": 0.6435, + "step": 6467 + }, + { + "epoch": 0.794692222631773, + "grad_norm": 1.1343031204131906, + "learning_rate": 3.690031740897968e-06, + "loss": 0.6131, + "step": 6468 + }, + { + "epoch": 0.7948150878486301, + "grad_norm": 1.2551486680015362, + "learning_rate": 3.6858068983428745e-06, + "loss": 0.5694, + "step": 6469 + }, + { + "epoch": 0.7949379530654872, + "grad_norm": 0.9951622270732979, + "learning_rate": 3.6815841369602297e-06, + "loss": 0.5369, + "step": 6470 + }, + { + "epoch": 0.7950608182823443, + "grad_norm": 1.1457846567177274, + "learning_rate": 3.677363457526775e-06, + "loss": 0.6058, + "step": 6471 + }, + { + "epoch": 0.7951836834992013, + "grad_norm": 1.1182311954029494, + "learning_rate": 3.673144860818884e-06, + "loss": 0.548, + "step": 6472 + }, + { + "epoch": 0.7953065487160584, + "grad_norm": 1.2428482839916495, + "learning_rate": 3.6689283476125392e-06, + "loss": 0.5412, + "step": 6473 + }, + { + "epoch": 0.7954294139329156, + "grad_norm": 1.196242476377996, + "learning_rate": 3.6647139186833435e-06, + "loss": 0.5551, + "step": 6474 + }, + { + "epoch": 0.7955522791497727, + "grad_norm": 1.1210831106864632, + "learning_rate": 3.6605015748065053e-06, + "loss": 0.68, + "step": 6475 + }, + { + "epoch": 0.7956751443666298, + "grad_norm": 1.128707496770722, + "learning_rate": 3.6562913167568645e-06, + "loss": 0.4951, + "step": 6476 + }, + { + "epoch": 0.7957980095834869, + "grad_norm": 1.1555575232062891, + "learning_rate": 3.652083145308874e-06, + "loss": 0.5059, + "step": 6477 + }, + { + "epoch": 0.795920874800344, + "grad_norm": 1.0472352098244835, + "learning_rate": 3.6478770612365902e-06, + "loss": 0.5267, + "step": 6478 + }, + { + "epoch": 0.7960437400172011, + "grad_norm": 1.266101226426438, + "learning_rate": 3.6436730653136986e-06, + "loss": 0.5999, + "step": 6479 + }, + { + "epoch": 0.7961666052340582, + "grad_norm": 1.5855026794215894, + "learning_rate": 3.6394711583135e-06, + "loss": 0.5402, + "step": 6480 + }, + { + "epoch": 0.7962894704509154, + "grad_norm": 1.183273506368117, + "learning_rate": 3.635271341008911e-06, + "loss": 0.6562, + "step": 6481 + }, + { + "epoch": 0.7964123356677725, + "grad_norm": 1.4979611962037622, + "learning_rate": 3.631073614172449e-06, + "loss": 0.6049, + "step": 6482 + }, + { + "epoch": 0.7965352008846296, + "grad_norm": 1.1127330546860483, + "learning_rate": 3.6268779785762686e-06, + "loss": 0.5996, + "step": 6483 + }, + { + "epoch": 0.7966580661014867, + "grad_norm": 1.2520995254215717, + "learning_rate": 3.6226844349921294e-06, + "loss": 0.54, + "step": 6484 + }, + { + "epoch": 0.7967809313183438, + "grad_norm": 1.3189069142640883, + "learning_rate": 3.6184929841914004e-06, + "loss": 0.5324, + "step": 6485 + }, + { + "epoch": 
0.7969037965352009, + "grad_norm": 1.2180682272858359, + "learning_rate": 3.6143036269450796e-06, + "loss": 0.5755, + "step": 6486 + }, + { + "epoch": 0.7970266617520579, + "grad_norm": 1.0362377923757387, + "learning_rate": 3.610116364023759e-06, + "loss": 0.6174, + "step": 6487 + }, + { + "epoch": 0.7971495269689151, + "grad_norm": 1.3702172091879075, + "learning_rate": 3.6059311961976756e-06, + "loss": 0.5918, + "step": 6488 + }, + { + "epoch": 0.7972723921857722, + "grad_norm": 1.1040644710679723, + "learning_rate": 3.6017481242366503e-06, + "loss": 0.608, + "step": 6489 + }, + { + "epoch": 0.7973952574026293, + "grad_norm": 1.724860920292001, + "learning_rate": 3.5975671489101423e-06, + "loss": 0.64, + "step": 6490 + }, + { + "epoch": 0.7975181226194864, + "grad_norm": 1.3299069267958865, + "learning_rate": 3.5933882709872023e-06, + "loss": 0.6253, + "step": 6491 + }, + { + "epoch": 0.7976409878363435, + "grad_norm": 1.2000463483718757, + "learning_rate": 3.589211491236523e-06, + "loss": 0.4869, + "step": 6492 + }, + { + "epoch": 0.7977638530532006, + "grad_norm": 1.0423761785376433, + "learning_rate": 3.5850368104263836e-06, + "loss": 0.5248, + "step": 6493 + }, + { + "epoch": 0.7978867182700577, + "grad_norm": 1.2807673961064288, + "learning_rate": 3.5808642293246995e-06, + "loss": 0.5331, + "step": 6494 + }, + { + "epoch": 0.7980095834869149, + "grad_norm": 1.2152272169696805, + "learning_rate": 3.5766937486989802e-06, + "loss": 0.6368, + "step": 6495 + }, + { + "epoch": 0.798132448703772, + "grad_norm": 1.123028475489048, + "learning_rate": 3.572525369316364e-06, + "loss": 0.527, + "step": 6496 + }, + { + "epoch": 0.7982553139206291, + "grad_norm": 1.0539683958423678, + "learning_rate": 3.568359091943599e-06, + "loss": 0.5376, + "step": 6497 + }, + { + "epoch": 0.7983781791374862, + "grad_norm": 1.0733199216504214, + "learning_rate": 3.564194917347035e-06, + "loss": 0.5713, + "step": 6498 + }, + { + "epoch": 0.7985010443543433, + "grad_norm": 1.2949409283278837, + "learning_rate": 3.56003284629266e-06, + "loss": 0.5234, + "step": 6499 + }, + { + "epoch": 0.7986239095712004, + "grad_norm": 1.144117602338913, + "learning_rate": 3.5558728795460467e-06, + "loss": 0.5744, + "step": 6500 + }, + { + "epoch": 0.7987467747880576, + "grad_norm": 1.4407706395219206, + "learning_rate": 3.5517150178724058e-06, + "loss": 0.5821, + "step": 6501 + }, + { + "epoch": 0.7988696400049146, + "grad_norm": 1.1478281570061284, + "learning_rate": 3.547559262036537e-06, + "loss": 0.5775, + "step": 6502 + }, + { + "epoch": 0.7989925052217717, + "grad_norm": 1.120519339995065, + "learning_rate": 3.5434056128028715e-06, + "loss": 0.638, + "step": 6503 + }, + { + "epoch": 0.7991153704386288, + "grad_norm": 1.4326891972859441, + "learning_rate": 3.5392540709354486e-06, + "loss": 0.6414, + "step": 6504 + }, + { + "epoch": 0.7992382356554859, + "grad_norm": 1.2533094667647149, + "learning_rate": 3.5351046371979084e-06, + "loss": 0.6828, + "step": 6505 + }, + { + "epoch": 0.799361100872343, + "grad_norm": 1.3200050300972437, + "learning_rate": 3.5309573123535184e-06, + "loss": 0.4258, + "step": 6506 + }, + { + "epoch": 0.7994839660892001, + "grad_norm": 1.1545565272102847, + "learning_rate": 3.5268120971651528e-06, + "loss": 0.5393, + "step": 6507 + }, + { + "epoch": 0.7996068313060573, + "grad_norm": 1.2758716249348616, + "learning_rate": 3.5226689923952975e-06, + "loss": 0.5941, + "step": 6508 + }, + { + "epoch": 0.7997296965229144, + "grad_norm": 1.2230129715819809, + "learning_rate": 3.518527998806046e-06, + 
"loss": 0.4965, + "step": 6509 + }, + { + "epoch": 0.7998525617397715, + "grad_norm": 1.109488418005055, + "learning_rate": 3.5143891171591088e-06, + "loss": 0.649, + "step": 6510 + }, + { + "epoch": 0.7999754269566286, + "grad_norm": 1.0660614383764564, + "learning_rate": 3.510252348215805e-06, + "loss": 0.5315, + "step": 6511 + }, + { + "epoch": 0.8000982921734857, + "grad_norm": 1.3935135243921934, + "learning_rate": 3.5061176927370745e-06, + "loss": 0.6089, + "step": 6512 + }, + { + "epoch": 0.8002211573903428, + "grad_norm": 1.1552127567676278, + "learning_rate": 3.5019851514834476e-06, + "loss": 0.6253, + "step": 6513 + }, + { + "epoch": 0.8003440226072, + "grad_norm": 1.0911819562987974, + "learning_rate": 3.4978547252150862e-06, + "loss": 0.5231, + "step": 6514 + }, + { + "epoch": 0.8004668878240571, + "grad_norm": 1.4479780269284768, + "learning_rate": 3.4937264146917587e-06, + "loss": 0.7726, + "step": 6515 + }, + { + "epoch": 0.8005897530409141, + "grad_norm": 1.415097129929438, + "learning_rate": 3.4896002206728313e-06, + "loss": 0.648, + "step": 6516 + }, + { + "epoch": 0.8007126182577712, + "grad_norm": 1.3822310322939217, + "learning_rate": 3.485476143917295e-06, + "loss": 0.5831, + "step": 6517 + }, + { + "epoch": 0.8008354834746283, + "grad_norm": 1.1504583571140703, + "learning_rate": 3.4813541851837498e-06, + "loss": 0.498, + "step": 6518 + }, + { + "epoch": 0.8009583486914854, + "grad_norm": 1.126825296275138, + "learning_rate": 3.4772343452304047e-06, + "loss": 0.4883, + "step": 6519 + }, + { + "epoch": 0.8010812139083425, + "grad_norm": 1.3701444040188937, + "learning_rate": 3.4731166248150693e-06, + "loss": 0.4841, + "step": 6520 + }, + { + "epoch": 0.8012040791251996, + "grad_norm": 1.1787381677791764, + "learning_rate": 3.4690010246951765e-06, + "loss": 0.5674, + "step": 6521 + }, + { + "epoch": 0.8013269443420568, + "grad_norm": 1.184600074263304, + "learning_rate": 3.464887545627767e-06, + "loss": 0.5442, + "step": 6522 + }, + { + "epoch": 0.8014498095589139, + "grad_norm": 1.2398135947911038, + "learning_rate": 3.4607761883694834e-06, + "loss": 0.5566, + "step": 6523 + }, + { + "epoch": 0.801572674775771, + "grad_norm": 1.0421044571567675, + "learning_rate": 3.4566669536765893e-06, + "loss": 0.4775, + "step": 6524 + }, + { + "epoch": 0.8016955399926281, + "grad_norm": 1.1805060052244447, + "learning_rate": 3.452559842304938e-06, + "loss": 0.6005, + "step": 6525 + }, + { + "epoch": 0.8018184052094852, + "grad_norm": 1.3150545272747156, + "learning_rate": 3.4484548550100254e-06, + "loss": 0.645, + "step": 6526 + }, + { + "epoch": 0.8019412704263423, + "grad_norm": 1.0933424802301084, + "learning_rate": 3.4443519925469236e-06, + "loss": 0.5705, + "step": 6527 + }, + { + "epoch": 0.8020641356431994, + "grad_norm": 1.2106395564031611, + "learning_rate": 3.440251255670337e-06, + "loss": 0.5279, + "step": 6528 + }, + { + "epoch": 0.8021870008600566, + "grad_norm": 1.1414703758341649, + "learning_rate": 3.4361526451345536e-06, + "loss": 0.5074, + "step": 6529 + }, + { + "epoch": 0.8023098660769137, + "grad_norm": 1.1687683168115293, + "learning_rate": 3.4320561616935076e-06, + "loss": 0.5943, + "step": 6530 + }, + { + "epoch": 0.8024327312937707, + "grad_norm": 1.0685306735384221, + "learning_rate": 3.427961806100704e-06, + "loss": 0.5826, + "step": 6531 + }, + { + "epoch": 0.8025555965106278, + "grad_norm": 1.4429497636960236, + "learning_rate": 3.423869579109284e-06, + "loss": 0.6924, + "step": 6532 + }, + { + "epoch": 0.8026784617274849, + "grad_norm": 
1.3157979336501653, + "learning_rate": 3.4197794814719768e-06, + "loss": 0.5831, + "step": 6533 + }, + { + "epoch": 0.802801326944342, + "grad_norm": 1.2762501416464123, + "learning_rate": 3.4156915139411343e-06, + "loss": 0.4966, + "step": 6534 + }, + { + "epoch": 0.8029241921611991, + "grad_norm": 1.305026729866893, + "learning_rate": 3.4116056772687147e-06, + "loss": 0.4747, + "step": 6535 + }, + { + "epoch": 0.8030470573780563, + "grad_norm": 1.1426892692237653, + "learning_rate": 3.407521972206272e-06, + "loss": 0.5286, + "step": 6536 + }, + { + "epoch": 0.8031699225949134, + "grad_norm": 1.170717748742045, + "learning_rate": 3.403440399504984e-06, + "loss": 0.4411, + "step": 6537 + }, + { + "epoch": 0.8032927878117705, + "grad_norm": 1.120335291988548, + "learning_rate": 3.3993609599156277e-06, + "loss": 0.5401, + "step": 6538 + }, + { + "epoch": 0.8034156530286276, + "grad_norm": 1.0338392852373568, + "learning_rate": 3.3952836541885933e-06, + "loss": 0.5786, + "step": 6539 + }, + { + "epoch": 0.8035385182454847, + "grad_norm": 1.2562124891847526, + "learning_rate": 3.3912084830738695e-06, + "loss": 0.5953, + "step": 6540 + }, + { + "epoch": 0.8036613834623418, + "grad_norm": 1.3367626403916293, + "learning_rate": 3.3871354473210573e-06, + "loss": 0.5625, + "step": 6541 + }, + { + "epoch": 0.803784248679199, + "grad_norm": 1.1696854725086632, + "learning_rate": 3.383064547679374e-06, + "loss": 0.6206, + "step": 6542 + }, + { + "epoch": 0.8039071138960561, + "grad_norm": 1.0364021400769534, + "learning_rate": 3.378995784897622e-06, + "loss": 0.5824, + "step": 6543 + }, + { + "epoch": 0.8040299791129132, + "grad_norm": 1.1830244366176506, + "learning_rate": 3.3749291597242327e-06, + "loss": 0.5857, + "step": 6544 + }, + { + "epoch": 0.8041528443297702, + "grad_norm": 1.1018824268245122, + "learning_rate": 3.370864672907232e-06, + "loss": 0.649, + "step": 6545 + }, + { + "epoch": 0.8042757095466273, + "grad_norm": 1.2032933683254468, + "learning_rate": 3.3668023251942615e-06, + "loss": 0.5454, + "step": 6546 + }, + { + "epoch": 0.8043985747634844, + "grad_norm": 1.2298423546020991, + "learning_rate": 3.362742117332554e-06, + "loss": 0.5497, + "step": 6547 + }, + { + "epoch": 0.8045214399803415, + "grad_norm": 1.1713759609424717, + "learning_rate": 3.358684050068965e-06, + "loss": 0.6088, + "step": 6548 + }, + { + "epoch": 0.8046443051971987, + "grad_norm": 1.1516574952450425, + "learning_rate": 3.35462812414995e-06, + "loss": 0.5273, + "step": 6549 + }, + { + "epoch": 0.8047671704140558, + "grad_norm": 1.3588516006229703, + "learning_rate": 3.3505743403215712e-06, + "loss": 0.5416, + "step": 6550 + }, + { + "epoch": 0.8048900356309129, + "grad_norm": 1.2483880473124298, + "learning_rate": 3.346522699329489e-06, + "loss": 0.5545, + "step": 6551 + }, + { + "epoch": 0.80501290084777, + "grad_norm": 1.4884849568339045, + "learning_rate": 3.3424732019189806e-06, + "loss": 0.6376, + "step": 6552 + }, + { + "epoch": 0.8051357660646271, + "grad_norm": 1.1524273865848518, + "learning_rate": 3.338425848834929e-06, + "loss": 0.6671, + "step": 6553 + }, + { + "epoch": 0.8052586312814842, + "grad_norm": 1.0296901305919288, + "learning_rate": 3.3343806408218116e-06, + "loss": 0.5595, + "step": 6554 + }, + { + "epoch": 0.8053814964983413, + "grad_norm": 1.0960006745538549, + "learning_rate": 3.3303375786237244e-06, + "loss": 0.5795, + "step": 6555 + }, + { + "epoch": 0.8055043617151985, + "grad_norm": 1.295671907826558, + "learning_rate": 3.32629666298435e-06, + "loss": 0.5491, + "step": 6556 + }, + 
{ + "epoch": 0.8056272269320556, + "grad_norm": 1.2197572237751704, + "learning_rate": 3.3222578946470085e-06, + "loss": 0.632, + "step": 6557 + }, + { + "epoch": 0.8057500921489127, + "grad_norm": 1.178294746153841, + "learning_rate": 3.3182212743545885e-06, + "loss": 0.6364, + "step": 6558 + }, + { + "epoch": 0.8058729573657698, + "grad_norm": 1.2512185146064205, + "learning_rate": 3.314186802849607e-06, + "loss": 0.6382, + "step": 6559 + }, + { + "epoch": 0.8059958225826268, + "grad_norm": 1.2462941782107417, + "learning_rate": 3.3101544808741813e-06, + "loss": 0.5694, + "step": 6560 + }, + { + "epoch": 0.8061186877994839, + "grad_norm": 1.2878580790091343, + "learning_rate": 3.306124309170023e-06, + "loss": 0.6091, + "step": 6561 + }, + { + "epoch": 0.806241553016341, + "grad_norm": 1.1091711123365207, + "learning_rate": 3.3020962884784667e-06, + "loss": 0.4651, + "step": 6562 + }, + { + "epoch": 0.8063644182331982, + "grad_norm": 1.1402474036791126, + "learning_rate": 3.2980704195404237e-06, + "loss": 0.5105, + "step": 6563 + }, + { + "epoch": 0.8064872834500553, + "grad_norm": 1.418235739398273, + "learning_rate": 3.2940467030964472e-06, + "loss": 0.5351, + "step": 6564 + }, + { + "epoch": 0.8066101486669124, + "grad_norm": 1.1131578010291614, + "learning_rate": 3.2900251398866598e-06, + "loss": 0.5852, + "step": 6565 + }, + { + "epoch": 0.8067330138837695, + "grad_norm": 1.3631557088535295, + "learning_rate": 3.28600573065081e-06, + "loss": 0.6633, + "step": 6566 + }, + { + "epoch": 0.8068558791006266, + "grad_norm": 1.170326759645731, + "learning_rate": 3.28198847612823e-06, + "loss": 0.7074, + "step": 6567 + }, + { + "epoch": 0.8069787443174837, + "grad_norm": 1.1319388595560822, + "learning_rate": 3.2779733770578846e-06, + "loss": 0.4765, + "step": 6568 + }, + { + "epoch": 0.8071016095343408, + "grad_norm": 1.2090452419940494, + "learning_rate": 3.2739604341783103e-06, + "loss": 0.5317, + "step": 6569 + }, + { + "epoch": 0.807224474751198, + "grad_norm": 1.449425099630093, + "learning_rate": 3.2699496482276747e-06, + "loss": 0.5568, + "step": 6570 + }, + { + "epoch": 0.8073473399680551, + "grad_norm": 1.2652417209073439, + "learning_rate": 3.265941019943723e-06, + "loss": 0.4969, + "step": 6571 + }, + { + "epoch": 0.8074702051849122, + "grad_norm": 1.2355378110096031, + "learning_rate": 3.2619345500638246e-06, + "loss": 0.6878, + "step": 6572 + }, + { + "epoch": 0.8075930704017693, + "grad_norm": 1.000399883983508, + "learning_rate": 3.2579302393249446e-06, + "loss": 0.551, + "step": 6573 + }, + { + "epoch": 0.8077159356186263, + "grad_norm": 1.108589831742819, + "learning_rate": 3.2539280884636422e-06, + "loss": 0.5607, + "step": 6574 + }, + { + "epoch": 0.8078388008354834, + "grad_norm": 1.204251196327704, + "learning_rate": 3.2499280982160934e-06, + "loss": 0.5106, + "step": 6575 + }, + { + "epoch": 0.8079616660523405, + "grad_norm": 1.0341993736579531, + "learning_rate": 3.2459302693180686e-06, + "loss": 0.5324, + "step": 6576 + }, + { + "epoch": 0.8080845312691977, + "grad_norm": 1.270308190385964, + "learning_rate": 3.2419346025049483e-06, + "loss": 0.5823, + "step": 6577 + }, + { + "epoch": 0.8082073964860548, + "grad_norm": 1.2307863995294144, + "learning_rate": 3.237941098511698e-06, + "loss": 0.6035, + "step": 6578 + }, + { + "epoch": 0.8083302617029119, + "grad_norm": 1.0968860615438496, + "learning_rate": 3.233949758072905e-06, + "loss": 0.496, + "step": 6579 + }, + { + "epoch": 0.808453126919769, + "grad_norm": 1.342930169260937, + "learning_rate": 
3.22996058192275e-06, + "loss": 0.5478, + "step": 6580 + }, + { + "epoch": 0.8085759921366261, + "grad_norm": 1.3865298866296984, + "learning_rate": 3.2259735707950117e-06, + "loss": 0.6669, + "step": 6581 + }, + { + "epoch": 0.8086988573534832, + "grad_norm": 1.1780942708161426, + "learning_rate": 3.2219887254230797e-06, + "loss": 0.5148, + "step": 6582 + }, + { + "epoch": 0.8088217225703404, + "grad_norm": 1.2916964346630626, + "learning_rate": 3.2180060465399357e-06, + "loss": 0.5725, + "step": 6583 + }, + { + "epoch": 0.8089445877871975, + "grad_norm": 1.0829214169878858, + "learning_rate": 3.214025534878176e-06, + "loss": 0.501, + "step": 6584 + }, + { + "epoch": 0.8090674530040546, + "grad_norm": 2.837902364840793, + "learning_rate": 3.2100471911699796e-06, + "loss": 0.663, + "step": 6585 + }, + { + "epoch": 0.8091903182209117, + "grad_norm": 1.3447867045849686, + "learning_rate": 3.2060710161471427e-06, + "loss": 0.5673, + "step": 6586 + }, + { + "epoch": 0.8093131834377688, + "grad_norm": 1.1506329687023507, + "learning_rate": 3.2020970105410607e-06, + "loss": 0.5806, + "step": 6587 + }, + { + "epoch": 0.8094360486546259, + "grad_norm": 1.028433289490699, + "learning_rate": 3.198125175082717e-06, + "loss": 0.4876, + "step": 6588 + }, + { + "epoch": 0.8095589138714829, + "grad_norm": 1.034826025604305, + "learning_rate": 3.1941555105027115e-06, + "loss": 0.4956, + "step": 6589 + }, + { + "epoch": 0.80968177908834, + "grad_norm": 1.4820237847318491, + "learning_rate": 3.1901880175312307e-06, + "loss": 0.7092, + "step": 6590 + }, + { + "epoch": 0.8098046443051972, + "grad_norm": 1.0181243270664089, + "learning_rate": 3.1862226968980813e-06, + "loss": 0.549, + "step": 6591 + }, + { + "epoch": 0.8099275095220543, + "grad_norm": 1.2610479771501715, + "learning_rate": 3.182259549332649e-06, + "loss": 0.5888, + "step": 6592 + }, + { + "epoch": 0.8100503747389114, + "grad_norm": 1.3764671316543609, + "learning_rate": 3.1782985755639344e-06, + "loss": 0.4851, + "step": 6593 + }, + { + "epoch": 0.8101732399557685, + "grad_norm": 1.0011270257374725, + "learning_rate": 3.174339776320523e-06, + "loss": 0.6529, + "step": 6594 + }, + { + "epoch": 0.8102961051726256, + "grad_norm": 1.506668905236413, + "learning_rate": 3.170383152330627e-06, + "loss": 0.5768, + "step": 6595 + }, + { + "epoch": 0.8104189703894827, + "grad_norm": 1.1439017130230327, + "learning_rate": 3.1664287043220265e-06, + "loss": 0.5449, + "step": 6596 + }, + { + "epoch": 0.8105418356063399, + "grad_norm": 1.242880301756144, + "learning_rate": 3.162476433022127e-06, + "loss": 0.6261, + "step": 6597 + }, + { + "epoch": 0.810664700823197, + "grad_norm": 1.236972669281351, + "learning_rate": 3.158526339157915e-06, + "loss": 0.6504, + "step": 6598 + }, + { + "epoch": 0.8107875660400541, + "grad_norm": 1.580301141338173, + "learning_rate": 3.1545784234559883e-06, + "loss": 0.5171, + "step": 6599 + }, + { + "epoch": 0.8109104312569112, + "grad_norm": 1.259991584902611, + "learning_rate": 3.1506326866425445e-06, + "loss": 0.5164, + "step": 6600 + }, + { + "epoch": 0.8110332964737683, + "grad_norm": 1.2390048422112785, + "learning_rate": 3.146689129443368e-06, + "loss": 0.5066, + "step": 6601 + }, + { + "epoch": 0.8111561616906254, + "grad_norm": 0.9852001304042092, + "learning_rate": 3.142747752583854e-06, + "loss": 0.6965, + "step": 6602 + }, + { + "epoch": 0.8112790269074824, + "grad_norm": 1.1927690156634907, + "learning_rate": 3.1388085567889934e-06, + "loss": 0.5818, + "step": 6603 + }, + { + "epoch": 0.8114018921243396, + 
"grad_norm": 1.378240931673566, + "learning_rate": 3.1348715427833824e-06, + "loss": 0.4956, + "step": 6604 + }, + { + "epoch": 0.8115247573411967, + "grad_norm": 1.1216324039789205, + "learning_rate": 3.130936711291198e-06, + "loss": 0.6209, + "step": 6605 + }, + { + "epoch": 0.8116476225580538, + "grad_norm": 1.2084548067295575, + "learning_rate": 3.1270040630362313e-06, + "loss": 0.5898, + "step": 6606 + }, + { + "epoch": 0.8117704877749109, + "grad_norm": 1.0142923020282795, + "learning_rate": 3.1230735987418733e-06, + "loss": 0.5087, + "step": 6607 + }, + { + "epoch": 0.811893352991768, + "grad_norm": 1.0622935571471055, + "learning_rate": 3.1191453191310967e-06, + "loss": 0.5655, + "step": 6608 + }, + { + "epoch": 0.8120162182086251, + "grad_norm": 1.2306787755934865, + "learning_rate": 3.1152192249264907e-06, + "loss": 0.5305, + "step": 6609 + }, + { + "epoch": 0.8121390834254822, + "grad_norm": 1.5002231141946876, + "learning_rate": 3.111295316850231e-06, + "loss": 0.6466, + "step": 6610 + }, + { + "epoch": 0.8122619486423394, + "grad_norm": 1.0940522239791095, + "learning_rate": 3.107373595624101e-06, + "loss": 0.5179, + "step": 6611 + }, + { + "epoch": 0.8123848138591965, + "grad_norm": 1.0757731537945112, + "learning_rate": 3.1034540619694683e-06, + "loss": 0.6137, + "step": 6612 + }, + { + "epoch": 0.8125076790760536, + "grad_norm": 1.2142603080287282, + "learning_rate": 3.09953671660731e-06, + "loss": 0.6075, + "step": 6613 + }, + { + "epoch": 0.8126305442929107, + "grad_norm": 1.274661046750979, + "learning_rate": 3.0956215602581933e-06, + "loss": 0.5427, + "step": 6614 + }, + { + "epoch": 0.8127534095097678, + "grad_norm": 1.1376622293187173, + "learning_rate": 3.0917085936422934e-06, + "loss": 0.6093, + "step": 6615 + }, + { + "epoch": 0.8128762747266249, + "grad_norm": 1.436701806909365, + "learning_rate": 3.0877978174793642e-06, + "loss": 0.6159, + "step": 6616 + }, + { + "epoch": 0.812999139943482, + "grad_norm": 1.2170212514902943, + "learning_rate": 3.083889232488775e-06, + "loss": 0.5898, + "step": 6617 + }, + { + "epoch": 0.8131220051603391, + "grad_norm": 1.454776465415841, + "learning_rate": 3.0799828393894863e-06, + "loss": 0.6446, + "step": 6618 + }, + { + "epoch": 0.8132448703771962, + "grad_norm": 1.2473799239599976, + "learning_rate": 3.076078638900046e-06, + "loss": 0.5544, + "step": 6619 + }, + { + "epoch": 0.8133677355940533, + "grad_norm": 1.3037776451640537, + "learning_rate": 3.0721766317386153e-06, + "loss": 0.6522, + "step": 6620 + }, + { + "epoch": 0.8134906008109104, + "grad_norm": 1.2859992804700793, + "learning_rate": 3.068276818622929e-06, + "loss": 0.4743, + "step": 6621 + }, + { + "epoch": 0.8136134660277675, + "grad_norm": 1.239383242025873, + "learning_rate": 3.0643792002703515e-06, + "loss": 0.513, + "step": 6622 + }, + { + "epoch": 0.8137363312446246, + "grad_norm": 1.0895889890579138, + "learning_rate": 3.0604837773978095e-06, + "loss": 0.5914, + "step": 6623 + }, + { + "epoch": 0.8138591964614817, + "grad_norm": 1.1279186187079415, + "learning_rate": 3.0565905507218473e-06, + "loss": 0.5089, + "step": 6624 + }, + { + "epoch": 0.8139820616783389, + "grad_norm": 1.2144969041894462, + "learning_rate": 3.0526995209586016e-06, + "loss": 0.5679, + "step": 6625 + }, + { + "epoch": 0.814104926895196, + "grad_norm": 1.2419345354860063, + "learning_rate": 3.048810688823794e-06, + "loss": 0.674, + "step": 6626 + }, + { + "epoch": 0.8142277921120531, + "grad_norm": 1.1755492401778183, + "learning_rate": 3.0449240550327577e-06, + "loss": 0.6689, + 
"step": 6627 + }, + { + "epoch": 0.8143506573289102, + "grad_norm": 1.2801635186562725, + "learning_rate": 3.041039620300402e-06, + "loss": 0.6315, + "step": 6628 + }, + { + "epoch": 0.8144735225457673, + "grad_norm": 1.0727600928886059, + "learning_rate": 3.03715738534126e-06, + "loss": 0.6236, + "step": 6629 + }, + { + "epoch": 0.8145963877626244, + "grad_norm": 1.246386858577507, + "learning_rate": 3.0332773508694302e-06, + "loss": 0.5527, + "step": 6630 + }, + { + "epoch": 0.8147192529794816, + "grad_norm": 1.2286201785416135, + "learning_rate": 3.02939951759863e-06, + "loss": 0.6076, + "step": 6631 + }, + { + "epoch": 0.8148421181963387, + "grad_norm": 1.590263955294201, + "learning_rate": 3.0255238862421474e-06, + "loss": 0.6019, + "step": 6632 + }, + { + "epoch": 0.8149649834131957, + "grad_norm": 1.2710184031908658, + "learning_rate": 3.021650457512897e-06, + "loss": 0.5708, + "step": 6633 + }, + { + "epoch": 0.8150878486300528, + "grad_norm": 1.0609259617985503, + "learning_rate": 3.0177792321233595e-06, + "loss": 0.5744, + "step": 6634 + }, + { + "epoch": 0.8152107138469099, + "grad_norm": 1.4519166753218147, + "learning_rate": 3.013910210785629e-06, + "loss": 0.5812, + "step": 6635 + }, + { + "epoch": 0.815333579063767, + "grad_norm": 1.138053853944056, + "learning_rate": 3.0100433942113776e-06, + "loss": 0.5444, + "step": 6636 + }, + { + "epoch": 0.8154564442806241, + "grad_norm": 1.357586394165309, + "learning_rate": 3.006178783111887e-06, + "loss": 0.497, + "step": 6637 + }, + { + "epoch": 0.8155793094974813, + "grad_norm": 1.1485955357223068, + "learning_rate": 3.002316378198029e-06, + "loss": 0.5243, + "step": 6638 + }, + { + "epoch": 0.8157021747143384, + "grad_norm": 1.0310259360235285, + "learning_rate": 2.9984561801802635e-06, + "loss": 0.6582, + "step": 6639 + }, + { + "epoch": 0.8158250399311955, + "grad_norm": 1.1665346582627323, + "learning_rate": 2.994598189768649e-06, + "loss": 0.6266, + "step": 6640 + }, + { + "epoch": 0.8159479051480526, + "grad_norm": 1.448259035797744, + "learning_rate": 2.9907424076728417e-06, + "loss": 0.5615, + "step": 6641 + }, + { + "epoch": 0.8160707703649097, + "grad_norm": 1.0820879805593566, + "learning_rate": 2.986888834602089e-06, + "loss": 0.5001, + "step": 6642 + }, + { + "epoch": 0.8161936355817668, + "grad_norm": 1.0661010836289169, + "learning_rate": 2.9830374712652235e-06, + "loss": 0.4813, + "step": 6643 + }, + { + "epoch": 0.816316500798624, + "grad_norm": 1.2151525275855166, + "learning_rate": 2.9791883183706823e-06, + "loss": 0.6267, + "step": 6644 + }, + { + "epoch": 0.8164393660154811, + "grad_norm": 1.0278831543573936, + "learning_rate": 2.975341376626496e-06, + "loss": 0.5935, + "step": 6645 + }, + { + "epoch": 0.8165622312323382, + "grad_norm": 1.02205664773775, + "learning_rate": 2.971496646740276e-06, + "loss": 0.531, + "step": 6646 + }, + { + "epoch": 0.8166850964491952, + "grad_norm": 1.6279077271845546, + "learning_rate": 2.9676541294192423e-06, + "loss": 0.5692, + "step": 6647 + }, + { + "epoch": 0.8168079616660523, + "grad_norm": 1.584983072270823, + "learning_rate": 2.9638138253701974e-06, + "loss": 0.5804, + "step": 6648 + }, + { + "epoch": 0.8169308268829094, + "grad_norm": 1.1046145144490644, + "learning_rate": 2.9599757352995466e-06, + "loss": 0.5285, + "step": 6649 + }, + { + "epoch": 0.8170536920997665, + "grad_norm": 1.164620131032551, + "learning_rate": 2.9561398599132733e-06, + "loss": 0.5489, + "step": 6650 + }, + { + "epoch": 0.8171765573166236, + "grad_norm": 1.2014289776083942, + "learning_rate": 
2.9523061999169646e-06, + "loss": 0.6245, + "step": 6651 + }, + { + "epoch": 0.8172994225334808, + "grad_norm": 1.1420238064805255, + "learning_rate": 2.9484747560157986e-06, + "loss": 0.6696, + "step": 6652 + }, + { + "epoch": 0.8174222877503379, + "grad_norm": 1.0226800960630062, + "learning_rate": 2.944645528914548e-06, + "loss": 0.6125, + "step": 6653 + }, + { + "epoch": 0.817545152967195, + "grad_norm": 1.263234788677675, + "learning_rate": 2.9408185193175673e-06, + "loss": 0.6009, + "step": 6654 + }, + { + "epoch": 0.8176680181840521, + "grad_norm": 1.0963894815570328, + "learning_rate": 2.9369937279288138e-06, + "loss": 0.5181, + "step": 6655 + }, + { + "epoch": 0.8177908834009092, + "grad_norm": 1.0929934948098048, + "learning_rate": 2.9331711554518364e-06, + "loss": 0.626, + "step": 6656 + }, + { + "epoch": 0.8179137486177663, + "grad_norm": 1.4359150319593135, + "learning_rate": 2.9293508025897644e-06, + "loss": 0.678, + "step": 6657 + }, + { + "epoch": 0.8180366138346234, + "grad_norm": 1.2301269771260959, + "learning_rate": 2.9255326700453365e-06, + "loss": 0.7046, + "step": 6658 + }, + { + "epoch": 0.8181594790514806, + "grad_norm": 1.4431381161082344, + "learning_rate": 2.9217167585208587e-06, + "loss": 0.652, + "step": 6659 + }, + { + "epoch": 0.8182823442683377, + "grad_norm": 1.2741470987590664, + "learning_rate": 2.917903068718262e-06, + "loss": 0.6521, + "step": 6660 + }, + { + "epoch": 0.8184052094851948, + "grad_norm": 1.0947300207977313, + "learning_rate": 2.914091601339036e-06, + "loss": 0.5886, + "step": 6661 + }, + { + "epoch": 0.8185280747020518, + "grad_norm": 1.6204572721787587, + "learning_rate": 2.9102823570842846e-06, + "loss": 0.6271, + "step": 6662 + }, + { + "epoch": 0.8186509399189089, + "grad_norm": 1.110354373559304, + "learning_rate": 2.9064753366546836e-06, + "loss": 0.5593, + "step": 6663 + }, + { + "epoch": 0.818773805135766, + "grad_norm": 1.2220960792925708, + "learning_rate": 2.9026705407505165e-06, + "loss": 0.5542, + "step": 6664 + }, + { + "epoch": 0.8188966703526231, + "grad_norm": 1.0459882029403638, + "learning_rate": 2.8988679700716534e-06, + "loss": 0.565, + "step": 6665 + }, + { + "epoch": 0.8190195355694803, + "grad_norm": 1.3742594376107566, + "learning_rate": 2.89506762531754e-06, + "loss": 0.5362, + "step": 6666 + }, + { + "epoch": 0.8191424007863374, + "grad_norm": 1.2677968403144895, + "learning_rate": 2.891269507187242e-06, + "loss": 0.5853, + "step": 6667 + }, + { + "epoch": 0.8192652660031945, + "grad_norm": 1.376411475787457, + "learning_rate": 2.887473616379387e-06, + "loss": 0.6112, + "step": 6668 + }, + { + "epoch": 0.8193881312200516, + "grad_norm": 1.3405592903620076, + "learning_rate": 2.8836799535922116e-06, + "loss": 0.6079, + "step": 6669 + }, + { + "epoch": 0.8195109964369087, + "grad_norm": 1.1794673365212178, + "learning_rate": 2.8798885195235224e-06, + "loss": 0.5116, + "step": 6670 + }, + { + "epoch": 0.8196338616537658, + "grad_norm": 1.3914273727394872, + "learning_rate": 2.876099314870747e-06, + "loss": 0.5637, + "step": 6671 + }, + { + "epoch": 0.819756726870623, + "grad_norm": 1.0127705318110645, + "learning_rate": 2.8723123403308726e-06, + "loss": 0.5782, + "step": 6672 + }, + { + "epoch": 0.8198795920874801, + "grad_norm": 1.1326996966192378, + "learning_rate": 2.868527596600497e-06, + "loss": 0.6257, + "step": 6673 + }, + { + "epoch": 0.8200024573043372, + "grad_norm": 1.6723540848975844, + "learning_rate": 2.86474508437579e-06, + "loss": 0.521, + "step": 6674 + }, + { + "epoch": 0.8201253225211943, + 
"grad_norm": 1.219717050945167, + "learning_rate": 2.860964804352525e-06, + "loss": 0.5168, + "step": 6675 + }, + { + "epoch": 0.8202481877380513, + "grad_norm": 1.1192611111718203, + "learning_rate": 2.8571867572260626e-06, + "loss": 0.566, + "step": 6676 + }, + { + "epoch": 0.8203710529549084, + "grad_norm": 1.0999282645537292, + "learning_rate": 2.8534109436913445e-06, + "loss": 0.5611, + "step": 6677 + }, + { + "epoch": 0.8204939181717655, + "grad_norm": 1.1246203680190694, + "learning_rate": 2.8496373644429095e-06, + "loss": 0.6206, + "step": 6678 + }, + { + "epoch": 0.8206167833886227, + "grad_norm": 1.0551399699641502, + "learning_rate": 2.8458660201748836e-06, + "loss": 0.6029, + "step": 6679 + }, + { + "epoch": 0.8207396486054798, + "grad_norm": 1.4174703812547376, + "learning_rate": 2.842096911580985e-06, + "loss": 0.508, + "step": 6680 + }, + { + "epoch": 0.8208625138223369, + "grad_norm": 1.2352559103495444, + "learning_rate": 2.8383300393545098e-06, + "loss": 0.593, + "step": 6681 + }, + { + "epoch": 0.820985379039194, + "grad_norm": 1.7575841772027923, + "learning_rate": 2.834565404188351e-06, + "loss": 0.5423, + "step": 6682 + }, + { + "epoch": 0.8211082442560511, + "grad_norm": 1.3678715395975665, + "learning_rate": 2.8308030067749955e-06, + "loss": 0.7476, + "step": 6683 + }, + { + "epoch": 0.8212311094729082, + "grad_norm": 1.1013341787213586, + "learning_rate": 2.8270428478065015e-06, + "loss": 0.4219, + "step": 6684 + }, + { + "epoch": 0.8213539746897653, + "grad_norm": 1.234816175529207, + "learning_rate": 2.8232849279745366e-06, + "loss": 0.5597, + "step": 6685 + }, + { + "epoch": 0.8214768399066225, + "grad_norm": 1.1505280510947504, + "learning_rate": 2.8195292479703315e-06, + "loss": 0.5565, + "step": 6686 + }, + { + "epoch": 0.8215997051234796, + "grad_norm": 1.293565021906243, + "learning_rate": 2.815775808484737e-06, + "loss": 0.4525, + "step": 6687 + }, + { + "epoch": 0.8217225703403367, + "grad_norm": 1.2246973199185827, + "learning_rate": 2.8120246102081614e-06, + "loss": 0.6332, + "step": 6688 + }, + { + "epoch": 0.8218454355571938, + "grad_norm": 1.1981459450721315, + "learning_rate": 2.808275653830617e-06, + "loss": 0.5691, + "step": 6689 + }, + { + "epoch": 0.8219683007740509, + "grad_norm": 1.0864360521171215, + "learning_rate": 2.804528940041699e-06, + "loss": 0.4738, + "step": 6690 + }, + { + "epoch": 0.8220911659909079, + "grad_norm": 1.549297906294064, + "learning_rate": 2.800784469530596e-06, + "loss": 0.5843, + "step": 6691 + }, + { + "epoch": 0.822214031207765, + "grad_norm": 1.0538714499954862, + "learning_rate": 2.797042242986071e-06, + "loss": 0.5612, + "step": 6692 + }, + { + "epoch": 0.8223368964246222, + "grad_norm": 1.2166665343929424, + "learning_rate": 2.7933022610964877e-06, + "loss": 0.6322, + "step": 6693 + }, + { + "epoch": 0.8224597616414793, + "grad_norm": 1.6505162137711726, + "learning_rate": 2.7895645245497926e-06, + "loss": 0.5546, + "step": 6694 + }, + { + "epoch": 0.8225826268583364, + "grad_norm": 1.4206748954461306, + "learning_rate": 2.7858290340335126e-06, + "loss": 0.479, + "step": 6695 + }, + { + "epoch": 0.8227054920751935, + "grad_norm": 1.1580981686423113, + "learning_rate": 2.7820957902347744e-06, + "loss": 0.6716, + "step": 6696 + }, + { + "epoch": 0.8228283572920506, + "grad_norm": 0.9362086848140213, + "learning_rate": 2.7783647938402724e-06, + "loss": 0.5586, + "step": 6697 + }, + { + "epoch": 0.8229512225089077, + "grad_norm": 1.1955384393283492, + "learning_rate": 2.7746360455363123e-06, + "loss": 0.5219, + 
"step": 6698 + }, + { + "epoch": 0.8230740877257648, + "grad_norm": 1.2716538952923222, + "learning_rate": 2.7709095460087656e-06, + "loss": 0.5186, + "step": 6699 + }, + { + "epoch": 0.823196952942622, + "grad_norm": 1.202886058399535, + "learning_rate": 2.767185295943101e-06, + "loss": 0.5794, + "step": 6700 + }, + { + "epoch": 0.8233198181594791, + "grad_norm": 1.1275291781855898, + "learning_rate": 2.7634632960243667e-06, + "loss": 0.5676, + "step": 6701 + }, + { + "epoch": 0.8234426833763362, + "grad_norm": 1.0508388977069862, + "learning_rate": 2.759743546937202e-06, + "loss": 0.4392, + "step": 6702 + }, + { + "epoch": 0.8235655485931933, + "grad_norm": 1.3596213040145566, + "learning_rate": 2.756026049365834e-06, + "loss": 0.5566, + "step": 6703 + }, + { + "epoch": 0.8236884138100504, + "grad_norm": 1.2148379880683073, + "learning_rate": 2.7523108039940662e-06, + "loss": 0.5347, + "step": 6704 + }, + { + "epoch": 0.8238112790269074, + "grad_norm": 1.1893431121837401, + "learning_rate": 2.748597811505297e-06, + "loss": 0.5564, + "step": 6705 + }, + { + "epoch": 0.8239341442437645, + "grad_norm": 1.05767644230728, + "learning_rate": 2.744887072582507e-06, + "loss": 0.4321, + "step": 6706 + }, + { + "epoch": 0.8240570094606217, + "grad_norm": 0.9834984731299086, + "learning_rate": 2.7411785879082663e-06, + "loss": 0.5399, + "step": 6707 + }, + { + "epoch": 0.8241798746774788, + "grad_norm": 1.2461144278014782, + "learning_rate": 2.737472358164721e-06, + "loss": 0.3974, + "step": 6708 + }, + { + "epoch": 0.8243027398943359, + "grad_norm": 0.9509187069299544, + "learning_rate": 2.7337683840336074e-06, + "loss": 0.6535, + "step": 6709 + }, + { + "epoch": 0.824425605111193, + "grad_norm": 1.4227799917916462, + "learning_rate": 2.7300666661962558e-06, + "loss": 0.612, + "step": 6710 + }, + { + "epoch": 0.8245484703280501, + "grad_norm": 1.2731069826355923, + "learning_rate": 2.726367205333563e-06, + "loss": 0.694, + "step": 6711 + }, + { + "epoch": 0.8246713355449072, + "grad_norm": 1.1192784133038312, + "learning_rate": 2.7226700021260267e-06, + "loss": 0.5372, + "step": 6712 + }, + { + "epoch": 0.8247942007617644, + "grad_norm": 1.1586262840332504, + "learning_rate": 2.718975057253722e-06, + "loss": 0.6421, + "step": 6713 + }, + { + "epoch": 0.8249170659786215, + "grad_norm": 1.103265875031216, + "learning_rate": 2.7152823713963125e-06, + "loss": 0.5731, + "step": 6714 + }, + { + "epoch": 0.8250399311954786, + "grad_norm": 1.2326790460630666, + "learning_rate": 2.7115919452330403e-06, + "loss": 0.7605, + "step": 6715 + }, + { + "epoch": 0.8251627964123357, + "grad_norm": 1.09978782657009, + "learning_rate": 2.7079037794427346e-06, + "loss": 0.5048, + "step": 6716 + }, + { + "epoch": 0.8252856616291928, + "grad_norm": 1.221544360755342, + "learning_rate": 2.704217874703812e-06, + "loss": 0.5354, + "step": 6717 + }, + { + "epoch": 0.8254085268460499, + "grad_norm": 1.3188020376134162, + "learning_rate": 2.7005342316942748e-06, + "loss": 0.6741, + "step": 6718 + }, + { + "epoch": 0.825531392062907, + "grad_norm": 1.1799220851659824, + "learning_rate": 2.696852851091696e-06, + "loss": 0.5387, + "step": 6719 + }, + { + "epoch": 0.825654257279764, + "grad_norm": 1.0437313632359728, + "learning_rate": 2.6931737335732476e-06, + "loss": 0.5898, + "step": 6720 + }, + { + "epoch": 0.8257771224966212, + "grad_norm": 1.2828370993469216, + "learning_rate": 2.689496879815681e-06, + "loss": 0.6639, + "step": 6721 + }, + { + "epoch": 0.8258999877134783, + "grad_norm": 1.2000903972313532, + 
"learning_rate": 2.685822290495324e-06, + "loss": 0.5169, + "step": 6722 + }, + { + "epoch": 0.8260228529303354, + "grad_norm": 1.2453176661728391, + "learning_rate": 2.6821499662881004e-06, + "loss": 0.5914, + "step": 6723 + }, + { + "epoch": 0.8261457181471925, + "grad_norm": 1.2348465635188044, + "learning_rate": 2.6784799078694987e-06, + "loss": 0.622, + "step": 6724 + }, + { + "epoch": 0.8262685833640496, + "grad_norm": 1.103949748518898, + "learning_rate": 2.674812115914617e-06, + "loss": 0.6275, + "step": 6725 + }, + { + "epoch": 0.8263914485809067, + "grad_norm": 1.3712147806994244, + "learning_rate": 2.6711465910981125e-06, + "loss": 0.5289, + "step": 6726 + }, + { + "epoch": 0.8265143137977639, + "grad_norm": 1.0656816338626431, + "learning_rate": 2.667483334094239e-06, + "loss": 0.5292, + "step": 6727 + }, + { + "epoch": 0.826637179014621, + "grad_norm": 1.1026289375048874, + "learning_rate": 2.6638223455768242e-06, + "loss": 0.5908, + "step": 6728 + }, + { + "epoch": 0.8267600442314781, + "grad_norm": 1.1950040681203076, + "learning_rate": 2.6601636262192874e-06, + "loss": 0.5318, + "step": 6729 + }, + { + "epoch": 0.8268829094483352, + "grad_norm": 1.2993865168631362, + "learning_rate": 2.6565071766946277e-06, + "loss": 0.5659, + "step": 6730 + }, + { + "epoch": 0.8270057746651923, + "grad_norm": 1.1142046733826005, + "learning_rate": 2.6528529976754128e-06, + "loss": 0.5457, + "step": 6731 + }, + { + "epoch": 0.8271286398820494, + "grad_norm": 1.1057480213489241, + "learning_rate": 2.649201089833826e-06, + "loss": 0.6505, + "step": 6732 + }, + { + "epoch": 0.8272515050989065, + "grad_norm": 1.0775837318671488, + "learning_rate": 2.6455514538415943e-06, + "loss": 0.5442, + "step": 6733 + }, + { + "epoch": 0.8273743703157636, + "grad_norm": 1.1715722326599842, + "learning_rate": 2.641904090370056e-06, + "loss": 0.5296, + "step": 6734 + }, + { + "epoch": 0.8274972355326207, + "grad_norm": 1.133066026055284, + "learning_rate": 2.638259000090109e-06, + "loss": 0.5006, + "step": 6735 + }, + { + "epoch": 0.8276201007494778, + "grad_norm": 1.2965043905697884, + "learning_rate": 2.634616183672256e-06, + "loss": 0.5812, + "step": 6736 + }, + { + "epoch": 0.8277429659663349, + "grad_norm": 1.1362180896367655, + "learning_rate": 2.6309756417865607e-06, + "loss": 0.5852, + "step": 6737 + }, + { + "epoch": 0.827865831183192, + "grad_norm": 1.2819979184423702, + "learning_rate": 2.6273373751026837e-06, + "loss": 0.6023, + "step": 6738 + }, + { + "epoch": 0.8279886964000491, + "grad_norm": 1.1075058326751737, + "learning_rate": 2.6237013842898533e-06, + "loss": 0.5113, + "step": 6739 + }, + { + "epoch": 0.8281115616169062, + "grad_norm": 1.0374971612355373, + "learning_rate": 2.6200676700168898e-06, + "loss": 0.6453, + "step": 6740 + }, + { + "epoch": 0.8282344268337634, + "grad_norm": 1.1985649477886615, + "learning_rate": 2.616436232952196e-06, + "loss": 0.7429, + "step": 6741 + }, + { + "epoch": 0.8283572920506205, + "grad_norm": 1.2428549876462966, + "learning_rate": 2.6128070737637437e-06, + "loss": 0.5348, + "step": 6742 + }, + { + "epoch": 0.8284801572674776, + "grad_norm": 1.3843463063007098, + "learning_rate": 2.609180193119095e-06, + "loss": 0.4913, + "step": 6743 + }, + { + "epoch": 0.8286030224843347, + "grad_norm": 1.17806790704285, + "learning_rate": 2.6055555916853945e-06, + "loss": 0.5002, + "step": 6744 + }, + { + "epoch": 0.8287258877011918, + "grad_norm": 1.3855038103947404, + "learning_rate": 2.601933270129364e-06, + "loss": 0.5611, + "step": 6745 + }, + { + "epoch": 
0.8288487529180489, + "grad_norm": 1.4201895144681331, + "learning_rate": 2.5983132291173007e-06, + "loss": 0.6038, + "step": 6746 + }, + { + "epoch": 0.828971618134906, + "grad_norm": 1.2804528512646467, + "learning_rate": 2.5946954693150915e-06, + "loss": 0.5383, + "step": 6747 + }, + { + "epoch": 0.8290944833517632, + "grad_norm": 1.4133322981603207, + "learning_rate": 2.591079991388203e-06, + "loss": 0.5701, + "step": 6748 + }, + { + "epoch": 0.8292173485686202, + "grad_norm": 1.356532178487605, + "learning_rate": 2.5874667960016725e-06, + "loss": 0.6107, + "step": 6749 + }, + { + "epoch": 0.8293402137854773, + "grad_norm": 1.3090798478797694, + "learning_rate": 2.5838558838201304e-06, + "loss": 0.5284, + "step": 6750 + }, + { + "epoch": 0.8294630790023344, + "grad_norm": 1.3690586069530413, + "learning_rate": 2.580247255507769e-06, + "loss": 0.636, + "step": 6751 + }, + { + "epoch": 0.8295859442191915, + "grad_norm": 1.206100248302592, + "learning_rate": 2.576640911728387e-06, + "loss": 0.5986, + "step": 6752 + }, + { + "epoch": 0.8297088094360486, + "grad_norm": 1.3717278332705034, + "learning_rate": 2.573036853145337e-06, + "loss": 0.4865, + "step": 6753 + }, + { + "epoch": 0.8298316746529057, + "grad_norm": 1.2346591351248921, + "learning_rate": 2.569435080421567e-06, + "loss": 0.559, + "step": 6754 + }, + { + "epoch": 0.8299545398697629, + "grad_norm": 0.9919722849938025, + "learning_rate": 2.5658355942195994e-06, + "loss": 0.6822, + "step": 6755 + }, + { + "epoch": 0.83007740508662, + "grad_norm": 1.163215737948197, + "learning_rate": 2.5622383952015386e-06, + "loss": 0.5657, + "step": 6756 + }, + { + "epoch": 0.8302002703034771, + "grad_norm": 1.2348485850347681, + "learning_rate": 2.5586434840290597e-06, + "loss": 0.5667, + "step": 6757 + }, + { + "epoch": 0.8303231355203342, + "grad_norm": 1.2415098018323636, + "learning_rate": 2.555050861363428e-06, + "loss": 0.5982, + "step": 6758 + }, + { + "epoch": 0.8304460007371913, + "grad_norm": 1.090014099957309, + "learning_rate": 2.5514605278654844e-06, + "loss": 0.5505, + "step": 6759 + }, + { + "epoch": 0.8305688659540484, + "grad_norm": 1.0384056362094325, + "learning_rate": 2.547872484195642e-06, + "loss": 0.5823, + "step": 6760 + }, + { + "epoch": 0.8306917311709056, + "grad_norm": 1.024749825732001, + "learning_rate": 2.544286731013905e-06, + "loss": 0.5605, + "step": 6761 + }, + { + "epoch": 0.8308145963877627, + "grad_norm": 1.256143566067871, + "learning_rate": 2.540703268979838e-06, + "loss": 0.5543, + "step": 6762 + }, + { + "epoch": 0.8309374616046198, + "grad_norm": 1.3706659372737529, + "learning_rate": 2.5371220987526105e-06, + "loss": 0.5796, + "step": 6763 + }, + { + "epoch": 0.8310603268214768, + "grad_norm": 1.7010787757748296, + "learning_rate": 2.533543220990944e-06, + "loss": 0.6681, + "step": 6764 + }, + { + "epoch": 0.8311831920383339, + "grad_norm": 1.0019446906974296, + "learning_rate": 2.5299666363531594e-06, + "loss": 0.614, + "step": 6765 + }, + { + "epoch": 0.831306057255191, + "grad_norm": 1.181270799211324, + "learning_rate": 2.526392345497136e-06, + "loss": 0.6422, + "step": 6766 + }, + { + "epoch": 0.8314289224720481, + "grad_norm": 1.3342129485230054, + "learning_rate": 2.522820349080348e-06, + "loss": 0.5605, + "step": 6767 + }, + { + "epoch": 0.8315517876889053, + "grad_norm": 1.1678601118515612, + "learning_rate": 2.5192506477598415e-06, + "loss": 0.499, + "step": 6768 + }, + { + "epoch": 0.8316746529057624, + "grad_norm": 1.4431158706911964, + "learning_rate": 2.515683242192236e-06, + "loss": 
0.6446, + "step": 6769 + }, + { + "epoch": 0.8317975181226195, + "grad_norm": 0.7927753618197991, + "learning_rate": 2.5121181330337336e-06, + "loss": 0.576, + "step": 6770 + }, + { + "epoch": 0.8319203833394766, + "grad_norm": 1.1640749253824674, + "learning_rate": 2.5085553209401123e-06, + "loss": 0.4917, + "step": 6771 + }, + { + "epoch": 0.8320432485563337, + "grad_norm": 1.3415999355854327, + "learning_rate": 2.5049948065667355e-06, + "loss": 0.5644, + "step": 6772 + }, + { + "epoch": 0.8321661137731908, + "grad_norm": 1.3670645786417723, + "learning_rate": 2.5014365905685237e-06, + "loss": 0.7449, + "step": 6773 + }, + { + "epoch": 0.832288978990048, + "grad_norm": 1.2439839368707077, + "learning_rate": 2.497880673600002e-06, + "loss": 0.6078, + "step": 6774 + }, + { + "epoch": 0.8324118442069051, + "grad_norm": 1.0061908624524034, + "learning_rate": 2.494327056315247e-06, + "loss": 0.585, + "step": 6775 + }, + { + "epoch": 0.8325347094237622, + "grad_norm": 1.1512145509299403, + "learning_rate": 2.4907757393679326e-06, + "loss": 0.6337, + "step": 6776 + }, + { + "epoch": 0.8326575746406193, + "grad_norm": 1.3345155391633208, + "learning_rate": 2.487226723411291e-06, + "loss": 0.5735, + "step": 6777 + }, + { + "epoch": 0.8327804398574763, + "grad_norm": 1.40636569427536, + "learning_rate": 2.4836800090981455e-06, + "loss": 0.7069, + "step": 6778 + }, + { + "epoch": 0.8329033050743334, + "grad_norm": 1.2005396560504944, + "learning_rate": 2.4801355970808955e-06, + "loss": 0.6887, + "step": 6779 + }, + { + "epoch": 0.8330261702911905, + "grad_norm": 1.2277900921105345, + "learning_rate": 2.4765934880115042e-06, + "loss": 0.516, + "step": 6780 + }, + { + "epoch": 0.8331490355080476, + "grad_norm": 1.1887043397465535, + "learning_rate": 2.4730536825415247e-06, + "loss": 0.5651, + "step": 6781 + }, + { + "epoch": 0.8332719007249048, + "grad_norm": 1.365192714989377, + "learning_rate": 2.4695161813220783e-06, + "loss": 0.5267, + "step": 6782 + }, + { + "epoch": 0.8333947659417619, + "grad_norm": 0.9567203897450611, + "learning_rate": 2.4659809850038724e-06, + "loss": 0.4923, + "step": 6783 + }, + { + "epoch": 0.833517631158619, + "grad_norm": 1.2590127137019946, + "learning_rate": 2.462448094237174e-06, + "loss": 0.652, + "step": 6784 + }, + { + "epoch": 0.8336404963754761, + "grad_norm": 0.9913490018858003, + "learning_rate": 2.458917509671839e-06, + "loss": 0.4959, + "step": 6785 + }, + { + "epoch": 0.8337633615923332, + "grad_norm": 1.347642208923607, + "learning_rate": 2.4553892319573012e-06, + "loss": 0.5232, + "step": 6786 + }, + { + "epoch": 0.8338862268091903, + "grad_norm": 1.0302702059179398, + "learning_rate": 2.4518632617425563e-06, + "loss": 0.6331, + "step": 6787 + }, + { + "epoch": 0.8340090920260474, + "grad_norm": 1.195327860553905, + "learning_rate": 2.4483395996761903e-06, + "loss": 0.6027, + "step": 6788 + }, + { + "epoch": 0.8341319572429046, + "grad_norm": 1.1057681542069673, + "learning_rate": 2.444818246406347e-06, + "loss": 0.6788, + "step": 6789 + }, + { + "epoch": 0.8342548224597617, + "grad_norm": 1.171803819367584, + "learning_rate": 2.4412992025807708e-06, + "loss": 0.5732, + "step": 6790 + }, + { + "epoch": 0.8343776876766188, + "grad_norm": 1.5541570071004414, + "learning_rate": 2.437782468846756e-06, + "loss": 0.5699, + "step": 6791 + }, + { + "epoch": 0.8345005528934759, + "grad_norm": 1.365700958918804, + "learning_rate": 2.4342680458511916e-06, + "loss": 0.5762, + "step": 6792 + }, + { + "epoch": 0.8346234181103329, + "grad_norm": 1.3817968884933955, + 
"learning_rate": 2.4307559342405227e-06, + "loss": 0.5013, + "step": 6793 + }, + { + "epoch": 0.83474628332719, + "grad_norm": 1.4007872617384005, + "learning_rate": 2.4272461346607904e-06, + "loss": 0.5618, + "step": 6794 + }, + { + "epoch": 0.8348691485440471, + "grad_norm": 1.3563995428829838, + "learning_rate": 2.4237386477575917e-06, + "loss": 0.5614, + "step": 6795 + }, + { + "epoch": 0.8349920137609043, + "grad_norm": 1.1285463595879426, + "learning_rate": 2.420233474176109e-06, + "loss": 0.5616, + "step": 6796 + }, + { + "epoch": 0.8351148789777614, + "grad_norm": 1.1488647706957578, + "learning_rate": 2.4167306145610996e-06, + "loss": 0.6441, + "step": 6797 + }, + { + "epoch": 0.8352377441946185, + "grad_norm": 1.1702438936311688, + "learning_rate": 2.413230069556885e-06, + "loss": 0.6706, + "step": 6798 + }, + { + "epoch": 0.8353606094114756, + "grad_norm": 1.2814855723793075, + "learning_rate": 2.409731839807375e-06, + "loss": 0.7008, + "step": 6799 + }, + { + "epoch": 0.8354834746283327, + "grad_norm": 1.1007601819765673, + "learning_rate": 2.4062359259560348e-06, + "loss": 0.5221, + "step": 6800 + }, + { + "epoch": 0.8356063398451898, + "grad_norm": 1.2670593208790457, + "learning_rate": 2.4027423286459284e-06, + "loss": 0.5589, + "step": 6801 + }, + { + "epoch": 0.835729205062047, + "grad_norm": 1.443193546471224, + "learning_rate": 2.3992510485196716e-06, + "loss": 0.6503, + "step": 6802 + }, + { + "epoch": 0.8358520702789041, + "grad_norm": 1.1912053206381632, + "learning_rate": 2.3957620862194695e-06, + "loss": 0.5141, + "step": 6803 + }, + { + "epoch": 0.8359749354957612, + "grad_norm": 1.1670943890320036, + "learning_rate": 2.392275442387087e-06, + "loss": 0.5424, + "step": 6804 + }, + { + "epoch": 0.8360978007126183, + "grad_norm": 1.0959559406062556, + "learning_rate": 2.3887911176638737e-06, + "loss": 0.5653, + "step": 6805 + }, + { + "epoch": 0.8362206659294754, + "grad_norm": 1.103361456052754, + "learning_rate": 2.3853091126907493e-06, + "loss": 0.4627, + "step": 6806 + }, + { + "epoch": 0.8363435311463324, + "grad_norm": 1.348824232266682, + "learning_rate": 2.381829428108203e-06, + "loss": 0.4428, + "step": 6807 + }, + { + "epoch": 0.8364663963631895, + "grad_norm": 1.2575225502546108, + "learning_rate": 2.3783520645562996e-06, + "loss": 0.5699, + "step": 6808 + }, + { + "epoch": 0.8365892615800467, + "grad_norm": 1.14253230087687, + "learning_rate": 2.374877022674682e-06, + "loss": 0.5433, + "step": 6809 + }, + { + "epoch": 0.8367121267969038, + "grad_norm": 1.1844506622575386, + "learning_rate": 2.3714043031025608e-06, + "loss": 0.6338, + "step": 6810 + }, + { + "epoch": 0.8368349920137609, + "grad_norm": 1.2001261929452505, + "learning_rate": 2.3679339064787165e-06, + "loss": 0.5371, + "step": 6811 + }, + { + "epoch": 0.836957857230618, + "grad_norm": 1.2108825857238563, + "learning_rate": 2.364465833441507e-06, + "loss": 0.584, + "step": 6812 + }, + { + "epoch": 0.8370807224474751, + "grad_norm": 1.358888245891515, + "learning_rate": 2.3610000846288637e-06, + "loss": 0.5448, + "step": 6813 + }, + { + "epoch": 0.8372035876643322, + "grad_norm": 1.1173590740611345, + "learning_rate": 2.3575366606782916e-06, + "loss": 0.583, + "step": 6814 + }, + { + "epoch": 0.8373264528811893, + "grad_norm": 1.1850474700561449, + "learning_rate": 2.3540755622268597e-06, + "loss": 0.5614, + "step": 6815 + }, + { + "epoch": 0.8374493180980465, + "grad_norm": 1.137570536277114, + "learning_rate": 2.3506167899112146e-06, + "loss": 0.4902, + "step": 6816 + }, + { + "epoch": 
0.8375721833149036, + "grad_norm": 1.4700998487200152, + "learning_rate": 2.34716034436758e-06, + "loss": 0.5428, + "step": 6817 + }, + { + "epoch": 0.8376950485317607, + "grad_norm": 1.3601037801587244, + "learning_rate": 2.3437062262317398e-06, + "loss": 0.5704, + "step": 6818 + }, + { + "epoch": 0.8378179137486178, + "grad_norm": 1.3198350595135102, + "learning_rate": 2.3402544361390614e-06, + "loss": 0.623, + "step": 6819 + }, + { + "epoch": 0.8379407789654749, + "grad_norm": 1.329421282363694, + "learning_rate": 2.3368049747244786e-06, + "loss": 0.6589, + "step": 6820 + }, + { + "epoch": 0.838063644182332, + "grad_norm": 0.990077082537294, + "learning_rate": 2.3333578426225e-06, + "loss": 0.6808, + "step": 6821 + }, + { + "epoch": 0.838186509399189, + "grad_norm": 1.2249565343764999, + "learning_rate": 2.329913040467195e-06, + "loss": 0.5145, + "step": 6822 + }, + { + "epoch": 0.8383093746160462, + "grad_norm": 1.0652389721538855, + "learning_rate": 2.326470568892221e-06, + "loss": 0.6507, + "step": 6823 + }, + { + "epoch": 0.8384322398329033, + "grad_norm": 1.2572113511689809, + "learning_rate": 2.3230304285307956e-06, + "loss": 0.5925, + "step": 6824 + }, + { + "epoch": 0.8385551050497604, + "grad_norm": 1.363281602968501, + "learning_rate": 2.319592620015708e-06, + "loss": 0.566, + "step": 6825 + }, + { + "epoch": 0.8386779702666175, + "grad_norm": 1.2778184282473029, + "learning_rate": 2.3161571439793255e-06, + "loss": 0.4766, + "step": 6826 + }, + { + "epoch": 0.8388008354834746, + "grad_norm": 1.2556696258744082, + "learning_rate": 2.3127240010535728e-06, + "loss": 0.7007, + "step": 6827 + }, + { + "epoch": 0.8389237007003317, + "grad_norm": 1.1960655347535225, + "learning_rate": 2.309293191869966e-06, + "loss": 0.5521, + "step": 6828 + }, + { + "epoch": 0.8390465659171888, + "grad_norm": 1.164009283262368, + "learning_rate": 2.305864717059571e-06, + "loss": 0.6342, + "step": 6829 + }, + { + "epoch": 0.839169431134046, + "grad_norm": 1.2694664033478418, + "learning_rate": 2.3024385772530408e-06, + "loss": 0.5156, + "step": 6830 + }, + { + "epoch": 0.8392922963509031, + "grad_norm": 1.7940408437758117, + "learning_rate": 2.2990147730805855e-06, + "loss": 0.5489, + "step": 6831 + }, + { + "epoch": 0.8394151615677602, + "grad_norm": 1.4314802588855344, + "learning_rate": 2.2955933051719924e-06, + "loss": 0.5945, + "step": 6832 + }, + { + "epoch": 0.8395380267846173, + "grad_norm": 1.2956245552840162, + "learning_rate": 2.292174174156623e-06, + "loss": 0.633, + "step": 6833 + }, + { + "epoch": 0.8396608920014744, + "grad_norm": 1.1032378598724923, + "learning_rate": 2.2887573806633983e-06, + "loss": 0.5005, + "step": 6834 + }, + { + "epoch": 0.8397837572183315, + "grad_norm": 1.4436948606897222, + "learning_rate": 2.285342925320818e-06, + "loss": 0.573, + "step": 6835 + }, + { + "epoch": 0.8399066224351885, + "grad_norm": 1.0262916654716179, + "learning_rate": 2.2819308087569502e-06, + "loss": 0.6227, + "step": 6836 + }, + { + "epoch": 0.8400294876520457, + "grad_norm": 1.4602043654642334, + "learning_rate": 2.2785210315994325e-06, + "loss": 0.6326, + "step": 6837 + }, + { + "epoch": 0.8401523528689028, + "grad_norm": 1.2735939799020335, + "learning_rate": 2.2751135944754637e-06, + "loss": 0.6044, + "step": 6838 + }, + { + "epoch": 0.8402752180857599, + "grad_norm": 1.0284444574082756, + "learning_rate": 2.2717084980118304e-06, + "loss": 0.5668, + "step": 6839 + }, + { + "epoch": 0.840398083302617, + "grad_norm": 1.0186080998247287, + "learning_rate": 2.2683057428348715e-06, + 
"loss": 0.609, + "step": 6840 + }, + { + "epoch": 0.8405209485194741, + "grad_norm": 1.220851595482856, + "learning_rate": 2.264905329570506e-06, + "loss": 0.5973, + "step": 6841 + }, + { + "epoch": 0.8406438137363312, + "grad_norm": 1.2431263052500283, + "learning_rate": 2.2615072588442116e-06, + "loss": 0.6312, + "step": 6842 + }, + { + "epoch": 0.8407666789531884, + "grad_norm": 1.0751511835196335, + "learning_rate": 2.258111531281045e-06, + "loss": 0.4625, + "step": 6843 + }, + { + "epoch": 0.8408895441700455, + "grad_norm": 1.1118636615094226, + "learning_rate": 2.2547181475056313e-06, + "loss": 0.5287, + "step": 6844 + }, + { + "epoch": 0.8410124093869026, + "grad_norm": 1.2008393179658705, + "learning_rate": 2.251327108142155e-06, + "loss": 0.5407, + "step": 6845 + }, + { + "epoch": 0.8411352746037597, + "grad_norm": 1.1586495062323667, + "learning_rate": 2.2479384138143794e-06, + "loss": 0.56, + "step": 6846 + }, + { + "epoch": 0.8412581398206168, + "grad_norm": 1.0806378989840746, + "learning_rate": 2.2445520651456326e-06, + "loss": 0.4778, + "step": 6847 + }, + { + "epoch": 0.8413810050374739, + "grad_norm": 1.343216190248116, + "learning_rate": 2.2411680627588143e-06, + "loss": 0.7237, + "step": 6848 + }, + { + "epoch": 0.841503870254331, + "grad_norm": 1.196412644606064, + "learning_rate": 2.237786407276384e-06, + "loss": 0.5438, + "step": 6849 + }, + { + "epoch": 0.8416267354711882, + "grad_norm": 1.177508184370154, + "learning_rate": 2.234407099320378e-06, + "loss": 0.5144, + "step": 6850 + }, + { + "epoch": 0.8417496006880452, + "grad_norm": 1.0593085512927205, + "learning_rate": 2.2310301395124016e-06, + "loss": 0.6224, + "step": 6851 + }, + { + "epoch": 0.8418724659049023, + "grad_norm": 1.0191860816396656, + "learning_rate": 2.227655528473618e-06, + "loss": 0.6072, + "step": 6852 + }, + { + "epoch": 0.8419953311217594, + "grad_norm": 1.218209509101871, + "learning_rate": 2.224283266824773e-06, + "loss": 0.6005, + "step": 6853 + }, + { + "epoch": 0.8421181963386165, + "grad_norm": 1.1194156406000617, + "learning_rate": 2.22091335518616e-06, + "loss": 0.5623, + "step": 6854 + }, + { + "epoch": 0.8422410615554736, + "grad_norm": 1.1784403862192825, + "learning_rate": 2.2175457941776654e-06, + "loss": 0.5665, + "step": 6855 + }, + { + "epoch": 0.8423639267723307, + "grad_norm": 1.1971085763264901, + "learning_rate": 2.214180584418723e-06, + "loss": 0.5925, + "step": 6856 + }, + { + "epoch": 0.8424867919891879, + "grad_norm": 1.1235386142270245, + "learning_rate": 2.2108177265283468e-06, + "loss": 0.5701, + "step": 6857 + }, + { + "epoch": 0.842609657206045, + "grad_norm": 1.310039064734899, + "learning_rate": 2.207457221125101e-06, + "loss": 0.581, + "step": 6858 + }, + { + "epoch": 0.8427325224229021, + "grad_norm": 1.133331282736554, + "learning_rate": 2.204099068827144e-06, + "loss": 0.562, + "step": 6859 + }, + { + "epoch": 0.8428553876397592, + "grad_norm": 1.1924379415524147, + "learning_rate": 2.200743270252177e-06, + "loss": 0.6492, + "step": 6860 + }, + { + "epoch": 0.8429782528566163, + "grad_norm": 1.1779305776589508, + "learning_rate": 2.1973898260174773e-06, + "loss": 0.5777, + "step": 6861 + }, + { + "epoch": 0.8431011180734734, + "grad_norm": 1.3360749686118727, + "learning_rate": 2.1940387367398956e-06, + "loss": 0.6504, + "step": 6862 + }, + { + "epoch": 0.8432239832903305, + "grad_norm": 1.124216894356352, + "learning_rate": 2.1906900030358353e-06, + "loss": 0.4269, + "step": 6863 + }, + { + "epoch": 0.8433468485071877, + "grad_norm": 1.161785291957812, + 
"learning_rate": 2.1873436255212814e-06, + "loss": 0.5938, + "step": 6864 + }, + { + "epoch": 0.8434697137240448, + "grad_norm": 1.111500258402825, + "learning_rate": 2.183999604811767e-06, + "loss": 0.663, + "step": 6865 + }, + { + "epoch": 0.8435925789409018, + "grad_norm": 1.1016112115122996, + "learning_rate": 2.1806579415224172e-06, + "loss": 0.584, + "step": 6866 + }, + { + "epoch": 0.8437154441577589, + "grad_norm": 1.0978664405714536, + "learning_rate": 2.1773186362678993e-06, + "loss": 0.6065, + "step": 6867 + }, + { + "epoch": 0.843838309374616, + "grad_norm": 1.2102102693034091, + "learning_rate": 2.1739816896624643e-06, + "loss": 0.687, + "step": 6868 + }, + { + "epoch": 0.8439611745914731, + "grad_norm": 1.183715431575639, + "learning_rate": 2.170647102319914e-06, + "loss": 0.558, + "step": 6869 + }, + { + "epoch": 0.8440840398083302, + "grad_norm": 1.043688855163674, + "learning_rate": 2.1673148748536287e-06, + "loss": 0.5754, + "step": 6870 + }, + { + "epoch": 0.8442069050251874, + "grad_norm": 1.1655764093126952, + "learning_rate": 2.1639850078765523e-06, + "loss": 0.6076, + "step": 6871 + }, + { + "epoch": 0.8443297702420445, + "grad_norm": 1.410950804261355, + "learning_rate": 2.1606575020011864e-06, + "loss": 0.5584, + "step": 6872 + }, + { + "epoch": 0.8444526354589016, + "grad_norm": 1.638778174905786, + "learning_rate": 2.157332357839607e-06, + "loss": 0.4989, + "step": 6873 + }, + { + "epoch": 0.8445755006757587, + "grad_norm": 1.410103514895907, + "learning_rate": 2.1540095760034513e-06, + "loss": 0.5667, + "step": 6874 + }, + { + "epoch": 0.8446983658926158, + "grad_norm": 1.4912663814902771, + "learning_rate": 2.15068915710393e-06, + "loss": 0.5584, + "step": 6875 + }, + { + "epoch": 0.8448212311094729, + "grad_norm": 1.1005152512652243, + "learning_rate": 2.1473711017518032e-06, + "loss": 0.538, + "step": 6876 + }, + { + "epoch": 0.84494409632633, + "grad_norm": 1.0538920725111258, + "learning_rate": 2.1440554105574097e-06, + "loss": 0.5884, + "step": 6877 + }, + { + "epoch": 0.8450669615431872, + "grad_norm": 1.099772022386202, + "learning_rate": 2.140742084130649e-06, + "loss": 0.5476, + "step": 6878 + }, + { + "epoch": 0.8451898267600443, + "grad_norm": 1.2179953505895624, + "learning_rate": 2.137431123080991e-06, + "loss": 0.5237, + "step": 6879 + }, + { + "epoch": 0.8453126919769013, + "grad_norm": 1.1458454787268766, + "learning_rate": 2.1341225280174586e-06, + "loss": 0.5099, + "step": 6880 + }, + { + "epoch": 0.8454355571937584, + "grad_norm": 1.1233660811502177, + "learning_rate": 2.13081629954865e-06, + "loss": 0.4907, + "step": 6881 + }, + { + "epoch": 0.8455584224106155, + "grad_norm": 1.698029156120956, + "learning_rate": 2.1275124382827243e-06, + "loss": 0.6413, + "step": 6882 + }, + { + "epoch": 0.8456812876274726, + "grad_norm": 1.3451009561641756, + "learning_rate": 2.1242109448274015e-06, + "loss": 0.578, + "step": 6883 + }, + { + "epoch": 0.8458041528443297, + "grad_norm": 1.1795169964937318, + "learning_rate": 2.120911819789974e-06, + "loss": 0.5302, + "step": 6884 + }, + { + "epoch": 0.8459270180611869, + "grad_norm": 1.3545255521853616, + "learning_rate": 2.117615063777293e-06, + "loss": 0.6243, + "step": 6885 + }, + { + "epoch": 0.846049883278044, + "grad_norm": 1.2876256826176304, + "learning_rate": 2.1143206773957797e-06, + "loss": 0.5721, + "step": 6886 + }, + { + "epoch": 0.8461727484949011, + "grad_norm": 0.9964197761213719, + "learning_rate": 2.1110286612514077e-06, + "loss": 0.4817, + "step": 6887 + }, + { + "epoch": 
0.8462956137117582, + "grad_norm": 1.1855114907052011, + "learning_rate": 2.107739015949725e-06, + "loss": 0.6919, + "step": 6888 + }, + { + "epoch": 0.8464184789286153, + "grad_norm": 1.3483264434918811, + "learning_rate": 2.104451742095845e-06, + "loss": 0.5636, + "step": 6889 + }, + { + "epoch": 0.8465413441454724, + "grad_norm": 1.1447133896646586, + "learning_rate": 2.101166840294433e-06, + "loss": 0.5695, + "step": 6890 + }, + { + "epoch": 0.8466642093623296, + "grad_norm": 1.1413622721630428, + "learning_rate": 2.0978843111497324e-06, + "loss": 0.6007, + "step": 6891 + }, + { + "epoch": 0.8467870745791867, + "grad_norm": 1.0922169550209095, + "learning_rate": 2.0946041552655314e-06, + "loss": 0.6444, + "step": 6892 + }, + { + "epoch": 0.8469099397960438, + "grad_norm": 1.3552531632554912, + "learning_rate": 2.0913263732452093e-06, + "loss": 0.5907, + "step": 6893 + }, + { + "epoch": 0.8470328050129009, + "grad_norm": 1.3288302354622983, + "learning_rate": 2.0880509656916836e-06, + "loss": 0.6255, + "step": 6894 + }, + { + "epoch": 0.8471556702297579, + "grad_norm": 1.2917033007095524, + "learning_rate": 2.0847779332074475e-06, + "loss": 0.6118, + "step": 6895 + }, + { + "epoch": 0.847278535446615, + "grad_norm": 1.1367789630646372, + "learning_rate": 2.081507276394544e-06, + "loss": 0.5518, + "step": 6896 + }, + { + "epoch": 0.8474014006634721, + "grad_norm": 1.3603466013678194, + "learning_rate": 2.078238995854608e-06, + "loss": 0.6155, + "step": 6897 + }, + { + "epoch": 0.8475242658803293, + "grad_norm": 1.194132798791412, + "learning_rate": 2.0749730921888022e-06, + "loss": 0.5517, + "step": 6898 + }, + { + "epoch": 0.8476471310971864, + "grad_norm": 1.1443761555235443, + "learning_rate": 2.0717095659978784e-06, + "loss": 0.5325, + "step": 6899 + }, + { + "epoch": 0.8477699963140435, + "grad_norm": 1.1272635695565338, + "learning_rate": 2.0684484178821333e-06, + "loss": 0.6538, + "step": 6900 + }, + { + "epoch": 0.8478928615309006, + "grad_norm": 1.213715841112267, + "learning_rate": 2.0651896484414383e-06, + "loss": 0.5976, + "step": 6901 + }, + { + "epoch": 0.8480157267477577, + "grad_norm": 1.4333793843316127, + "learning_rate": 2.061933258275226e-06, + "loss": 0.5875, + "step": 6902 + }, + { + "epoch": 0.8481385919646148, + "grad_norm": 1.0616829268804249, + "learning_rate": 2.0586792479824766e-06, + "loss": 0.5784, + "step": 6903 + }, + { + "epoch": 0.848261457181472, + "grad_norm": 1.107578331787731, + "learning_rate": 2.0554276181617603e-06, + "loss": 0.5128, + "step": 6904 + }, + { + "epoch": 0.8483843223983291, + "grad_norm": 1.2149614568204523, + "learning_rate": 2.05217836941118e-06, + "loss": 0.5298, + "step": 6905 + }, + { + "epoch": 0.8485071876151862, + "grad_norm": 1.2307210025882571, + "learning_rate": 2.0489315023284244e-06, + "loss": 0.5418, + "step": 6906 + }, + { + "epoch": 0.8486300528320433, + "grad_norm": 1.0724232649889842, + "learning_rate": 2.045687017510724e-06, + "loss": 0.6216, + "step": 6907 + }, + { + "epoch": 0.8487529180489004, + "grad_norm": 1.507047269704493, + "learning_rate": 2.0424449155548846e-06, + "loss": 0.5742, + "step": 6908 + }, + { + "epoch": 0.8488757832657574, + "grad_norm": 0.9910833262866477, + "learning_rate": 2.039205197057273e-06, + "loss": 0.5996, + "step": 6909 + }, + { + "epoch": 0.8489986484826145, + "grad_norm": 1.213373555139544, + "learning_rate": 2.0359678626138102e-06, + "loss": 0.633, + "step": 6910 + }, + { + "epoch": 0.8491215136994716, + "grad_norm": 1.1462146537578735, + "learning_rate": 2.0327329128199834e-06, + 
"loss": 0.5674, + "step": 6911 + }, + { + "epoch": 0.8492443789163288, + "grad_norm": 1.1367267939728634, + "learning_rate": 2.029500348270842e-06, + "loss": 0.5652, + "step": 6912 + }, + { + "epoch": 0.8493672441331859, + "grad_norm": 1.0850826097956563, + "learning_rate": 2.026270169560998e-06, + "loss": 0.5876, + "step": 6913 + }, + { + "epoch": 0.849490109350043, + "grad_norm": 1.309295820087309, + "learning_rate": 2.023042377284615e-06, + "loss": 0.5028, + "step": 6914 + }, + { + "epoch": 0.8496129745669001, + "grad_norm": 1.9365358584180028, + "learning_rate": 2.0198169720354283e-06, + "loss": 0.7945, + "step": 6915 + }, + { + "epoch": 0.8497358397837572, + "grad_norm": 1.1309566622436364, + "learning_rate": 2.0165939544067306e-06, + "loss": 0.7024, + "step": 6916 + }, + { + "epoch": 0.8498587050006143, + "grad_norm": 1.098094640221295, + "learning_rate": 2.013373324991377e-06, + "loss": 0.5145, + "step": 6917 + }, + { + "epoch": 0.8499815702174714, + "grad_norm": 1.2024433350938737, + "learning_rate": 2.0101550843817768e-06, + "loss": 0.5744, + "step": 6918 + }, + { + "epoch": 0.8501044354343286, + "grad_norm": 1.0517680626265715, + "learning_rate": 2.0069392331699077e-06, + "loss": 0.5968, + "step": 6919 + }, + { + "epoch": 0.8502273006511857, + "grad_norm": 1.1395959042948411, + "learning_rate": 2.003725771947305e-06, + "loss": 0.5915, + "step": 6920 + }, + { + "epoch": 0.8503501658680428, + "grad_norm": 1.0433597535311392, + "learning_rate": 2.0005147013050594e-06, + "loss": 0.5973, + "step": 6921 + }, + { + "epoch": 0.8504730310848999, + "grad_norm": 1.286462184145451, + "learning_rate": 1.997306021833832e-06, + "loss": 0.5009, + "step": 6922 + }, + { + "epoch": 0.850595896301757, + "grad_norm": 1.193368401201695, + "learning_rate": 1.9940997341238347e-06, + "loss": 0.6671, + "step": 6923 + }, + { + "epoch": 0.850718761518614, + "grad_norm": 1.2097771036693499, + "learning_rate": 1.9908958387648485e-06, + "loss": 0.6892, + "step": 6924 + }, + { + "epoch": 0.8508416267354711, + "grad_norm": 1.3006088168187133, + "learning_rate": 1.987694336346203e-06, + "loss": 0.5591, + "step": 6925 + }, + { + "epoch": 0.8509644919523283, + "grad_norm": 1.313741289111501, + "learning_rate": 1.9844952274567955e-06, + "loss": 0.4287, + "step": 6926 + }, + { + "epoch": 0.8510873571691854, + "grad_norm": 2.04714638823587, + "learning_rate": 1.9812985126850875e-06, + "loss": 0.6746, + "step": 6927 + }, + { + "epoch": 0.8512102223860425, + "grad_norm": 1.3033301003987963, + "learning_rate": 1.9781041926190847e-06, + "loss": 0.5689, + "step": 6928 + }, + { + "epoch": 0.8513330876028996, + "grad_norm": 1.1584400759685762, + "learning_rate": 1.974912267846369e-06, + "loss": 0.5034, + "step": 6929 + }, + { + "epoch": 0.8514559528197567, + "grad_norm": 1.0380747842346651, + "learning_rate": 1.971722738954064e-06, + "loss": 0.5506, + "step": 6930 + }, + { + "epoch": 0.8515788180366138, + "grad_norm": 1.1506462666680672, + "learning_rate": 1.968535606528877e-06, + "loss": 0.5168, + "step": 6931 + }, + { + "epoch": 0.851701683253471, + "grad_norm": 1.6536542869417012, + "learning_rate": 1.965350871157049e-06, + "loss": 0.7033, + "step": 6932 + }, + { + "epoch": 0.8518245484703281, + "grad_norm": 1.2476242297632292, + "learning_rate": 1.9621685334243984e-06, + "loss": 0.4957, + "step": 6933 + }, + { + "epoch": 0.8519474136871852, + "grad_norm": 1.2067622590062625, + "learning_rate": 1.9589885939162917e-06, + "loss": 0.5024, + "step": 6934 + }, + { + "epoch": 0.8520702789040423, + "grad_norm": 
1.0688003805477082, + "learning_rate": 1.9558110532176576e-06, + "loss": 0.6746, + "step": 6935 + }, + { + "epoch": 0.8521931441208994, + "grad_norm": 1.3168755791970088, + "learning_rate": 1.9526359119129856e-06, + "loss": 0.6329, + "step": 6936 + }, + { + "epoch": 0.8523160093377565, + "grad_norm": 1.3088445244887286, + "learning_rate": 1.9494631705863265e-06, + "loss": 0.5642, + "step": 6937 + }, + { + "epoch": 0.8524388745546135, + "grad_norm": 1.0715370081174134, + "learning_rate": 1.9462928298212785e-06, + "loss": 0.5486, + "step": 6938 + }, + { + "epoch": 0.8525617397714707, + "grad_norm": 0.9892201425241608, + "learning_rate": 1.943124890201007e-06, + "loss": 0.5847, + "step": 6939 + }, + { + "epoch": 0.8526846049883278, + "grad_norm": 1.1338600906539757, + "learning_rate": 1.9399593523082387e-06, + "loss": 0.5466, + "step": 6940 + }, + { + "epoch": 0.8528074702051849, + "grad_norm": 1.3080046668792473, + "learning_rate": 1.9367962167252483e-06, + "loss": 0.5203, + "step": 6941 + }, + { + "epoch": 0.852930335422042, + "grad_norm": 1.2719614784794118, + "learning_rate": 1.9336354840338737e-06, + "loss": 0.42, + "step": 6942 + }, + { + "epoch": 0.8530532006388991, + "grad_norm": 1.269753491834709, + "learning_rate": 1.9304771548155148e-06, + "loss": 0.5189, + "step": 6943 + }, + { + "epoch": 0.8531760658557562, + "grad_norm": 1.2914249075605218, + "learning_rate": 1.927321229651128e-06, + "loss": 0.69, + "step": 6944 + }, + { + "epoch": 0.8532989310726133, + "grad_norm": 1.030882424251882, + "learning_rate": 1.9241677091212183e-06, + "loss": 0.5431, + "step": 6945 + }, + { + "epoch": 0.8534217962894705, + "grad_norm": 0.9406385365443406, + "learning_rate": 1.9210165938058594e-06, + "loss": 0.5874, + "step": 6946 + }, + { + "epoch": 0.8535446615063276, + "grad_norm": 1.322416090185305, + "learning_rate": 1.917867884284679e-06, + "loss": 0.5417, + "step": 6947 + }, + { + "epoch": 0.8536675267231847, + "grad_norm": 1.357858429908527, + "learning_rate": 1.9147215811368597e-06, + "loss": 0.5167, + "step": 6948 + }, + { + "epoch": 0.8537903919400418, + "grad_norm": 1.1075828713061624, + "learning_rate": 1.9115776849411425e-06, + "loss": 0.5343, + "step": 6949 + }, + { + "epoch": 0.8539132571568989, + "grad_norm": 1.3059370309061946, + "learning_rate": 1.9084361962758306e-06, + "loss": 0.5945, + "step": 6950 + }, + { + "epoch": 0.854036122373756, + "grad_norm": 1.2339655922983717, + "learning_rate": 1.9052971157187816e-06, + "loss": 0.5938, + "step": 6951 + }, + { + "epoch": 0.8541589875906132, + "grad_norm": 1.2000268039538502, + "learning_rate": 1.9021604438474016e-06, + "loss": 0.61, + "step": 6952 + }, + { + "epoch": 0.8542818528074702, + "grad_norm": 1.2388664340200246, + "learning_rate": 1.899026181238666e-06, + "loss": 0.5537, + "step": 6953 + }, + { + "epoch": 0.8544047180243273, + "grad_norm": 1.1882052729697237, + "learning_rate": 1.8958943284691056e-06, + "loss": 0.5258, + "step": 6954 + }, + { + "epoch": 0.8545275832411844, + "grad_norm": 1.3013620235602423, + "learning_rate": 1.8927648861147956e-06, + "loss": 0.5707, + "step": 6955 + }, + { + "epoch": 0.8546504484580415, + "grad_norm": 1.350830549946907, + "learning_rate": 1.889637854751386e-06, + "loss": 0.5224, + "step": 6956 + }, + { + "epoch": 0.8547733136748986, + "grad_norm": 1.1941854450805582, + "learning_rate": 1.8865132349540615e-06, + "loss": 0.5617, + "step": 6957 + }, + { + "epoch": 0.8548961788917557, + "grad_norm": 1.143490838333734, + "learning_rate": 1.8833910272975906e-06, + "loss": 0.485, + "step": 6958 + }, 
+ { + "epoch": 0.8550190441086128, + "grad_norm": 1.149179124144171, + "learning_rate": 1.8802712323562742e-06, + "loss": 0.5827, + "step": 6959 + }, + { + "epoch": 0.85514190932547, + "grad_norm": 1.1784717216400384, + "learning_rate": 1.8771538507039815e-06, + "loss": 0.5557, + "step": 6960 + }, + { + "epoch": 0.8552647745423271, + "grad_norm": 1.4158572751560874, + "learning_rate": 1.8740388829141285e-06, + "loss": 0.4622, + "step": 6961 + }, + { + "epoch": 0.8553876397591842, + "grad_norm": 1.0294180093329781, + "learning_rate": 1.8709263295597023e-06, + "loss": 0.4801, + "step": 6962 + }, + { + "epoch": 0.8555105049760413, + "grad_norm": 1.2427983419234006, + "learning_rate": 1.8678161912132313e-06, + "loss": 0.6624, + "step": 6963 + }, + { + "epoch": 0.8556333701928984, + "grad_norm": 1.2850339634974473, + "learning_rate": 1.8647084684468096e-06, + "loss": 0.5114, + "step": 6964 + }, + { + "epoch": 0.8557562354097555, + "grad_norm": 1.303928495224064, + "learning_rate": 1.8616031618320767e-06, + "loss": 0.5127, + "step": 6965 + }, + { + "epoch": 0.8558791006266127, + "grad_norm": 1.3782415102497176, + "learning_rate": 1.8585002719402372e-06, + "loss": 0.4942, + "step": 6966 + }, + { + "epoch": 0.8560019658434697, + "grad_norm": 1.5420213797043345, + "learning_rate": 1.8553997993420495e-06, + "loss": 0.5815, + "step": 6967 + }, + { + "epoch": 0.8561248310603268, + "grad_norm": 1.3218386199489882, + "learning_rate": 1.852301744607816e-06, + "loss": 0.6343, + "step": 6968 + }, + { + "epoch": 0.8562476962771839, + "grad_norm": 1.150760340776776, + "learning_rate": 1.8492061083074174e-06, + "loss": 0.5536, + "step": 6969 + }, + { + "epoch": 0.856370561494041, + "grad_norm": 1.1124392236196428, + "learning_rate": 1.8461128910102665e-06, + "loss": 0.4925, + "step": 6970 + }, + { + "epoch": 0.8564934267108981, + "grad_norm": 1.1796815921183443, + "learning_rate": 1.8430220932853465e-06, + "loss": 0.6406, + "step": 6971 + }, + { + "epoch": 0.8566162919277552, + "grad_norm": 1.0542176200180555, + "learning_rate": 1.8399337157011842e-06, + "loss": 0.4744, + "step": 6972 + }, + { + "epoch": 0.8567391571446124, + "grad_norm": 1.1402604262844094, + "learning_rate": 1.836847758825867e-06, + "loss": 0.5848, + "step": 6973 + }, + { + "epoch": 0.8568620223614695, + "grad_norm": 1.2323245121129194, + "learning_rate": 1.8337642232270424e-06, + "loss": 0.6379, + "step": 6974 + }, + { + "epoch": 0.8569848875783266, + "grad_norm": 1.3151526352205773, + "learning_rate": 1.8306831094719002e-06, + "loss": 0.4878, + "step": 6975 + }, + { + "epoch": 0.8571077527951837, + "grad_norm": 1.2222669056633022, + "learning_rate": 1.8276044181271935e-06, + "loss": 0.5826, + "step": 6976 + }, + { + "epoch": 0.8572306180120408, + "grad_norm": 1.2115603213200874, + "learning_rate": 1.8245281497592293e-06, + "loss": 0.7264, + "step": 6977 + }, + { + "epoch": 0.8573534832288979, + "grad_norm": 1.4250920653554906, + "learning_rate": 1.8214543049338683e-06, + "loss": 0.5281, + "step": 6978 + }, + { + "epoch": 0.857476348445755, + "grad_norm": 1.3240510946209247, + "learning_rate": 1.8183828842165183e-06, + "loss": 0.6343, + "step": 6979 + }, + { + "epoch": 0.8575992136626122, + "grad_norm": 1.4336218261160918, + "learning_rate": 1.815313888172151e-06, + "loss": 0.4703, + "step": 6980 + }, + { + "epoch": 0.8577220788794693, + "grad_norm": 0.9834406254437499, + "learning_rate": 1.8122473173652893e-06, + "loss": 0.5953, + "step": 6981 + }, + { + "epoch": 0.8578449440963263, + "grad_norm": 1.1653727760671861, + "learning_rate": 
1.8091831723600105e-06, + "loss": 0.4802, + "step": 6982 + }, + { + "epoch": 0.8579678093131834, + "grad_norm": 1.0051222058779414, + "learning_rate": 1.8061214537199388e-06, + "loss": 0.5984, + "step": 6983 + }, + { + "epoch": 0.8580906745300405, + "grad_norm": 1.0848273315454016, + "learning_rate": 1.8030621620082604e-06, + "loss": 0.5147, + "step": 6984 + }, + { + "epoch": 0.8582135397468976, + "grad_norm": 1.1093433114785107, + "learning_rate": 1.8000052977877152e-06, + "loss": 0.5835, + "step": 6985 + }, + { + "epoch": 0.8583364049637547, + "grad_norm": 1.4387666054370678, + "learning_rate": 1.7969508616205866e-06, + "loss": 0.5357, + "step": 6986 + }, + { + "epoch": 0.8584592701806119, + "grad_norm": 1.1465275255673681, + "learning_rate": 1.7938988540687233e-06, + "loss": 0.5735, + "step": 6987 + }, + { + "epoch": 0.858582135397469, + "grad_norm": 1.2617703657295776, + "learning_rate": 1.7908492756935203e-06, + "loss": 0.5287, + "step": 6988 + }, + { + "epoch": 0.8587050006143261, + "grad_norm": 1.172331008901022, + "learning_rate": 1.787802127055933e-06, + "loss": 0.6038, + "step": 6989 + }, + { + "epoch": 0.8588278658311832, + "grad_norm": 1.2455444627702117, + "learning_rate": 1.784757408716457e-06, + "loss": 0.5429, + "step": 6990 + }, + { + "epoch": 0.8589507310480403, + "grad_norm": 1.2102588200684823, + "learning_rate": 1.7817151212351507e-06, + "loss": 0.5744, + "step": 6991 + }, + { + "epoch": 0.8590735962648974, + "grad_norm": 1.0210023586401331, + "learning_rate": 1.7786752651716281e-06, + "loss": 0.5376, + "step": 6992 + }, + { + "epoch": 0.8591964614817545, + "grad_norm": 0.925224418069438, + "learning_rate": 1.7756378410850437e-06, + "loss": 0.5724, + "step": 6993 + }, + { + "epoch": 0.8593193266986117, + "grad_norm": 1.2105793335232833, + "learning_rate": 1.772602849534119e-06, + "loss": 0.5526, + "step": 6994 + }, + { + "epoch": 0.8594421919154688, + "grad_norm": 1.1716781064849016, + "learning_rate": 1.7695702910771106e-06, + "loss": 0.5483, + "step": 6995 + }, + { + "epoch": 0.8595650571323259, + "grad_norm": 1.11717812353179, + "learning_rate": 1.7665401662718522e-06, + "loss": 0.4881, + "step": 6996 + }, + { + "epoch": 0.8596879223491829, + "grad_norm": 1.2405850970821097, + "learning_rate": 1.7635124756757031e-06, + "loss": 0.5839, + "step": 6997 + }, + { + "epoch": 0.85981078756604, + "grad_norm": 1.4501021911516512, + "learning_rate": 1.760487219845598e-06, + "loss": 0.6727, + "step": 6998 + }, + { + "epoch": 0.8599336527828971, + "grad_norm": 1.3714016410083725, + "learning_rate": 1.7574643993379996e-06, + "loss": 0.5237, + "step": 6999 + }, + { + "epoch": 0.8600565179997542, + "grad_norm": 1.11266419771346, + "learning_rate": 1.7544440147089518e-06, + "loss": 0.5532, + "step": 7000 + }, + { + "epoch": 0.8601793832166114, + "grad_norm": 1.5139103375665575, + "learning_rate": 1.751426066514022e-06, + "loss": 0.6971, + "step": 7001 + }, + { + "epoch": 0.8603022484334685, + "grad_norm": 1.52116482761324, + "learning_rate": 1.7484105553083523e-06, + "loss": 0.5829, + "step": 7002 + }, + { + "epoch": 0.8604251136503256, + "grad_norm": 1.0828494802121706, + "learning_rate": 1.7453974816466162e-06, + "loss": 0.6066, + "step": 7003 + }, + { + "epoch": 0.8605479788671827, + "grad_norm": 1.103416675112805, + "learning_rate": 1.7423868460830566e-06, + "loss": 0.5162, + "step": 7004 + }, + { + "epoch": 0.8606708440840398, + "grad_norm": 1.414592567015034, + "learning_rate": 1.7393786491714591e-06, + "loss": 0.5037, + "step": 7005 + }, + { + "epoch": 0.8607937093008969, + 
"grad_norm": 1.1867347265682837, + "learning_rate": 1.7363728914651594e-06, + "loss": 0.5975, + "step": 7006 + }, + { + "epoch": 0.860916574517754, + "grad_norm": 1.2673898456234125, + "learning_rate": 1.7333695735170468e-06, + "loss": 0.6894, + "step": 7007 + }, + { + "epoch": 0.8610394397346112, + "grad_norm": 1.523925310308078, + "learning_rate": 1.7303686958795662e-06, + "loss": 0.5536, + "step": 7008 + }, + { + "epoch": 0.8611623049514683, + "grad_norm": 1.1039539643642007, + "learning_rate": 1.7273702591047091e-06, + "loss": 0.4574, + "step": 7009 + }, + { + "epoch": 0.8612851701683254, + "grad_norm": 1.2459524540369955, + "learning_rate": 1.7243742637440129e-06, + "loss": 0.5344, + "step": 7010 + }, + { + "epoch": 0.8614080353851824, + "grad_norm": 1.1935359030270527, + "learning_rate": 1.7213807103485768e-06, + "loss": 0.6423, + "step": 7011 + }, + { + "epoch": 0.8615309006020395, + "grad_norm": 1.2638107258611198, + "learning_rate": 1.7183895994690468e-06, + "loss": 0.5832, + "step": 7012 + }, + { + "epoch": 0.8616537658188966, + "grad_norm": 1.3637484158193363, + "learning_rate": 1.715400931655613e-06, + "loss": 0.5644, + "step": 7013 + }, + { + "epoch": 0.8617766310357537, + "grad_norm": 1.8028596450034893, + "learning_rate": 1.7124147074580254e-06, + "loss": 0.666, + "step": 7014 + }, + { + "epoch": 0.8618994962526109, + "grad_norm": 1.2148301748151145, + "learning_rate": 1.7094309274255764e-06, + "loss": 0.6183, + "step": 7015 + }, + { + "epoch": 0.862022361469468, + "grad_norm": 1.0831737724984019, + "learning_rate": 1.7064495921071221e-06, + "loss": 0.6582, + "step": 7016 + }, + { + "epoch": 0.8621452266863251, + "grad_norm": 1.2636343927659575, + "learning_rate": 1.7034707020510487e-06, + "loss": 0.5009, + "step": 7017 + }, + { + "epoch": 0.8622680919031822, + "grad_norm": 1.0384607457139157, + "learning_rate": 1.7004942578053078e-06, + "loss": 0.5758, + "step": 7018 + }, + { + "epoch": 0.8623909571200393, + "grad_norm": 1.1022218046466152, + "learning_rate": 1.6975202599174e-06, + "loss": 0.4964, + "step": 7019 + }, + { + "epoch": 0.8625138223368964, + "grad_norm": 1.2267089559772537, + "learning_rate": 1.6945487089343725e-06, + "loss": 0.6413, + "step": 7020 + }, + { + "epoch": 0.8626366875537536, + "grad_norm": 1.2513225642663799, + "learning_rate": 1.6915796054028182e-06, + "loss": 0.5207, + "step": 7021 + }, + { + "epoch": 0.8627595527706107, + "grad_norm": 1.047820304884196, + "learning_rate": 1.6886129498688884e-06, + "loss": 0.5801, + "step": 7022 + }, + { + "epoch": 0.8628824179874678, + "grad_norm": 1.1681106870146072, + "learning_rate": 1.6856487428782802e-06, + "loss": 0.7013, + "step": 7023 + }, + { + "epoch": 0.8630052832043249, + "grad_norm": 1.2342146354294474, + "learning_rate": 1.6826869849762372e-06, + "loss": 0.5288, + "step": 7024 + }, + { + "epoch": 0.863128148421182, + "grad_norm": 1.1498088409923326, + "learning_rate": 1.679727676707562e-06, + "loss": 0.445, + "step": 7025 + }, + { + "epoch": 0.863251013638039, + "grad_norm": 1.2817437015163842, + "learning_rate": 1.6767708186165875e-06, + "loss": 0.5838, + "step": 7026 + }, + { + "epoch": 0.8633738788548961, + "grad_norm": 1.472674684126238, + "learning_rate": 1.6738164112472238e-06, + "loss": 0.5498, + "step": 7027 + }, + { + "epoch": 0.8634967440717533, + "grad_norm": 1.277735778340668, + "learning_rate": 1.6708644551429043e-06, + "loss": 0.5215, + "step": 7028 + }, + { + "epoch": 0.8636196092886104, + "grad_norm": 1.3584413336729237, + "learning_rate": 1.6679149508466263e-06, + "loss": 0.6749, + 
"step": 7029 + }, + { + "epoch": 0.8637424745054675, + "grad_norm": 1.1674375309846945, + "learning_rate": 1.6649678989009343e-06, + "loss": 0.4971, + "step": 7030 + }, + { + "epoch": 0.8638653397223246, + "grad_norm": 1.1350581361139884, + "learning_rate": 1.6620232998479129e-06, + "loss": 0.5352, + "step": 7031 + }, + { + "epoch": 0.8639882049391817, + "grad_norm": 1.7075849849952407, + "learning_rate": 1.659081154229209e-06, + "loss": 0.5882, + "step": 7032 + }, + { + "epoch": 0.8641110701560388, + "grad_norm": 1.3868460704418455, + "learning_rate": 1.6561414625860028e-06, + "loss": 0.5754, + "step": 7033 + }, + { + "epoch": 0.864233935372896, + "grad_norm": 1.4713950086020007, + "learning_rate": 1.6532042254590418e-06, + "loss": 0.6252, + "step": 7034 + }, + { + "epoch": 0.8643568005897531, + "grad_norm": 1.2853126277694031, + "learning_rate": 1.650269443388604e-06, + "loss": 0.5728, + "step": 7035 + }, + { + "epoch": 0.8644796658066102, + "grad_norm": 1.4181585997693573, + "learning_rate": 1.647337116914529e-06, + "loss": 0.6623, + "step": 7036 + }, + { + "epoch": 0.8646025310234673, + "grad_norm": 1.2088387068654736, + "learning_rate": 1.644407246576189e-06, + "loss": 0.4785, + "step": 7037 + }, + { + "epoch": 0.8647253962403244, + "grad_norm": 1.0130331985027898, + "learning_rate": 1.6414798329125291e-06, + "loss": 0.4989, + "step": 7038 + }, + { + "epoch": 0.8648482614571815, + "grad_norm": 1.4802572732257813, + "learning_rate": 1.6385548764620174e-06, + "loss": 0.523, + "step": 7039 + }, + { + "epoch": 0.8649711266740385, + "grad_norm": 1.3892402936820858, + "learning_rate": 1.635632377762688e-06, + "loss": 0.6487, + "step": 7040 + }, + { + "epoch": 0.8650939918908956, + "grad_norm": 1.9640204494950129, + "learning_rate": 1.632712337352108e-06, + "loss": 0.6086, + "step": 7041 + }, + { + "epoch": 0.8652168571077528, + "grad_norm": 1.393553010404348, + "learning_rate": 1.6297947557674042e-06, + "loss": 0.6103, + "step": 7042 + }, + { + "epoch": 0.8653397223246099, + "grad_norm": 1.1926917944949256, + "learning_rate": 1.626879633545249e-06, + "loss": 0.5878, + "step": 7043 + }, + { + "epoch": 0.865462587541467, + "grad_norm": 1.1242083930028437, + "learning_rate": 1.6239669712218553e-06, + "loss": 0.5462, + "step": 7044 + }, + { + "epoch": 0.8655854527583241, + "grad_norm": 0.9836032656220917, + "learning_rate": 1.6210567693329892e-06, + "loss": 0.6053, + "step": 7045 + }, + { + "epoch": 0.8657083179751812, + "grad_norm": 1.2183314120866056, + "learning_rate": 1.6181490284139645e-06, + "loss": 0.5135, + "step": 7046 + }, + { + "epoch": 0.8658311831920383, + "grad_norm": 1.3344259569077528, + "learning_rate": 1.6152437489996464e-06, + "loss": 0.5712, + "step": 7047 + }, + { + "epoch": 0.8659540484088954, + "grad_norm": 1.2079925289884772, + "learning_rate": 1.612340931624434e-06, + "loss": 0.5339, + "step": 7048 + }, + { + "epoch": 0.8660769136257526, + "grad_norm": 1.1565710592005356, + "learning_rate": 1.6094405768222841e-06, + "loss": 0.5281, + "step": 7049 + }, + { + "epoch": 0.8661997788426097, + "grad_norm": 1.1383293487968251, + "learning_rate": 1.606542685126703e-06, + "loss": 0.5722, + "step": 7050 + }, + { + "epoch": 0.8663226440594668, + "grad_norm": 1.18490809261576, + "learning_rate": 1.6036472570707323e-06, + "loss": 0.4752, + "step": 7051 + }, + { + "epoch": 0.8664455092763239, + "grad_norm": 1.0727401508173766, + "learning_rate": 1.6007542931869712e-06, + "loss": 0.5904, + "step": 7052 + }, + { + "epoch": 0.866568374493181, + "grad_norm": 1.0811734476034245, + 
"learning_rate": 1.597863794007559e-06, + "loss": 0.5379, + "step": 7053 + }, + { + "epoch": 0.8666912397100381, + "grad_norm": 1.408237194399131, + "learning_rate": 1.5949757600641906e-06, + "loss": 0.6241, + "step": 7054 + }, + { + "epoch": 0.8668141049268951, + "grad_norm": 1.265418244899921, + "learning_rate": 1.5920901918880925e-06, + "loss": 0.5395, + "step": 7055 + }, + { + "epoch": 0.8669369701437523, + "grad_norm": 1.253680255529607, + "learning_rate": 1.5892070900100503e-06, + "loss": 0.5904, + "step": 7056 + }, + { + "epoch": 0.8670598353606094, + "grad_norm": 1.14718609523142, + "learning_rate": 1.5863264549603945e-06, + "loss": 0.5418, + "step": 7057 + }, + { + "epoch": 0.8671827005774665, + "grad_norm": 1.0820255735845286, + "learning_rate": 1.5834482872689949e-06, + "loss": 0.5951, + "step": 7058 + }, + { + "epoch": 0.8673055657943236, + "grad_norm": 1.333439127795634, + "learning_rate": 1.5805725874652726e-06, + "loss": 0.5771, + "step": 7059 + }, + { + "epoch": 0.8674284310111807, + "grad_norm": 1.2702119107341001, + "learning_rate": 1.5776993560781948e-06, + "loss": 0.5751, + "step": 7060 + }, + { + "epoch": 0.8675512962280378, + "grad_norm": 1.2370088754475632, + "learning_rate": 1.5748285936362772e-06, + "loss": 0.5756, + "step": 7061 + }, + { + "epoch": 0.867674161444895, + "grad_norm": 1.1079391742519422, + "learning_rate": 1.5719603006675703e-06, + "loss": 0.5454, + "step": 7062 + }, + { + "epoch": 0.8677970266617521, + "grad_norm": 1.0804770487569744, + "learning_rate": 1.5690944776996875e-06, + "loss": 0.5139, + "step": 7063 + }, + { + "epoch": 0.8679198918786092, + "grad_norm": 1.2477276117500156, + "learning_rate": 1.566231125259765e-06, + "loss": 0.4859, + "step": 7064 + }, + { + "epoch": 0.8680427570954663, + "grad_norm": 1.4361049527422436, + "learning_rate": 1.5633702438745118e-06, + "loss": 0.5755, + "step": 7065 + }, + { + "epoch": 0.8681656223123234, + "grad_norm": 1.0831917465394987, + "learning_rate": 1.5605118340701602e-06, + "loss": 0.5373, + "step": 7066 + }, + { + "epoch": 0.8682884875291805, + "grad_norm": 1.0379245691782841, + "learning_rate": 1.5576558963725029e-06, + "loss": 0.6023, + "step": 7067 + }, + { + "epoch": 0.8684113527460376, + "grad_norm": 1.1112577675981596, + "learning_rate": 1.5548024313068633e-06, + "loss": 0.6331, + "step": 7068 + }, + { + "epoch": 0.8685342179628947, + "grad_norm": 1.2230260757221156, + "learning_rate": 1.5519514393981193e-06, + "loss": 0.696, + "step": 7069 + }, + { + "epoch": 0.8686570831797518, + "grad_norm": 1.2529479035369482, + "learning_rate": 1.5491029211706986e-06, + "loss": 0.4927, + "step": 7070 + }, + { + "epoch": 0.8687799483966089, + "grad_norm": 1.092910634144334, + "learning_rate": 1.5462568771485618e-06, + "loss": 0.6714, + "step": 7071 + }, + { + "epoch": 0.868902813613466, + "grad_norm": 0.9339830293641502, + "learning_rate": 1.5434133078552204e-06, + "loss": 0.5823, + "step": 7072 + }, + { + "epoch": 0.8690256788303231, + "grad_norm": 1.3300937505727808, + "learning_rate": 1.5405722138137323e-06, + "loss": 0.658, + "step": 7073 + }, + { + "epoch": 0.8691485440471802, + "grad_norm": 1.0575265164086565, + "learning_rate": 1.5377335955466997e-06, + "loss": 0.5947, + "step": 7074 + }, + { + "epoch": 0.8692714092640373, + "grad_norm": 1.328593842749922, + "learning_rate": 1.534897453576265e-06, + "loss": 0.5478, + "step": 7075 + }, + { + "epoch": 0.8693942744808945, + "grad_norm": 1.5099519162458095, + "learning_rate": 1.532063788424119e-06, + "loss": 0.6602, + "step": 7076 + }, + { + "epoch": 
0.8695171396977516, + "grad_norm": 1.3700253309438115, + "learning_rate": 1.5292326006114998e-06, + "loss": 0.6138, + "step": 7077 + }, + { + "epoch": 0.8696400049146087, + "grad_norm": 1.2482873253650553, + "learning_rate": 1.5264038906591793e-06, + "loss": 0.488, + "step": 7078 + }, + { + "epoch": 0.8697628701314658, + "grad_norm": 1.1149725006587603, + "learning_rate": 1.5235776590874844e-06, + "loss": 0.5841, + "step": 7079 + }, + { + "epoch": 0.8698857353483229, + "grad_norm": 1.027152888425496, + "learning_rate": 1.5207539064162811e-06, + "loss": 0.5655, + "step": 7080 + }, + { + "epoch": 0.87000860056518, + "grad_norm": 1.1750479612510039, + "learning_rate": 1.5179326331649823e-06, + "loss": 0.6057, + "step": 7081 + }, + { + "epoch": 0.8701314657820372, + "grad_norm": 1.2513315492440755, + "learning_rate": 1.515113839852541e-06, + "loss": 0.5876, + "step": 7082 + }, + { + "epoch": 0.8702543309988943, + "grad_norm": 1.0363686623454096, + "learning_rate": 1.5122975269974542e-06, + "loss": 0.6052, + "step": 7083 + }, + { + "epoch": 0.8703771962157513, + "grad_norm": 1.0921375343662225, + "learning_rate": 1.5094836951177676e-06, + "loss": 0.5656, + "step": 7084 + }, + { + "epoch": 0.8705000614326084, + "grad_norm": 1.1329961916002316, + "learning_rate": 1.5066723447310688e-06, + "loss": 0.6701, + "step": 7085 + }, + { + "epoch": 0.8706229266494655, + "grad_norm": 1.2497184208770165, + "learning_rate": 1.5038634763544822e-06, + "loss": 0.6749, + "step": 7086 + }, + { + "epoch": 0.8707457918663226, + "grad_norm": 1.1890373489576522, + "learning_rate": 1.5010570905046833e-06, + "loss": 0.5459, + "step": 7087 + }, + { + "epoch": 0.8708686570831797, + "grad_norm": 1.333386363700467, + "learning_rate": 1.4982531876978923e-06, + "loss": 0.528, + "step": 7088 + }, + { + "epoch": 0.8709915223000368, + "grad_norm": 1.3694462028124077, + "learning_rate": 1.4954517684498614e-06, + "loss": 0.5794, + "step": 7089 + }, + { + "epoch": 0.871114387516894, + "grad_norm": 1.0712935556312295, + "learning_rate": 1.4926528332759005e-06, + "loss": 0.643, + "step": 7090 + }, + { + "epoch": 0.8712372527337511, + "grad_norm": 1.167536010826607, + "learning_rate": 1.489856382690849e-06, + "loss": 0.5695, + "step": 7091 + }, + { + "epoch": 0.8713601179506082, + "grad_norm": 1.3294777534214932, + "learning_rate": 1.4870624172091041e-06, + "loss": 0.5469, + "step": 7092 + }, + { + "epoch": 0.8714829831674653, + "grad_norm": 1.3048186449271808, + "learning_rate": 1.4842709373445896e-06, + "loss": 0.6042, + "step": 7093 + }, + { + "epoch": 0.8716058483843224, + "grad_norm": 1.0889874024113757, + "learning_rate": 1.4814819436107846e-06, + "loss": 0.5687, + "step": 7094 + }, + { + "epoch": 0.8717287136011795, + "grad_norm": 1.0007404611345647, + "learning_rate": 1.4786954365207072e-06, + "loss": 0.548, + "step": 7095 + }, + { + "epoch": 0.8718515788180367, + "grad_norm": 1.2848220843062472, + "learning_rate": 1.4759114165869126e-06, + "loss": 0.6026, + "step": 7096 + }, + { + "epoch": 0.8719744440348938, + "grad_norm": 1.1107676179413724, + "learning_rate": 1.4731298843215107e-06, + "loss": 0.5149, + "step": 7097 + }, + { + "epoch": 0.8720973092517508, + "grad_norm": 1.1445316196832824, + "learning_rate": 1.4703508402361343e-06, + "loss": 0.6012, + "step": 7098 + }, + { + "epoch": 0.8722201744686079, + "grad_norm": 1.1542008766709428, + "learning_rate": 1.4675742848419842e-06, + "loss": 0.5649, + "step": 7099 + }, + { + "epoch": 0.872343039685465, + "grad_norm": 1.2591490391002773, + "learning_rate": 
1.4648002186497805e-06, + "loss": 0.5765, + "step": 7100 + }, + { + "epoch": 0.8724659049023221, + "grad_norm": 1.010989739203606, + "learning_rate": 1.4620286421698014e-06, + "loss": 0.5777, + "step": 7101 + }, + { + "epoch": 0.8725887701191792, + "grad_norm": 1.0995054096111203, + "learning_rate": 1.459259555911851e-06, + "loss": 0.6595, + "step": 7102 + }, + { + "epoch": 0.8727116353360364, + "grad_norm": 1.3659930608704987, + "learning_rate": 1.456492960385295e-06, + "loss": 0.5859, + "step": 7103 + }, + { + "epoch": 0.8728345005528935, + "grad_norm": 1.126439515338653, + "learning_rate": 1.4537288560990247e-06, + "loss": 0.5175, + "step": 7104 + }, + { + "epoch": 0.8729573657697506, + "grad_norm": 1.1954613743311027, + "learning_rate": 1.4509672435614819e-06, + "loss": 0.5957, + "step": 7105 + }, + { + "epoch": 0.8730802309866077, + "grad_norm": 1.6116689015974976, + "learning_rate": 1.448208123280645e-06, + "loss": 0.597, + "step": 7106 + }, + { + "epoch": 0.8732030962034648, + "grad_norm": 1.1034312208109913, + "learning_rate": 1.4454514957640363e-06, + "loss": 0.6195, + "step": 7107 + }, + { + "epoch": 0.8733259614203219, + "grad_norm": 1.3871281060603522, + "learning_rate": 1.4426973615187239e-06, + "loss": 0.5615, + "step": 7108 + }, + { + "epoch": 0.873448826637179, + "grad_norm": 1.1574308129566302, + "learning_rate": 1.4399457210513072e-06, + "loss": 0.6941, + "step": 7109 + }, + { + "epoch": 0.8735716918540362, + "grad_norm": 1.5862134571416906, + "learning_rate": 1.4371965748679333e-06, + "loss": 0.7052, + "step": 7110 + }, + { + "epoch": 0.8736945570708933, + "grad_norm": 1.2300268416404474, + "learning_rate": 1.4344499234742941e-06, + "loss": 0.4743, + "step": 7111 + }, + { + "epoch": 0.8738174222877504, + "grad_norm": 1.1719561756920505, + "learning_rate": 1.4317057673756172e-06, + "loss": 0.5731, + "step": 7112 + }, + { + "epoch": 0.8739402875046074, + "grad_norm": 1.163794583447879, + "learning_rate": 1.4289641070766674e-06, + "loss": 0.6094, + "step": 7113 + }, + { + "epoch": 0.8740631527214645, + "grad_norm": 1.156608843944235, + "learning_rate": 1.4262249430817609e-06, + "loss": 0.6023, + "step": 7114 + }, + { + "epoch": 0.8741860179383216, + "grad_norm": 1.2811697916874987, + "learning_rate": 1.4234882758947482e-06, + "loss": 0.4726, + "step": 7115 + }, + { + "epoch": 0.8743088831551787, + "grad_norm": 1.199707485651841, + "learning_rate": 1.4207541060190182e-06, + "loss": 0.6142, + "step": 7116 + }, + { + "epoch": 0.8744317483720359, + "grad_norm": 1.1997249773232852, + "learning_rate": 1.4180224339575055e-06, + "loss": 0.5768, + "step": 7117 + }, + { + "epoch": 0.874554613588893, + "grad_norm": 1.5744977257433705, + "learning_rate": 1.4152932602126844e-06, + "loss": 0.7534, + "step": 7118 + }, + { + "epoch": 0.8746774788057501, + "grad_norm": 1.5284889403095072, + "learning_rate": 1.4125665852865704e-06, + "loss": 0.5615, + "step": 7119 + }, + { + "epoch": 0.8748003440226072, + "grad_norm": 0.9640913126400859, + "learning_rate": 1.4098424096807138e-06, + "loss": 0.6022, + "step": 7120 + }, + { + "epoch": 0.8749232092394643, + "grad_norm": 1.1229616283881876, + "learning_rate": 1.407120733896209e-06, + "loss": 0.5803, + "step": 7121 + }, + { + "epoch": 0.8750460744563214, + "grad_norm": 1.315628239910807, + "learning_rate": 1.4044015584336934e-06, + "loss": 0.5135, + "step": 7122 + }, + { + "epoch": 0.8751689396731785, + "grad_norm": 1.0920323136835997, + "learning_rate": 1.401684883793342e-06, + "loss": 0.5707, + "step": 7123 + }, + { + "epoch": 
0.8752918048900357, + "grad_norm": 1.0843033736582204, + "learning_rate": 1.3989707104748673e-06, + "loss": 0.5989, + "step": 7124 + }, + { + "epoch": 0.8754146701068928, + "grad_norm": 1.1224203367629808, + "learning_rate": 1.3962590389775242e-06, + "loss": 0.5152, + "step": 7125 + }, + { + "epoch": 0.8755375353237499, + "grad_norm": 1.0944664726660147, + "learning_rate": 1.3935498698001093e-06, + "loss": 0.475, + "step": 7126 + }, + { + "epoch": 0.875660400540607, + "grad_norm": 1.4178362902858213, + "learning_rate": 1.3908432034409518e-06, + "loss": 0.5777, + "step": 7127 + }, + { + "epoch": 0.875783265757464, + "grad_norm": 1.1638741385205535, + "learning_rate": 1.3881390403979321e-06, + "loss": 0.5209, + "step": 7128 + }, + { + "epoch": 0.8759061309743211, + "grad_norm": 1.3014281401697947, + "learning_rate": 1.3854373811684557e-06, + "loss": 0.4988, + "step": 7129 + }, + { + "epoch": 0.8760289961911782, + "grad_norm": 1.1122182990996692, + "learning_rate": 1.382738226249483e-06, + "loss": 0.5466, + "step": 7130 + }, + { + "epoch": 0.8761518614080354, + "grad_norm": 1.2336539463981147, + "learning_rate": 1.3800415761375007e-06, + "loss": 0.613, + "step": 7131 + }, + { + "epoch": 0.8762747266248925, + "grad_norm": 1.1077015809935749, + "learning_rate": 1.377347431328545e-06, + "loss": 0.478, + "step": 7132 + }, + { + "epoch": 0.8763975918417496, + "grad_norm": 1.1737832179164718, + "learning_rate": 1.3746557923181795e-06, + "loss": 0.5686, + "step": 7133 + }, + { + "epoch": 0.8765204570586067, + "grad_norm": 1.0736900985901194, + "learning_rate": 1.3719666596015184e-06, + "loss": 0.5569, + "step": 7134 + }, + { + "epoch": 0.8766433222754638, + "grad_norm": 1.1052829161600521, + "learning_rate": 1.3692800336732108e-06, + "loss": 0.6328, + "step": 7135 + }, + { + "epoch": 0.8767661874923209, + "grad_norm": 0.93739960073414, + "learning_rate": 1.3665959150274382e-06, + "loss": 0.5063, + "step": 7136 + }, + { + "epoch": 0.876889052709178, + "grad_norm": 0.9676139344106852, + "learning_rate": 1.3639143041579371e-06, + "loss": 0.5146, + "step": 7137 + }, + { + "epoch": 0.8770119179260352, + "grad_norm": 1.3099465946798965, + "learning_rate": 1.3612352015579631e-06, + "loss": 0.5391, + "step": 7138 + }, + { + "epoch": 0.8771347831428923, + "grad_norm": 1.2346471139935726, + "learning_rate": 1.358558607720327e-06, + "loss": 0.5512, + "step": 7139 + }, + { + "epoch": 0.8772576483597494, + "grad_norm": 1.186522653568312, + "learning_rate": 1.3558845231373617e-06, + "loss": 0.5045, + "step": 7140 + }, + { + "epoch": 0.8773805135766065, + "grad_norm": 1.4101778849544024, + "learning_rate": 1.353212948300957e-06, + "loss": 0.6003, + "step": 7141 + }, + { + "epoch": 0.8775033787934635, + "grad_norm": 1.4711073773646366, + "learning_rate": 1.3505438837025265e-06, + "loss": 0.5275, + "step": 7142 + }, + { + "epoch": 0.8776262440103206, + "grad_norm": 1.7882339920740296, + "learning_rate": 1.3478773298330322e-06, + "loss": 0.5225, + "step": 7143 + }, + { + "epoch": 0.8777491092271777, + "grad_norm": 1.1083825637459748, + "learning_rate": 1.345213287182962e-06, + "loss": 0.505, + "step": 7144 + }, + { + "epoch": 0.8778719744440349, + "grad_norm": 1.3739069083342748, + "learning_rate": 1.3425517562423539e-06, + "loss": 0.463, + "step": 7145 + }, + { + "epoch": 0.877994839660892, + "grad_norm": 1.2214116792634364, + "learning_rate": 1.3398927375007814e-06, + "loss": 0.5634, + "step": 7146 + }, + { + "epoch": 0.8781177048777491, + "grad_norm": 0.974087787109072, + "learning_rate": 1.3372362314473464e-06, + 
"loss": 0.5581, + "step": 7147 + }, + { + "epoch": 0.8782405700946062, + "grad_norm": 1.169104503227177, + "learning_rate": 1.334582238570703e-06, + "loss": 0.5423, + "step": 7148 + }, + { + "epoch": 0.8783634353114633, + "grad_norm": 1.280295937218546, + "learning_rate": 1.3319307593590325e-06, + "loss": 0.5904, + "step": 7149 + }, + { + "epoch": 0.8784863005283204, + "grad_norm": 1.5893455323214367, + "learning_rate": 1.3292817943000597e-06, + "loss": 0.5646, + "step": 7150 + }, + { + "epoch": 0.8786091657451776, + "grad_norm": 1.1637959353045442, + "learning_rate": 1.3266353438810414e-06, + "loss": 0.5679, + "step": 7151 + }, + { + "epoch": 0.8787320309620347, + "grad_norm": 1.1844183775195403, + "learning_rate": 1.3239914085887767e-06, + "loss": 0.4705, + "step": 7152 + }, + { + "epoch": 0.8788548961788918, + "grad_norm": 1.2746473388000763, + "learning_rate": 1.321349988909603e-06, + "loss": 0.4772, + "step": 7153 + }, + { + "epoch": 0.8789777613957489, + "grad_norm": 1.186276654090902, + "learning_rate": 1.318711085329387e-06, + "loss": 0.6164, + "step": 7154 + }, + { + "epoch": 0.879100626612606, + "grad_norm": 1.0850454851573872, + "learning_rate": 1.3160746983335437e-06, + "loss": 0.6293, + "step": 7155 + }, + { + "epoch": 0.8792234918294631, + "grad_norm": 1.0511073775664987, + "learning_rate": 1.3134408284070115e-06, + "loss": 0.6565, + "step": 7156 + }, + { + "epoch": 0.8793463570463201, + "grad_norm": 1.3999461297396405, + "learning_rate": 1.310809476034283e-06, + "loss": 0.6137, + "step": 7157 + }, + { + "epoch": 0.8794692222631773, + "grad_norm": 1.2566963277294994, + "learning_rate": 1.3081806416993714e-06, + "loss": 0.6139, + "step": 7158 + }, + { + "epoch": 0.8795920874800344, + "grad_norm": 1.086665724845854, + "learning_rate": 1.305554325885836e-06, + "loss": 0.5238, + "step": 7159 + }, + { + "epoch": 0.8797149526968915, + "grad_norm": 1.0383189725746282, + "learning_rate": 1.3029305290767708e-06, + "loss": 0.5885, + "step": 7160 + }, + { + "epoch": 0.8798378179137486, + "grad_norm": 1.176676143131754, + "learning_rate": 1.3003092517548076e-06, + "loss": 0.5437, + "step": 7161 + }, + { + "epoch": 0.8799606831306057, + "grad_norm": 1.0363111515423418, + "learning_rate": 1.2976904944021112e-06, + "loss": 0.5127, + "step": 7162 + }, + { + "epoch": 0.8800835483474628, + "grad_norm": 1.2338249051880787, + "learning_rate": 1.2950742575003843e-06, + "loss": 0.5631, + "step": 7163 + }, + { + "epoch": 0.88020641356432, + "grad_norm": 1.0953819034194399, + "learning_rate": 1.2924605415308722e-06, + "loss": 0.4927, + "step": 7164 + }, + { + "epoch": 0.8803292787811771, + "grad_norm": 0.9599801064516802, + "learning_rate": 1.2898493469743433e-06, + "loss": 0.651, + "step": 7165 + }, + { + "epoch": 0.8804521439980342, + "grad_norm": 1.1466090956054567, + "learning_rate": 1.287240674311117e-06, + "loss": 0.6026, + "step": 7166 + }, + { + "epoch": 0.8805750092148913, + "grad_norm": 1.5484140955339734, + "learning_rate": 1.284634524021031e-06, + "loss": 0.5622, + "step": 7167 + }, + { + "epoch": 0.8806978744317484, + "grad_norm": 1.5410232668661374, + "learning_rate": 1.2820308965834854e-06, + "loss": 0.5479, + "step": 7168 + }, + { + "epoch": 0.8808207396486055, + "grad_norm": 0.9846117607644819, + "learning_rate": 1.2794297924773868e-06, + "loss": 0.5958, + "step": 7169 + }, + { + "epoch": 0.8809436048654626, + "grad_norm": 1.2360515580937632, + "learning_rate": 1.2768312121812008e-06, + "loss": 0.5134, + "step": 7170 + }, + { + "epoch": 0.8810664700823196, + "grad_norm": 
1.1697933108967524, + "learning_rate": 1.2742351561729138e-06, + "loss": 0.5869, + "step": 7171 + }, + { + "epoch": 0.8811893352991768, + "grad_norm": 1.324341639706237, + "learning_rate": 1.2716416249300532e-06, + "loss": 0.5236, + "step": 7172 + }, + { + "epoch": 0.8813122005160339, + "grad_norm": 1.1612275670285621, + "learning_rate": 1.269050618929688e-06, + "loss": 0.5429, + "step": 7173 + }, + { + "epoch": 0.881435065732891, + "grad_norm": 1.1795503643393226, + "learning_rate": 1.2664621386484098e-06, + "loss": 0.5946, + "step": 7174 + }, + { + "epoch": 0.8815579309497481, + "grad_norm": 1.1060931868689656, + "learning_rate": 1.2638761845623565e-06, + "loss": 0.6443, + "step": 7175 + }, + { + "epoch": 0.8816807961666052, + "grad_norm": 1.2874056093639015, + "learning_rate": 1.2612927571471972e-06, + "loss": 0.6115, + "step": 7176 + }, + { + "epoch": 0.8818036613834623, + "grad_norm": 1.0534853943589673, + "learning_rate": 1.2587118568781387e-06, + "loss": 0.487, + "step": 7177 + }, + { + "epoch": 0.8819265266003194, + "grad_norm": 1.1666029203287116, + "learning_rate": 1.2561334842299161e-06, + "loss": 0.4891, + "step": 7178 + }, + { + "epoch": 0.8820493918171766, + "grad_norm": 1.1689607538584867, + "learning_rate": 1.2535576396768085e-06, + "loss": 0.5274, + "step": 7179 + }, + { + "epoch": 0.8821722570340337, + "grad_norm": 1.188423261784822, + "learning_rate": 1.250984323692625e-06, + "loss": 0.611, + "step": 7180 + }, + { + "epoch": 0.8822951222508908, + "grad_norm": 1.4810712488683937, + "learning_rate": 1.248413536750707e-06, + "loss": 0.6182, + "step": 7181 + }, + { + "epoch": 0.8824179874677479, + "grad_norm": 1.123631806181217, + "learning_rate": 1.2458452793239383e-06, + "loss": 0.4948, + "step": 7182 + }, + { + "epoch": 0.882540852684605, + "grad_norm": 1.1709921634849327, + "learning_rate": 1.2432795518847306e-06, + "loss": 0.5025, + "step": 7183 + }, + { + "epoch": 0.8826637179014621, + "grad_norm": 1.2450626382885601, + "learning_rate": 1.2407163549050366e-06, + "loss": 0.503, + "step": 7184 + }, + { + "epoch": 0.8827865831183193, + "grad_norm": 1.2321987402173789, + "learning_rate": 1.2381556888563338e-06, + "loss": 0.6218, + "step": 7185 + }, + { + "epoch": 0.8829094483351763, + "grad_norm": 1.3067392918251524, + "learning_rate": 1.2355975542096444e-06, + "loss": 0.5989, + "step": 7186 + }, + { + "epoch": 0.8830323135520334, + "grad_norm": 1.123352387251633, + "learning_rate": 1.2330419514355195e-06, + "loss": 0.4611, + "step": 7187 + }, + { + "epoch": 0.8831551787688905, + "grad_norm": 1.2481998589799197, + "learning_rate": 1.2304888810040487e-06, + "loss": 0.5839, + "step": 7188 + }, + { + "epoch": 0.8832780439857476, + "grad_norm": 1.1376594699823634, + "learning_rate": 1.2279383433848462e-06, + "loss": 0.6097, + "step": 7189 + }, + { + "epoch": 0.8834009092026047, + "grad_norm": 1.228377304117327, + "learning_rate": 1.2253903390470717e-06, + "loss": 0.4284, + "step": 7190 + }, + { + "epoch": 0.8835237744194618, + "grad_norm": 1.3570942305967904, + "learning_rate": 1.222844868459415e-06, + "loss": 0.5888, + "step": 7191 + }, + { + "epoch": 0.883646639636319, + "grad_norm": 1.1423632676601039, + "learning_rate": 1.2203019320900938e-06, + "loss": 0.5084, + "step": 7192 + }, + { + "epoch": 0.8837695048531761, + "grad_norm": 1.209855595078981, + "learning_rate": 1.2177615304068702e-06, + "loss": 0.5875, + "step": 7193 + }, + { + "epoch": 0.8838923700700332, + "grad_norm": 1.1827062672941506, + "learning_rate": 1.215223663877027e-06, + "loss": 0.5442, + "step": 7194 + 
}, + { + "epoch": 0.8840152352868903, + "grad_norm": 1.2713788544819864, + "learning_rate": 1.2126883329673977e-06, + "loss": 0.5418, + "step": 7195 + }, + { + "epoch": 0.8841381005037474, + "grad_norm": 0.935256179299919, + "learning_rate": 1.2101555381443341e-06, + "loss": 0.5248, + "step": 7196 + }, + { + "epoch": 0.8842609657206045, + "grad_norm": 1.2719315501624957, + "learning_rate": 1.2076252798737318e-06, + "loss": 0.5417, + "step": 7197 + }, + { + "epoch": 0.8843838309374616, + "grad_norm": 1.3907404704377302, + "learning_rate": 1.2050975586210106e-06, + "loss": 0.5545, + "step": 7198 + }, + { + "epoch": 0.8845066961543188, + "grad_norm": 1.2597781208801382, + "learning_rate": 1.2025723748511297e-06, + "loss": 0.5836, + "step": 7199 + }, + { + "epoch": 0.8846295613711758, + "grad_norm": 1.1563419191438897, + "learning_rate": 1.2000497290285827e-06, + "loss": 0.5405, + "step": 7200 + }, + { + "epoch": 0.8847524265880329, + "grad_norm": 1.0795588042996045, + "learning_rate": 1.1975296216173887e-06, + "loss": 0.4926, + "step": 7201 + }, + { + "epoch": 0.88487529180489, + "grad_norm": 0.9615561487762116, + "learning_rate": 1.1950120530811131e-06, + "loss": 0.4986, + "step": 7202 + }, + { + "epoch": 0.8849981570217471, + "grad_norm": 1.2529066913400242, + "learning_rate": 1.1924970238828393e-06, + "loss": 0.5281, + "step": 7203 + }, + { + "epoch": 0.8851210222386042, + "grad_norm": 1.2438796072048526, + "learning_rate": 1.1899845344851951e-06, + "loss": 0.5438, + "step": 7204 + }, + { + "epoch": 0.8852438874554613, + "grad_norm": 1.1936971080322796, + "learning_rate": 1.1874745853503293e-06, + "loss": 0.5115, + "step": 7205 + }, + { + "epoch": 0.8853667526723185, + "grad_norm": 1.2284166379319617, + "learning_rate": 1.1849671769399427e-06, + "loss": 0.6159, + "step": 7206 + }, + { + "epoch": 0.8854896178891756, + "grad_norm": 1.0585751806795822, + "learning_rate": 1.1824623097152466e-06, + "loss": 0.595, + "step": 7207 + }, + { + "epoch": 0.8856124831060327, + "grad_norm": 1.1157502875188798, + "learning_rate": 1.179959984137002e-06, + "loss": 0.5019, + "step": 7208 + }, + { + "epoch": 0.8857353483228898, + "grad_norm": 1.9311462710645566, + "learning_rate": 1.1774602006654888e-06, + "loss": 0.6685, + "step": 7209 + }, + { + "epoch": 0.8858582135397469, + "grad_norm": 1.0373406267833987, + "learning_rate": 1.1749629597605299e-06, + "loss": 0.6161, + "step": 7210 + }, + { + "epoch": 0.885981078756604, + "grad_norm": 1.1183663905587788, + "learning_rate": 1.1724682618814792e-06, + "loss": 0.7076, + "step": 7211 + }, + { + "epoch": 0.8861039439734612, + "grad_norm": 1.3386467949589127, + "learning_rate": 1.1699761074872128e-06, + "loss": 0.6747, + "step": 7212 + }, + { + "epoch": 0.8862268091903183, + "grad_norm": 1.3666089790074722, + "learning_rate": 1.1674864970361527e-06, + "loss": 0.6983, + "step": 7213 + }, + { + "epoch": 0.8863496744071754, + "grad_norm": 1.5738903178389743, + "learning_rate": 1.164999430986242e-06, + "loss": 0.5801, + "step": 7214 + }, + { + "epoch": 0.8864725396240324, + "grad_norm": 1.1612066991584469, + "learning_rate": 1.1625149097949672e-06, + "loss": 0.5588, + "step": 7215 + }, + { + "epoch": 0.8865954048408895, + "grad_norm": 1.1786335109364945, + "learning_rate": 1.1600329339193321e-06, + "loss": 0.6341, + "step": 7216 + }, + { + "epoch": 0.8867182700577466, + "grad_norm": 1.2504332538046197, + "learning_rate": 1.1575535038158852e-06, + "loss": 0.577, + "step": 7217 + }, + { + "epoch": 0.8868411352746037, + "grad_norm": 1.4573122487253933, + 
"learning_rate": 1.1550766199407014e-06, + "loss": 0.5181, + "step": 7218 + }, + { + "epoch": 0.8869640004914608, + "grad_norm": 1.0602207128836274, + "learning_rate": 1.1526022827493832e-06, + "loss": 0.5152, + "step": 7219 + }, + { + "epoch": 0.887086865708318, + "grad_norm": 1.1291236613259268, + "learning_rate": 1.1501304926970728e-06, + "loss": 0.6097, + "step": 7220 + }, + { + "epoch": 0.8872097309251751, + "grad_norm": 1.0998580106317288, + "learning_rate": 1.1476612502384354e-06, + "loss": 0.5362, + "step": 7221 + }, + { + "epoch": 0.8873325961420322, + "grad_norm": 1.1634115628205506, + "learning_rate": 1.1451945558276788e-06, + "loss": 0.529, + "step": 7222 + }, + { + "epoch": 0.8874554613588893, + "grad_norm": 1.1261881269032434, + "learning_rate": 1.142730409918532e-06, + "loss": 0.4753, + "step": 7223 + }, + { + "epoch": 0.8875783265757464, + "grad_norm": 1.3045338525537384, + "learning_rate": 1.1402688129642575e-06, + "loss": 0.6128, + "step": 7224 + }, + { + "epoch": 0.8877011917926035, + "grad_norm": 0.945541942593487, + "learning_rate": 1.137809765417651e-06, + "loss": 0.5363, + "step": 7225 + }, + { + "epoch": 0.8878240570094607, + "grad_norm": 1.2117866789053169, + "learning_rate": 1.1353532677310413e-06, + "loss": 0.5872, + "step": 7226 + }, + { + "epoch": 0.8879469222263178, + "grad_norm": 1.130293352562666, + "learning_rate": 1.13289932035628e-06, + "loss": 0.633, + "step": 7227 + }, + { + "epoch": 0.8880697874431749, + "grad_norm": 0.9554885986758416, + "learning_rate": 1.1304479237447574e-06, + "loss": 0.6931, + "step": 7228 + }, + { + "epoch": 0.888192652660032, + "grad_norm": 1.1352119313475202, + "learning_rate": 1.1279990783473948e-06, + "loss": 0.5552, + "step": 7229 + }, + { + "epoch": 0.888315517876889, + "grad_norm": 1.640121584954948, + "learning_rate": 1.1255527846146369e-06, + "loss": 0.7, + "step": 7230 + }, + { + "epoch": 0.8884383830937461, + "grad_norm": 1.3523719108706085, + "learning_rate": 1.1231090429964668e-06, + "loss": 0.5682, + "step": 7231 + }, + { + "epoch": 0.8885612483106032, + "grad_norm": 1.4702060282505494, + "learning_rate": 1.1206678539423886e-06, + "loss": 0.484, + "step": 7232 + }, + { + "epoch": 0.8886841135274604, + "grad_norm": 1.0367980142722115, + "learning_rate": 1.118229217901453e-06, + "loss": 0.5421, + "step": 7233 + }, + { + "epoch": 0.8888069787443175, + "grad_norm": 1.0867618530217535, + "learning_rate": 1.1157931353222244e-06, + "loss": 0.5567, + "step": 7234 + }, + { + "epoch": 0.8889298439611746, + "grad_norm": 1.1299165092939212, + "learning_rate": 1.1133596066528079e-06, + "loss": 0.6433, + "step": 7235 + }, + { + "epoch": 0.8890527091780317, + "grad_norm": 1.2222239779128936, + "learning_rate": 1.1109286323408318e-06, + "loss": 0.5444, + "step": 7236 + }, + { + "epoch": 0.8891755743948888, + "grad_norm": 1.1696469270630028, + "learning_rate": 1.1085002128334603e-06, + "loss": 0.6137, + "step": 7237 + }, + { + "epoch": 0.8892984396117459, + "grad_norm": 1.0149359633313544, + "learning_rate": 1.1060743485773861e-06, + "loss": 0.5468, + "step": 7238 + }, + { + "epoch": 0.889421304828603, + "grad_norm": 1.0579148150558437, + "learning_rate": 1.1036510400188287e-06, + "loss": 0.498, + "step": 7239 + }, + { + "epoch": 0.8895441700454602, + "grad_norm": 1.1524838903702488, + "learning_rate": 1.101230287603542e-06, + "loss": 0.4918, + "step": 7240 + }, + { + "epoch": 0.8896670352623173, + "grad_norm": 1.1250495621250822, + "learning_rate": 1.0988120917768074e-06, + "loss": 0.5677, + "step": 7241 + }, + { + "epoch": 
0.8897899004791744, + "grad_norm": 1.5174278810458959, + "learning_rate": 1.0963964529834381e-06, + "loss": 0.5923, + "step": 7242 + }, + { + "epoch": 0.8899127656960315, + "grad_norm": 1.0790610681057444, + "learning_rate": 1.0939833716677683e-06, + "loss": 0.6502, + "step": 7243 + }, + { + "epoch": 0.8900356309128885, + "grad_norm": 1.2230323642657168, + "learning_rate": 1.091572848273678e-06, + "loss": 0.5676, + "step": 7244 + }, + { + "epoch": 0.8901584961297456, + "grad_norm": 1.271777255624629, + "learning_rate": 1.0891648832445611e-06, + "loss": 0.6082, + "step": 7245 + }, + { + "epoch": 0.8902813613466027, + "grad_norm": 1.0587205141622902, + "learning_rate": 1.0867594770233514e-06, + "loss": 0.684, + "step": 7246 + }, + { + "epoch": 0.8904042265634599, + "grad_norm": 1.2313051131954105, + "learning_rate": 1.084356630052503e-06, + "loss": 0.4649, + "step": 7247 + }, + { + "epoch": 0.890527091780317, + "grad_norm": 1.1404455016178037, + "learning_rate": 1.0819563427740064e-06, + "loss": 0.7054, + "step": 7248 + }, + { + "epoch": 0.8906499569971741, + "grad_norm": 1.3193714657095157, + "learning_rate": 1.0795586156293814e-06, + "loss": 0.559, + "step": 7249 + }, + { + "epoch": 0.8907728222140312, + "grad_norm": 1.0501910562856978, + "learning_rate": 1.0771634490596683e-06, + "loss": 0.5741, + "step": 7250 + }, + { + "epoch": 0.8908956874308883, + "grad_norm": 0.9572939625227387, + "learning_rate": 1.0747708435054464e-06, + "loss": 0.5601, + "step": 7251 + }, + { + "epoch": 0.8910185526477454, + "grad_norm": 1.0899804295723317, + "learning_rate": 1.0723807994068208e-06, + "loss": 0.6158, + "step": 7252 + }, + { + "epoch": 0.8911414178646025, + "grad_norm": 1.2243014809440305, + "learning_rate": 1.0699933172034242e-06, + "loss": 0.6259, + "step": 7253 + }, + { + "epoch": 0.8912642830814597, + "grad_norm": 1.2603056220008562, + "learning_rate": 1.0676083973344158e-06, + "loss": 0.5946, + "step": 7254 + }, + { + "epoch": 0.8913871482983168, + "grad_norm": 1.1244771878363886, + "learning_rate": 1.0652260402384895e-06, + "loss": 0.5708, + "step": 7255 + }, + { + "epoch": 0.8915100135151739, + "grad_norm": 1.35318692698293, + "learning_rate": 1.062846246353863e-06, + "loss": 0.622, + "step": 7256 + }, + { + "epoch": 0.891632878732031, + "grad_norm": 1.1302563019519738, + "learning_rate": 1.0604690161182827e-06, + "loss": 0.4932, + "step": 7257 + }, + { + "epoch": 0.8917557439488881, + "grad_norm": 1.2787160669608806, + "learning_rate": 1.0580943499690277e-06, + "loss": 0.5568, + "step": 7258 + }, + { + "epoch": 0.8918786091657451, + "grad_norm": 1.226956107705599, + "learning_rate": 1.0557222483428962e-06, + "loss": 0.6041, + "step": 7259 + }, + { + "epoch": 0.8920014743826022, + "grad_norm": 1.2608832895937212, + "learning_rate": 1.0533527116762298e-06, + "loss": 0.495, + "step": 7260 + }, + { + "epoch": 0.8921243395994594, + "grad_norm": 1.2448200823464721, + "learning_rate": 1.0509857404048827e-06, + "loss": 0.4802, + "step": 7261 + }, + { + "epoch": 0.8922472048163165, + "grad_norm": 1.214137528680801, + "learning_rate": 1.0486213349642486e-06, + "loss": 0.6519, + "step": 7262 + }, + { + "epoch": 0.8923700700331736, + "grad_norm": 1.1970609304671151, + "learning_rate": 1.046259495789238e-06, + "loss": 0.5857, + "step": 7263 + }, + { + "epoch": 0.8924929352500307, + "grad_norm": 1.1675089442779372, + "learning_rate": 1.043900223314303e-06, + "loss": 0.5589, + "step": 7264 + }, + { + "epoch": 0.8926158004668878, + "grad_norm": 1.0915150767746276, + "learning_rate": 1.0415435179734118e-06, 
+ "loss": 0.5837, + "step": 7265 + }, + { + "epoch": 0.8927386656837449, + "grad_norm": 1.375100870048378, + "learning_rate": 1.0391893802000674e-06, + "loss": 0.5644, + "step": 7266 + }, + { + "epoch": 0.892861530900602, + "grad_norm": 1.4742278386791212, + "learning_rate": 1.0368378104272986e-06, + "loss": 0.5946, + "step": 7267 + }, + { + "epoch": 0.8929843961174592, + "grad_norm": 1.3462286628849933, + "learning_rate": 1.0344888090876592e-06, + "loss": 0.5716, + "step": 7268 + }, + { + "epoch": 0.8931072613343163, + "grad_norm": 1.4163741265879155, + "learning_rate": 1.0321423766132354e-06, + "loss": 0.5364, + "step": 7269 + }, + { + "epoch": 0.8932301265511734, + "grad_norm": 1.3958915175805189, + "learning_rate": 1.0297985134356319e-06, + "loss": 0.6212, + "step": 7270 + }, + { + "epoch": 0.8933529917680305, + "grad_norm": 1.2738968099861134, + "learning_rate": 1.0274572199859972e-06, + "loss": 0.5785, + "step": 7271 + }, + { + "epoch": 0.8934758569848876, + "grad_norm": 1.3110429055982349, + "learning_rate": 1.0251184966949883e-06, + "loss": 0.4934, + "step": 7272 + }, + { + "epoch": 0.8935987222017446, + "grad_norm": 1.122914332710962, + "learning_rate": 1.0227823439928065e-06, + "loss": 0.5617, + "step": 7273 + }, + { + "epoch": 0.8937215874186017, + "grad_norm": 1.114688710310556, + "learning_rate": 1.0204487623091624e-06, + "loss": 0.5135, + "step": 7274 + }, + { + "epoch": 0.8938444526354589, + "grad_norm": 1.2671920452450514, + "learning_rate": 1.0181177520733082e-06, + "loss": 0.5827, + "step": 7275 + }, + { + "epoch": 0.893967317852316, + "grad_norm": 1.3284198256686657, + "learning_rate": 1.0157893137140206e-06, + "loss": 0.4771, + "step": 7276 + }, + { + "epoch": 0.8940901830691731, + "grad_norm": 1.0456814829348866, + "learning_rate": 1.0134634476595955e-06, + "loss": 0.647, + "step": 7277 + }, + { + "epoch": 0.8942130482860302, + "grad_norm": 1.2622693316726232, + "learning_rate": 1.011140154337864e-06, + "loss": 0.5347, + "step": 7278 + }, + { + "epoch": 0.8943359135028873, + "grad_norm": 1.2188678578124523, + "learning_rate": 1.0088194341761792e-06, + "loss": 0.5788, + "step": 7279 + }, + { + "epoch": 0.8944587787197444, + "grad_norm": 1.3005895432318344, + "learning_rate": 1.0065012876014261e-06, + "loss": 0.5629, + "step": 7280 + }, + { + "epoch": 0.8945816439366016, + "grad_norm": 1.155093049983235, + "learning_rate": 1.0041857150400075e-06, + "loss": 0.5285, + "step": 7281 + }, + { + "epoch": 0.8947045091534587, + "grad_norm": 0.9982231635566494, + "learning_rate": 1.0018727169178604e-06, + "loss": 0.5235, + "step": 7282 + }, + { + "epoch": 0.8948273743703158, + "grad_norm": 1.3670901712552659, + "learning_rate": 9.995622936604465e-07, + "loss": 0.6342, + "step": 7283 + }, + { + "epoch": 0.8949502395871729, + "grad_norm": 1.3825952581684497, + "learning_rate": 9.972544456927556e-07, + "loss": 0.6068, + "step": 7284 + }, + { + "epoch": 0.89507310480403, + "grad_norm": 1.0775106795531932, + "learning_rate": 9.949491734392952e-07, + "loss": 0.6023, + "step": 7285 + }, + { + "epoch": 0.8951959700208871, + "grad_norm": 1.3810480867200448, + "learning_rate": 9.926464773241089e-07, + "loss": 0.5659, + "step": 7286 + }, + { + "epoch": 0.8953188352377442, + "grad_norm": 1.2297569102728276, + "learning_rate": 9.90346357770765e-07, + "loss": 0.5416, + "step": 7287 + }, + { + "epoch": 0.8954417004546013, + "grad_norm": 1.320118790468435, + "learning_rate": 9.880488152023499e-07, + "loss": 0.5088, + "step": 7288 + }, + { + "epoch": 0.8955645656714584, + "grad_norm": 
1.3582348185876296, + "learning_rate": 9.857538500414837e-07, + "loss": 0.5407, + "step": 7289 + }, + { + "epoch": 0.8956874308883155, + "grad_norm": 1.20407267056099, + "learning_rate": 9.834614627103123e-07, + "loss": 0.5158, + "step": 7290 + }, + { + "epoch": 0.8958102961051726, + "grad_norm": 1.085646674353756, + "learning_rate": 9.811716536305066e-07, + "loss": 0.6043, + "step": 7291 + }, + { + "epoch": 0.8959331613220297, + "grad_norm": 1.4813585261395066, + "learning_rate": 9.788844232232563e-07, + "loss": 0.5506, + "step": 7292 + }, + { + "epoch": 0.8960560265388868, + "grad_norm": 1.4126614811120668, + "learning_rate": 9.765997719092867e-07, + "loss": 0.5474, + "step": 7293 + }, + { + "epoch": 0.896178891755744, + "grad_norm": 1.328891301200044, + "learning_rate": 9.743177001088482e-07, + "loss": 0.605, + "step": 7294 + }, + { + "epoch": 0.8963017569726011, + "grad_norm": 1.4395184420919336, + "learning_rate": 9.720382082417052e-07, + "loss": 0.6257, + "step": 7295 + }, + { + "epoch": 0.8964246221894582, + "grad_norm": 1.1894733726966868, + "learning_rate": 9.69761296727162e-07, + "loss": 0.6622, + "step": 7296 + }, + { + "epoch": 0.8965474874063153, + "grad_norm": 1.1013187674232772, + "learning_rate": 9.674869659840334e-07, + "loss": 0.4495, + "step": 7297 + }, + { + "epoch": 0.8966703526231724, + "grad_norm": 1.09016997879365, + "learning_rate": 9.652152164306788e-07, + "loss": 0.5486, + "step": 7298 + }, + { + "epoch": 0.8967932178400295, + "grad_norm": 1.0065132755106543, + "learning_rate": 9.62946048484965e-07, + "loss": 0.4664, + "step": 7299 + }, + { + "epoch": 0.8969160830568866, + "grad_norm": 1.2696243635091953, + "learning_rate": 9.606794625642934e-07, + "loss": 0.6572, + "step": 7300 + }, + { + "epoch": 0.8970389482737438, + "grad_norm": 1.1698437265247876, + "learning_rate": 9.584154590855836e-07, + "loss": 0.6262, + "step": 7301 + }, + { + "epoch": 0.8971618134906008, + "grad_norm": 1.3595150993129614, + "learning_rate": 9.561540384652879e-07, + "loss": 0.5893, + "step": 7302 + }, + { + "epoch": 0.8972846787074579, + "grad_norm": 1.330477211949914, + "learning_rate": 9.538952011193814e-07, + "loss": 0.722, + "step": 7303 + }, + { + "epoch": 0.897407543924315, + "grad_norm": 1.153633398543148, + "learning_rate": 9.516389474633585e-07, + "loss": 0.5303, + "step": 7304 + }, + { + "epoch": 0.8975304091411721, + "grad_norm": 1.1145931237867663, + "learning_rate": 9.493852779122441e-07, + "loss": 0.5859, + "step": 7305 + }, + { + "epoch": 0.8976532743580292, + "grad_norm": 1.2983873111713549, + "learning_rate": 9.471341928805865e-07, + "loss": 0.5848, + "step": 7306 + }, + { + "epoch": 0.8977761395748863, + "grad_norm": 1.0359622897393745, + "learning_rate": 9.448856927824612e-07, + "loss": 0.5828, + "step": 7307 + }, + { + "epoch": 0.8978990047917434, + "grad_norm": 1.2133637287357546, + "learning_rate": 9.426397780314555e-07, + "loss": 0.5753, + "step": 7308 + }, + { + "epoch": 0.8980218700086006, + "grad_norm": 1.0699368348074108, + "learning_rate": 9.403964490407041e-07, + "loss": 0.5668, + "step": 7309 + }, + { + "epoch": 0.8981447352254577, + "grad_norm": 1.2555050927561713, + "learning_rate": 9.381557062228435e-07, + "loss": 0.5991, + "step": 7310 + }, + { + "epoch": 0.8982676004423148, + "grad_norm": 1.3332569577853675, + "learning_rate": 9.359175499900474e-07, + "loss": 0.6055, + "step": 7311 + }, + { + "epoch": 0.8983904656591719, + "grad_norm": 1.0591066447713628, + "learning_rate": 9.336819807540081e-07, + "loss": 0.4539, + "step": 7312 + }, + { + "epoch": 
0.898513330876029, + "grad_norm": 1.3148002346304082, + "learning_rate": 9.31448998925945e-07, + "loss": 0.6561, + "step": 7313 + }, + { + "epoch": 0.8986361960928861, + "grad_norm": 1.3052925287511619, + "learning_rate": 9.292186049166029e-07, + "loss": 0.5654, + "step": 7314 + }, + { + "epoch": 0.8987590613097433, + "grad_norm": 1.0966535376600863, + "learning_rate": 9.269907991362436e-07, + "loss": 0.4592, + "step": 7315 + }, + { + "epoch": 0.8988819265266004, + "grad_norm": 1.0840102359341908, + "learning_rate": 9.247655819946609e-07, + "loss": 0.5061, + "step": 7316 + }, + { + "epoch": 0.8990047917434574, + "grad_norm": 1.1533568375466714, + "learning_rate": 9.225429539011676e-07, + "loss": 0.5649, + "step": 7317 + }, + { + "epoch": 0.8991276569603145, + "grad_norm": 1.172659874331371, + "learning_rate": 9.203229152646047e-07, + "loss": 0.4254, + "step": 7318 + }, + { + "epoch": 0.8992505221771716, + "grad_norm": 1.3081768539933165, + "learning_rate": 9.181054664933291e-07, + "loss": 0.6356, + "step": 7319 + }, + { + "epoch": 0.8993733873940287, + "grad_norm": 1.232129063467181, + "learning_rate": 9.158906079952295e-07, + "loss": 0.5803, + "step": 7320 + }, + { + "epoch": 0.8994962526108858, + "grad_norm": 1.2162229434818421, + "learning_rate": 9.136783401777165e-07, + "loss": 0.5807, + "step": 7321 + }, + { + "epoch": 0.899619117827743, + "grad_norm": 1.2598601904897229, + "learning_rate": 9.114686634477165e-07, + "loss": 0.626, + "step": 7322 + }, + { + "epoch": 0.8997419830446001, + "grad_norm": 1.5135268279666063, + "learning_rate": 9.092615782116909e-07, + "loss": 0.5313, + "step": 7323 + }, + { + "epoch": 0.8998648482614572, + "grad_norm": 1.44424354448404, + "learning_rate": 9.070570848756116e-07, + "loss": 0.6485, + "step": 7324 + }, + { + "epoch": 0.8999877134783143, + "grad_norm": 1.174379515220077, + "learning_rate": 9.048551838449909e-07, + "loss": 0.5931, + "step": 7325 + }, + { + "epoch": 0.9001105786951714, + "grad_norm": 1.4638361197695775, + "learning_rate": 9.026558755248465e-07, + "loss": 0.5528, + "step": 7326 + }, + { + "epoch": 0.9002334439120285, + "grad_norm": 1.0861841442848719, + "learning_rate": 9.004591603197315e-07, + "loss": 0.5048, + "step": 7327 + }, + { + "epoch": 0.9003563091288856, + "grad_norm": 1.1543783104942913, + "learning_rate": 8.98265038633711e-07, + "loss": 0.6044, + "step": 7328 + }, + { + "epoch": 0.9004791743457428, + "grad_norm": 1.4215039371831497, + "learning_rate": 8.960735108703872e-07, + "loss": 0.5652, + "step": 7329 + }, + { + "epoch": 0.9006020395625999, + "grad_norm": 1.0815430297678506, + "learning_rate": 8.938845774328725e-07, + "loss": 0.6115, + "step": 7330 + }, + { + "epoch": 0.9007249047794569, + "grad_norm": 1.3477305195612737, + "learning_rate": 8.916982387238082e-07, + "loss": 0.635, + "step": 7331 + }, + { + "epoch": 0.900847769996314, + "grad_norm": 1.1447310104823756, + "learning_rate": 8.895144951453593e-07, + "loss": 0.5478, + "step": 7332 + }, + { + "epoch": 0.9009706352131711, + "grad_norm": 1.3250492046357754, + "learning_rate": 8.873333470992079e-07, + "loss": 0.5218, + "step": 7333 + }, + { + "epoch": 0.9010935004300282, + "grad_norm": 1.1779475331273581, + "learning_rate": 8.851547949865646e-07, + "loss": 0.5997, + "step": 7334 + }, + { + "epoch": 0.9012163656468853, + "grad_norm": 2.0197976450066206, + "learning_rate": 8.82978839208154e-07, + "loss": 0.6826, + "step": 7335 + }, + { + "epoch": 0.9013392308637425, + "grad_norm": 0.9825328881908474, + "learning_rate": 8.808054801642407e-07, + "loss": 0.4703, + 
"step": 7336 + }, + { + "epoch": 0.9014620960805996, + "grad_norm": 1.3726740750921003, + "learning_rate": 8.786347182545884e-07, + "loss": 0.4609, + "step": 7337 + }, + { + "epoch": 0.9015849612974567, + "grad_norm": 1.26716723324268, + "learning_rate": 8.764665538785028e-07, + "loss": 0.5731, + "step": 7338 + }, + { + "epoch": 0.9017078265143138, + "grad_norm": 1.6136966989182453, + "learning_rate": 8.743009874347979e-07, + "loss": 0.6178, + "step": 7339 + }, + { + "epoch": 0.9018306917311709, + "grad_norm": 1.0349168180811938, + "learning_rate": 8.72138019321817e-07, + "loss": 0.5444, + "step": 7340 + }, + { + "epoch": 0.901953556948028, + "grad_norm": 1.3998477294530673, + "learning_rate": 8.699776499374285e-07, + "loss": 0.6279, + "step": 7341 + }, + { + "epoch": 0.9020764221648851, + "grad_norm": 1.0439567759953747, + "learning_rate": 8.678198796790126e-07, + "loss": 0.5017, + "step": 7342 + }, + { + "epoch": 0.9021992873817423, + "grad_norm": 1.6416446193590235, + "learning_rate": 8.656647089434788e-07, + "loss": 0.6674, + "step": 7343 + }, + { + "epoch": 0.9023221525985994, + "grad_norm": 1.1684008348218362, + "learning_rate": 8.635121381272582e-07, + "loss": 0.4792, + "step": 7344 + }, + { + "epoch": 0.9024450178154565, + "grad_norm": 0.9744367708902468, + "learning_rate": 8.613621676263023e-07, + "loss": 0.5563, + "step": 7345 + }, + { + "epoch": 0.9025678830323135, + "grad_norm": 1.4420577459180015, + "learning_rate": 8.592147978360831e-07, + "loss": 0.582, + "step": 7346 + }, + { + "epoch": 0.9026907482491706, + "grad_norm": 1.2768524781892316, + "learning_rate": 8.570700291515948e-07, + "loss": 0.5791, + "step": 7347 + }, + { + "epoch": 0.9028136134660277, + "grad_norm": 1.177967796409102, + "learning_rate": 8.549278619673534e-07, + "loss": 0.4895, + "step": 7348 + }, + { + "epoch": 0.9029364786828848, + "grad_norm": 1.1969329154218218, + "learning_rate": 8.527882966774003e-07, + "loss": 0.612, + "step": 7349 + }, + { + "epoch": 0.903059343899742, + "grad_norm": 1.2463790956060845, + "learning_rate": 8.506513336752908e-07, + "loss": 0.5258, + "step": 7350 + }, + { + "epoch": 0.9031822091165991, + "grad_norm": 1.3361240402285488, + "learning_rate": 8.485169733541071e-07, + "loss": 0.5068, + "step": 7351 + }, + { + "epoch": 0.9033050743334562, + "grad_norm": 1.485243495167198, + "learning_rate": 8.463852161064517e-07, + "loss": 0.661, + "step": 7352 + }, + { + "epoch": 0.9034279395503133, + "grad_norm": 1.1820307583357978, + "learning_rate": 8.442560623244444e-07, + "loss": 0.5496, + "step": 7353 + }, + { + "epoch": 0.9035508047671704, + "grad_norm": 1.2407683765698518, + "learning_rate": 8.421295123997319e-07, + "loss": 0.5609, + "step": 7354 + }, + { + "epoch": 0.9036736699840275, + "grad_norm": 0.9828791103033973, + "learning_rate": 8.400055667234779e-07, + "loss": 0.5576, + "step": 7355 + }, + { + "epoch": 0.9037965352008847, + "grad_norm": 1.1313495418356077, + "learning_rate": 8.378842256863717e-07, + "loss": 0.4856, + "step": 7356 + }, + { + "epoch": 0.9039194004177418, + "grad_norm": 0.9925762408241992, + "learning_rate": 8.357654896786143e-07, + "loss": 0.6221, + "step": 7357 + }, + { + "epoch": 0.9040422656345989, + "grad_norm": 1.338122705402375, + "learning_rate": 8.336493590899391e-07, + "loss": 0.5688, + "step": 7358 + }, + { + "epoch": 0.904165130851456, + "grad_norm": 1.3300157745621437, + "learning_rate": 8.31535834309593e-07, + "loss": 0.7752, + "step": 7359 + }, + { + "epoch": 0.9042879960683131, + "grad_norm": 1.1740511905024327, + "learning_rate": 
8.294249157263417e-07, + "loss": 0.5766, + "step": 7360 + }, + { + "epoch": 0.9044108612851701, + "grad_norm": 1.3953858565085957, + "learning_rate": 8.273166037284812e-07, + "loss": 0.4916, + "step": 7361 + }, + { + "epoch": 0.9045337265020272, + "grad_norm": 1.0408673407880593, + "learning_rate": 8.252108987038131e-07, + "loss": 0.6251, + "step": 7362 + }, + { + "epoch": 0.9046565917188844, + "grad_norm": 1.1768182037781447, + "learning_rate": 8.231078010396775e-07, + "loss": 0.4799, + "step": 7363 + }, + { + "epoch": 0.9047794569357415, + "grad_norm": 1.1768448167602896, + "learning_rate": 8.210073111229199e-07, + "loss": 0.589, + "step": 7364 + }, + { + "epoch": 0.9049023221525986, + "grad_norm": 1.1846838932122357, + "learning_rate": 8.189094293399163e-07, + "loss": 0.64, + "step": 7365 + }, + { + "epoch": 0.9050251873694557, + "grad_norm": 1.1152546577216471, + "learning_rate": 8.168141560765496e-07, + "loss": 0.5649, + "step": 7366 + }, + { + "epoch": 0.9051480525863128, + "grad_norm": 1.5172007134796563, + "learning_rate": 8.147214917182433e-07, + "loss": 0.5096, + "step": 7367 + }, + { + "epoch": 0.9052709178031699, + "grad_norm": 1.2844129554398886, + "learning_rate": 8.12631436649921e-07, + "loss": 0.5714, + "step": 7368 + }, + { + "epoch": 0.905393783020027, + "grad_norm": 1.1438575981546617, + "learning_rate": 8.105439912560403e-07, + "loss": 0.538, + "step": 7369 + }, + { + "epoch": 0.9055166482368842, + "grad_norm": 1.1421000845316795, + "learning_rate": 8.08459155920569e-07, + "loss": 0.5624, + "step": 7370 + }, + { + "epoch": 0.9056395134537413, + "grad_norm": 1.0220736723673371, + "learning_rate": 8.063769310270003e-07, + "loss": 0.5493, + "step": 7371 + }, + { + "epoch": 0.9057623786705984, + "grad_norm": 1.3069090035841764, + "learning_rate": 8.042973169583479e-07, + "loss": 0.5944, + "step": 7372 + }, + { + "epoch": 0.9058852438874555, + "grad_norm": 1.1936668315898897, + "learning_rate": 8.022203140971373e-07, + "loss": 0.5985, + "step": 7373 + }, + { + "epoch": 0.9060081091043126, + "grad_norm": 1.1900978889137754, + "learning_rate": 8.001459228254282e-07, + "loss": 0.6878, + "step": 7374 + }, + { + "epoch": 0.9061309743211696, + "grad_norm": 1.0584104218020853, + "learning_rate": 7.980741435247851e-07, + "loss": 0.5468, + "step": 7375 + }, + { + "epoch": 0.9062538395380267, + "grad_norm": 1.1068304765832104, + "learning_rate": 7.960049765763034e-07, + "loss": 0.467, + "step": 7376 + }, + { + "epoch": 0.9063767047548839, + "grad_norm": 1.2160362334128485, + "learning_rate": 7.939384223605867e-07, + "loss": 0.6291, + "step": 7377 + }, + { + "epoch": 0.906499569971741, + "grad_norm": 1.3391390087414707, + "learning_rate": 7.918744812577694e-07, + "loss": 0.5582, + "step": 7378 + }, + { + "epoch": 0.9066224351885981, + "grad_norm": 1.4335144490085237, + "learning_rate": 7.898131536474995e-07, + "loss": 0.5949, + "step": 7379 + }, + { + "epoch": 0.9067453004054552, + "grad_norm": 1.2569774052182772, + "learning_rate": 7.877544399089421e-07, + "loss": 0.5164, + "step": 7380 + }, + { + "epoch": 0.9068681656223123, + "grad_norm": 1.0466711443253034, + "learning_rate": 7.856983404207857e-07, + "loss": 0.6433, + "step": 7381 + }, + { + "epoch": 0.9069910308391694, + "grad_norm": 1.2874468393120422, + "learning_rate": 7.836448555612363e-07, + "loss": 0.5349, + "step": 7382 + }, + { + "epoch": 0.9071138960560265, + "grad_norm": 1.130931359387852, + "learning_rate": 7.815939857080218e-07, + "loss": 0.5717, + "step": 7383 + }, + { + "epoch": 0.9072367612728837, + "grad_norm": 
1.0681144242018192, + "learning_rate": 7.79545731238382e-07, + "loss": 0.521, + "step": 7384 + }, + { + "epoch": 0.9073596264897408, + "grad_norm": 1.2824329976563837, + "learning_rate": 7.775000925290804e-07, + "loss": 0.5622, + "step": 7385 + }, + { + "epoch": 0.9074824917065979, + "grad_norm": 1.322568589583127, + "learning_rate": 7.754570699564028e-07, + "loss": 0.5711, + "step": 7386 + }, + { + "epoch": 0.907605356923455, + "grad_norm": 1.1435557044690343, + "learning_rate": 7.734166638961488e-07, + "loss": 0.5797, + "step": 7387 + }, + { + "epoch": 0.9077282221403121, + "grad_norm": 1.1716282413759616, + "learning_rate": 7.713788747236361e-07, + "loss": 0.6918, + "step": 7388 + }, + { + "epoch": 0.9078510873571692, + "grad_norm": 1.443430999644405, + "learning_rate": 7.693437028137018e-07, + "loss": 0.6288, + "step": 7389 + }, + { + "epoch": 0.9079739525740262, + "grad_norm": 1.2514390309732086, + "learning_rate": 7.673111485407064e-07, + "loss": 0.48, + "step": 7390 + }, + { + "epoch": 0.9080968177908834, + "grad_norm": 1.1980042334038932, + "learning_rate": 7.652812122785225e-07, + "loss": 0.6855, + "step": 7391 + }, + { + "epoch": 0.9082196830077405, + "grad_norm": 1.0763674460246806, + "learning_rate": 7.632538944005429e-07, + "loss": 0.5028, + "step": 7392 + }, + { + "epoch": 0.9083425482245976, + "grad_norm": 1.335989954133638, + "learning_rate": 7.612291952796813e-07, + "loss": 0.6955, + "step": 7393 + }, + { + "epoch": 0.9084654134414547, + "grad_norm": 1.3457346861380066, + "learning_rate": 7.592071152883695e-07, + "loss": 0.5585, + "step": 7394 + }, + { + "epoch": 0.9085882786583118, + "grad_norm": 1.210934494019051, + "learning_rate": 7.571876547985518e-07, + "loss": 0.6442, + "step": 7395 + }, + { + "epoch": 0.9087111438751689, + "grad_norm": 1.3260774194774971, + "learning_rate": 7.551708141816977e-07, + "loss": 0.6158, + "step": 7396 + }, + { + "epoch": 0.908834009092026, + "grad_norm": 1.5504574664796285, + "learning_rate": 7.531565938087937e-07, + "loss": 0.6007, + "step": 7397 + }, + { + "epoch": 0.9089568743088832, + "grad_norm": 1.168548637911704, + "learning_rate": 7.511449940503368e-07, + "loss": 0.6667, + "step": 7398 + }, + { + "epoch": 0.9090797395257403, + "grad_norm": 1.3570701861075336, + "learning_rate": 7.491360152763543e-07, + "loss": 0.5903, + "step": 7399 + }, + { + "epoch": 0.9092026047425974, + "grad_norm": 1.1662153962610542, + "learning_rate": 7.471296578563774e-07, + "loss": 0.5529, + "step": 7400 + }, + { + "epoch": 0.9093254699594545, + "grad_norm": 1.4529866989946973, + "learning_rate": 7.451259221594709e-07, + "loss": 0.5953, + "step": 7401 + }, + { + "epoch": 0.9094483351763116, + "grad_norm": 1.1984694828742244, + "learning_rate": 7.431248085542031e-07, + "loss": 0.5064, + "step": 7402 + }, + { + "epoch": 0.9095712003931687, + "grad_norm": 1.0743844133921867, + "learning_rate": 7.411263174086696e-07, + "loss": 0.5563, + "step": 7403 + }, + { + "epoch": 0.9096940656100257, + "grad_norm": 1.1100491552887946, + "learning_rate": 7.391304490904732e-07, + "loss": 0.616, + "step": 7404 + }, + { + "epoch": 0.9098169308268829, + "grad_norm": 1.133687946572893, + "learning_rate": 7.371372039667518e-07, + "loss": 0.562, + "step": 7405 + }, + { + "epoch": 0.90993979604374, + "grad_norm": 1.259498965925745, + "learning_rate": 7.351465824041403e-07, + "loss": 0.5851, + "step": 7406 + }, + { + "epoch": 0.9100626612605971, + "grad_norm": 1.4424995433308245, + "learning_rate": 7.33158584768806e-07, + "loss": 0.449, + "step": 7407 + }, + { + "epoch": 
0.9101855264774542, + "grad_norm": 1.2259176342336795, + "learning_rate": 7.311732114264247e-07, + "loss": 0.581, + "step": 7408 + }, + { + "epoch": 0.9103083916943113, + "grad_norm": 0.932056455163265, + "learning_rate": 7.291904627421942e-07, + "loss": 0.6239, + "step": 7409 + }, + { + "epoch": 0.9104312569111684, + "grad_norm": 1.1314021822769924, + "learning_rate": 7.27210339080831e-07, + "loss": 0.5526, + "step": 7410 + }, + { + "epoch": 0.9105541221280256, + "grad_norm": 1.2417622887016955, + "learning_rate": 7.252328408065606e-07, + "loss": 0.5078, + "step": 7411 + }, + { + "epoch": 0.9106769873448827, + "grad_norm": 0.8763889908073847, + "learning_rate": 7.232579682831353e-07, + "loss": 0.6007, + "step": 7412 + }, + { + "epoch": 0.9107998525617398, + "grad_norm": 1.1878848317599615, + "learning_rate": 7.212857218738178e-07, + "loss": 0.6352, + "step": 7413 + }, + { + "epoch": 0.9109227177785969, + "grad_norm": 1.0513080412866922, + "learning_rate": 7.193161019413946e-07, + "loss": 0.5135, + "step": 7414 + }, + { + "epoch": 0.911045582995454, + "grad_norm": 1.0210567366246597, + "learning_rate": 7.173491088481576e-07, + "loss": 0.6163, + "step": 7415 + }, + { + "epoch": 0.9111684482123111, + "grad_norm": 1.1484762220145306, + "learning_rate": 7.153847429559257e-07, + "loss": 0.487, + "step": 7416 + }, + { + "epoch": 0.9112913134291682, + "grad_norm": 0.9637544302062211, + "learning_rate": 7.134230046260348e-07, + "loss": 0.5994, + "step": 7417 + }, + { + "epoch": 0.9114141786460254, + "grad_norm": 0.911363192675256, + "learning_rate": 7.114638942193264e-07, + "loss": 0.5987, + "step": 7418 + }, + { + "epoch": 0.9115370438628824, + "grad_norm": 1.2626822511862728, + "learning_rate": 7.09507412096172e-07, + "loss": 0.5183, + "step": 7419 + }, + { + "epoch": 0.9116599090797395, + "grad_norm": 1.091505616924391, + "learning_rate": 7.075535586164506e-07, + "loss": 0.5655, + "step": 7420 + }, + { + "epoch": 0.9117827742965966, + "grad_norm": 1.3134796842163683, + "learning_rate": 7.056023341395662e-07, + "loss": 0.5433, + "step": 7421 + }, + { + "epoch": 0.9119056395134537, + "grad_norm": 1.3497897486190136, + "learning_rate": 7.036537390244269e-07, + "loss": 0.4822, + "step": 7422 + }, + { + "epoch": 0.9120285047303108, + "grad_norm": 0.9605286262418606, + "learning_rate": 7.017077736294675e-07, + "loss": 0.5865, + "step": 7423 + }, + { + "epoch": 0.912151369947168, + "grad_norm": 1.1590328483404775, + "learning_rate": 6.997644383126367e-07, + "loss": 0.5538, + "step": 7424 + }, + { + "epoch": 0.9122742351640251, + "grad_norm": 1.0625572858472507, + "learning_rate": 6.978237334313953e-07, + "loss": 0.4958, + "step": 7425 + }, + { + "epoch": 0.9123971003808822, + "grad_norm": 1.447381315786846, + "learning_rate": 6.958856593427277e-07, + "loss": 0.5877, + "step": 7426 + }, + { + "epoch": 0.9125199655977393, + "grad_norm": 1.4496618084437536, + "learning_rate": 6.939502164031236e-07, + "loss": 0.5157, + "step": 7427 + }, + { + "epoch": 0.9126428308145964, + "grad_norm": 1.3249894906173498, + "learning_rate": 6.920174049686035e-07, + "loss": 0.6976, + "step": 7428 + }, + { + "epoch": 0.9127656960314535, + "grad_norm": 1.2130497230643087, + "learning_rate": 6.900872253946894e-07, + "loss": 0.5589, + "step": 7429 + }, + { + "epoch": 0.9128885612483106, + "grad_norm": 1.0124603046939584, + "learning_rate": 6.881596780364291e-07, + "loss": 0.6136, + "step": 7430 + }, + { + "epoch": 0.9130114264651678, + "grad_norm": 1.1338916073579977, + "learning_rate": 6.862347632483757e-07, + "loss": 0.584, 
+ "step": 7431 + }, + { + "epoch": 0.9131342916820249, + "grad_norm": 1.2753367901217667, + "learning_rate": 6.843124813846141e-07, + "loss": 0.6135, + "step": 7432 + }, + { + "epoch": 0.9132571568988819, + "grad_norm": 1.0232245277966712, + "learning_rate": 6.823928327987283e-07, + "loss": 0.5746, + "step": 7433 + }, + { + "epoch": 0.913380022115739, + "grad_norm": 1.3835179976566894, + "learning_rate": 6.804758178438309e-07, + "loss": 0.5775, + "step": 7434 + }, + { + "epoch": 0.9135028873325961, + "grad_norm": 1.5749504391869904, + "learning_rate": 6.785614368725396e-07, + "loss": 0.5293, + "step": 7435 + }, + { + "epoch": 0.9136257525494532, + "grad_norm": 1.0702776740253543, + "learning_rate": 6.766496902369929e-07, + "loss": 0.6318, + "step": 7436 + }, + { + "epoch": 0.9137486177663103, + "grad_norm": 1.343195289076973, + "learning_rate": 6.747405782888478e-07, + "loss": 0.6872, + "step": 7437 + }, + { + "epoch": 0.9138714829831674, + "grad_norm": 1.5901252859425012, + "learning_rate": 6.728341013792683e-07, + "loss": 0.6265, + "step": 7438 + }, + { + "epoch": 0.9139943482000246, + "grad_norm": 1.2769696623342857, + "learning_rate": 6.70930259858944e-07, + "loss": 0.5585, + "step": 7439 + }, + { + "epoch": 0.9141172134168817, + "grad_norm": 1.0346857981971591, + "learning_rate": 6.690290540780681e-07, + "loss": 0.4981, + "step": 7440 + }, + { + "epoch": 0.9142400786337388, + "grad_norm": 1.577875695225468, + "learning_rate": 6.671304843863607e-07, + "loss": 0.5249, + "step": 7441 + }, + { + "epoch": 0.9143629438505959, + "grad_norm": 1.1717119301262469, + "learning_rate": 6.652345511330477e-07, + "loss": 0.5746, + "step": 7442 + }, + { + "epoch": 0.914485809067453, + "grad_norm": 1.259663222502918, + "learning_rate": 6.633412546668733e-07, + "loss": 0.5935, + "step": 7443 + }, + { + "epoch": 0.9146086742843101, + "grad_norm": 1.1896270195180563, + "learning_rate": 6.614505953361022e-07, + "loss": 0.6427, + "step": 7444 + }, + { + "epoch": 0.9147315395011673, + "grad_norm": 1.4747479581535414, + "learning_rate": 6.59562573488503e-07, + "loss": 0.5379, + "step": 7445 + }, + { + "epoch": 0.9148544047180244, + "grad_norm": 1.2407436136763283, + "learning_rate": 6.576771894713662e-07, + "loss": 0.4644, + "step": 7446 + }, + { + "epoch": 0.9149772699348815, + "grad_norm": 1.2389790380211883, + "learning_rate": 6.557944436314978e-07, + "loss": 0.6148, + "step": 7447 + }, + { + "epoch": 0.9151001351517385, + "grad_norm": 1.2790231923389215, + "learning_rate": 6.539143363152189e-07, + "loss": 0.4957, + "step": 7448 + }, + { + "epoch": 0.9152230003685956, + "grad_norm": 1.3036884351212124, + "learning_rate": 6.52036867868358e-07, + "loss": 0.6464, + "step": 7449 + }, + { + "epoch": 0.9153458655854527, + "grad_norm": 1.1726342912816294, + "learning_rate": 6.501620386362639e-07, + "loss": 0.6104, + "step": 7450 + }, + { + "epoch": 0.9154687308023098, + "grad_norm": 1.2542563969747738, + "learning_rate": 6.482898489638023e-07, + "loss": 0.5362, + "step": 7451 + }, + { + "epoch": 0.915591596019167, + "grad_norm": 1.2517222400089432, + "learning_rate": 6.46420299195351e-07, + "loss": 0.5627, + "step": 7452 + }, + { + "epoch": 0.9157144612360241, + "grad_norm": 1.3820837620006212, + "learning_rate": 6.445533896747968e-07, + "loss": 0.5867, + "step": 7453 + }, + { + "epoch": 0.9158373264528812, + "grad_norm": 1.1436820627007673, + "learning_rate": 6.426891207455482e-07, + "loss": 0.5594, + "step": 7454 + }, + { + "epoch": 0.9159601916697383, + "grad_norm": 1.1410066060655573, + "learning_rate": 
6.408274927505276e-07, + "loss": 0.5233, + "step": 7455 + }, + { + "epoch": 0.9160830568865954, + "grad_norm": 1.2766399568650266, + "learning_rate": 6.389685060321643e-07, + "loss": 0.5892, + "step": 7456 + }, + { + "epoch": 0.9162059221034525, + "grad_norm": 1.2760460261313693, + "learning_rate": 6.371121609324115e-07, + "loss": 0.5807, + "step": 7457 + }, + { + "epoch": 0.9163287873203096, + "grad_norm": 1.0724016075591136, + "learning_rate": 6.352584577927278e-07, + "loss": 0.5554, + "step": 7458 + }, + { + "epoch": 0.9164516525371668, + "grad_norm": 1.6082017114980387, + "learning_rate": 6.334073969540955e-07, + "loss": 0.6497, + "step": 7459 + }, + { + "epoch": 0.9165745177540239, + "grad_norm": 1.0609278483348399, + "learning_rate": 6.315589787570003e-07, + "loss": 0.5465, + "step": 7460 + }, + { + "epoch": 0.916697382970881, + "grad_norm": 0.93904670962972, + "learning_rate": 6.297132035414488e-07, + "loss": 0.4481, + "step": 7461 + }, + { + "epoch": 0.916820248187738, + "grad_norm": 1.408359262394457, + "learning_rate": 6.278700716469593e-07, + "loss": 0.5833, + "step": 7462 + }, + { + "epoch": 0.9169431134045951, + "grad_norm": 1.1483995005694225, + "learning_rate": 6.260295834125623e-07, + "loss": 0.5451, + "step": 7463 + }, + { + "epoch": 0.9170659786214522, + "grad_norm": 1.2900639855330667, + "learning_rate": 6.241917391768071e-07, + "loss": 0.5817, + "step": 7464 + }, + { + "epoch": 0.9171888438383093, + "grad_norm": 1.3698286227708507, + "learning_rate": 6.223565392777481e-07, + "loss": 0.5667, + "step": 7465 + }, + { + "epoch": 0.9173117090551665, + "grad_norm": 1.2035518861906918, + "learning_rate": 6.205239840529636e-07, + "loss": 0.5526, + "step": 7466 + }, + { + "epoch": 0.9174345742720236, + "grad_norm": 1.11184468495026, + "learning_rate": 6.186940738395374e-07, + "loss": 0.4558, + "step": 7467 + }, + { + "epoch": 0.9175574394888807, + "grad_norm": 1.0962230135689264, + "learning_rate": 6.16866808974072e-07, + "loss": 0.6505, + "step": 7468 + }, + { + "epoch": 0.9176803047057378, + "grad_norm": 1.2036613996283163, + "learning_rate": 6.15042189792675e-07, + "loss": 0.4791, + "step": 7469 + }, + { + "epoch": 0.9178031699225949, + "grad_norm": 1.25639424101414, + "learning_rate": 6.132202166309814e-07, + "loss": 0.5718, + "step": 7470 + }, + { + "epoch": 0.917926035139452, + "grad_norm": 1.310982914606062, + "learning_rate": 6.114008898241247e-07, + "loss": 0.4481, + "step": 7471 + }, + { + "epoch": 0.9180489003563091, + "grad_norm": 1.167494027471138, + "learning_rate": 6.095842097067639e-07, + "loss": 0.6046, + "step": 7472 + }, + { + "epoch": 0.9181717655731663, + "grad_norm": 1.3007452112309061, + "learning_rate": 6.0777017661306e-07, + "loss": 0.5384, + "step": 7473 + }, + { + "epoch": 0.9182946307900234, + "grad_norm": 0.9939433546945621, + "learning_rate": 6.059587908766962e-07, + "loss": 0.5196, + "step": 7474 + }, + { + "epoch": 0.9184174960068805, + "grad_norm": 1.3558916089791166, + "learning_rate": 6.041500528308641e-07, + "loss": 0.6181, + "step": 7475 + }, + { + "epoch": 0.9185403612237376, + "grad_norm": 1.3885850684455214, + "learning_rate": 6.023439628082694e-07, + "loss": 0.5659, + "step": 7476 + }, + { + "epoch": 0.9186632264405946, + "grad_norm": 1.202045329832611, + "learning_rate": 6.005405211411297e-07, + "loss": 0.562, + "step": 7477 + }, + { + "epoch": 0.9187860916574517, + "grad_norm": 1.3961785463104528, + "learning_rate": 5.987397281611779e-07, + "loss": 0.5833, + "step": 7478 + }, + { + "epoch": 0.9189089568743088, + "grad_norm": 
1.1271943880325428, + "learning_rate": 5.969415841996606e-07, + "loss": 0.7027, + "step": 7479 + }, + { + "epoch": 0.919031822091166, + "grad_norm": 1.1850117223166388, + "learning_rate": 5.951460895873284e-07, + "loss": 0.6882, + "step": 7480 + }, + { + "epoch": 0.9191546873080231, + "grad_norm": 1.2151737689272932, + "learning_rate": 5.933532446544538e-07, + "loss": 0.5696, + "step": 7481 + }, + { + "epoch": 0.9192775525248802, + "grad_norm": 1.3673352867813822, + "learning_rate": 5.915630497308228e-07, + "loss": 0.5811, + "step": 7482 + }, + { + "epoch": 0.9194004177417373, + "grad_norm": 1.2209655345548363, + "learning_rate": 5.897755051457238e-07, + "loss": 0.6102, + "step": 7483 + }, + { + "epoch": 0.9195232829585944, + "grad_norm": 1.1830278097678935, + "learning_rate": 5.87990611227967e-07, + "loss": 0.5038, + "step": 7484 + }, + { + "epoch": 0.9196461481754515, + "grad_norm": 1.0917949291400777, + "learning_rate": 5.862083683058733e-07, + "loss": 0.6013, + "step": 7485 + }, + { + "epoch": 0.9197690133923087, + "grad_norm": 1.4034150073926124, + "learning_rate": 5.844287767072753e-07, + "loss": 0.6402, + "step": 7486 + }, + { + "epoch": 0.9198918786091658, + "grad_norm": 1.390301691857146, + "learning_rate": 5.82651836759513e-07, + "loss": 0.6217, + "step": 7487 + }, + { + "epoch": 0.9200147438260229, + "grad_norm": 1.1929088317719423, + "learning_rate": 5.808775487894447e-07, + "loss": 0.5444, + "step": 7488 + }, + { + "epoch": 0.92013760904288, + "grad_norm": 1.1917352715161051, + "learning_rate": 5.791059131234411e-07, + "loss": 0.5086, + "step": 7489 + }, + { + "epoch": 0.9202604742597371, + "grad_norm": 1.0217615820846444, + "learning_rate": 5.773369300873849e-07, + "loss": 0.6224, + "step": 7490 + }, + { + "epoch": 0.9203833394765942, + "grad_norm": 1.0548144474841965, + "learning_rate": 5.755706000066624e-07, + "loss": 0.441, + "step": 7491 + }, + { + "epoch": 0.9205062046934512, + "grad_norm": 1.1937192361223212, + "learning_rate": 5.738069232061837e-07, + "loss": 0.4559, + "step": 7492 + }, + { + "epoch": 0.9206290699103084, + "grad_norm": 1.4542029403397685, + "learning_rate": 5.720459000103644e-07, + "loss": 0.5869, + "step": 7493 + }, + { + "epoch": 0.9207519351271655, + "grad_norm": 1.081297535989145, + "learning_rate": 5.702875307431321e-07, + "loss": 0.6081, + "step": 7494 + }, + { + "epoch": 0.9208748003440226, + "grad_norm": 1.226044340282077, + "learning_rate": 5.685318157279313e-07, + "loss": 0.582, + "step": 7495 + }, + { + "epoch": 0.9209976655608797, + "grad_norm": 1.3295515232667456, + "learning_rate": 5.667787552877085e-07, + "loss": 0.5153, + "step": 7496 + }, + { + "epoch": 0.9211205307777368, + "grad_norm": 1.4400019687916938, + "learning_rate": 5.650283497449327e-07, + "loss": 0.5952, + "step": 7497 + }, + { + "epoch": 0.9212433959945939, + "grad_norm": 1.2332996193111736, + "learning_rate": 5.632805994215761e-07, + "loss": 0.5507, + "step": 7498 + }, + { + "epoch": 0.921366261211451, + "grad_norm": 1.2189498384144462, + "learning_rate": 5.615355046391302e-07, + "loss": 0.6372, + "step": 7499 + }, + { + "epoch": 0.9214891264283082, + "grad_norm": 1.0361304598818593, + "learning_rate": 5.597930657185913e-07, + "loss": 0.5814, + "step": 7500 + }, + { + "epoch": 0.9216119916451653, + "grad_norm": 1.0852873295876413, + "learning_rate": 5.58053282980468e-07, + "loss": 0.5401, + "step": 7501 + }, + { + "epoch": 0.9217348568620224, + "grad_norm": 1.0801300288331062, + "learning_rate": 5.56316156744786e-07, + "loss": 0.4789, + "step": 7502 + }, + { + "epoch": 
0.9218577220788795, + "grad_norm": 1.3732560514619276, + "learning_rate": 5.545816873310733e-07, + "loss": 0.5444, + "step": 7503 + }, + { + "epoch": 0.9219805872957366, + "grad_norm": 1.3856717508322618, + "learning_rate": 5.52849875058381e-07, + "loss": 0.5887, + "step": 7504 + }, + { + "epoch": 0.9221034525125937, + "grad_norm": 1.2263662540648121, + "learning_rate": 5.511207202452595e-07, + "loss": 0.628, + "step": 7505 + }, + { + "epoch": 0.9222263177294507, + "grad_norm": 1.0518584401625717, + "learning_rate": 5.493942232097792e-07, + "loss": 0.5888, + "step": 7506 + }, + { + "epoch": 0.9223491829463079, + "grad_norm": 1.7495591596430669, + "learning_rate": 5.476703842695114e-07, + "loss": 0.5861, + "step": 7507 + }, + { + "epoch": 0.922472048163165, + "grad_norm": 1.156540432758227, + "learning_rate": 5.459492037415536e-07, + "loss": 0.5434, + "step": 7508 + }, + { + "epoch": 0.9225949133800221, + "grad_norm": 1.189786024401316, + "learning_rate": 5.442306819425013e-07, + "loss": 0.4787, + "step": 7509 + }, + { + "epoch": 0.9227177785968792, + "grad_norm": 1.2193426443347493, + "learning_rate": 5.425148191884666e-07, + "loss": 0.5717, + "step": 7510 + }, + { + "epoch": 0.9228406438137363, + "grad_norm": 1.5793656319093057, + "learning_rate": 5.408016157950701e-07, + "loss": 0.6063, + "step": 7511 + }, + { + "epoch": 0.9229635090305934, + "grad_norm": 1.0709744819984381, + "learning_rate": 5.390910720774433e-07, + "loss": 0.6011, + "step": 7512 + }, + { + "epoch": 0.9230863742474505, + "grad_norm": 1.1075993088169265, + "learning_rate": 5.373831883502345e-07, + "loss": 0.4356, + "step": 7513 + }, + { + "epoch": 0.9232092394643077, + "grad_norm": 1.1091177190936585, + "learning_rate": 5.35677964927594e-07, + "loss": 0.5629, + "step": 7514 + }, + { + "epoch": 0.9233321046811648, + "grad_norm": 1.3189165668150655, + "learning_rate": 5.339754021231857e-07, + "loss": 0.4588, + "step": 7515 + }, + { + "epoch": 0.9234549698980219, + "grad_norm": 1.5232790316092173, + "learning_rate": 5.322755002501878e-07, + "loss": 0.624, + "step": 7516 + }, + { + "epoch": 0.923577835114879, + "grad_norm": 1.1302665328761694, + "learning_rate": 5.305782596212866e-07, + "loss": 0.5907, + "step": 7517 + }, + { + "epoch": 0.9237007003317361, + "grad_norm": 1.4993851031560625, + "learning_rate": 5.288836805486758e-07, + "loss": 0.6651, + "step": 7518 + }, + { + "epoch": 0.9238235655485932, + "grad_norm": 1.2343386248515626, + "learning_rate": 5.271917633440627e-07, + "loss": 0.4837, + "step": 7519 + }, + { + "epoch": 0.9239464307654504, + "grad_norm": 1.1716248760448322, + "learning_rate": 5.255025083186682e-07, + "loss": 0.5207, + "step": 7520 + }, + { + "epoch": 0.9240692959823074, + "grad_norm": 1.324219699627738, + "learning_rate": 5.23815915783214e-07, + "loss": 0.5705, + "step": 7521 + }, + { + "epoch": 0.9241921611991645, + "grad_norm": 1.1354336882013718, + "learning_rate": 5.221319860479401e-07, + "loss": 0.5854, + "step": 7522 + }, + { + "epoch": 0.9243150264160216, + "grad_norm": 1.1101797017029, + "learning_rate": 5.204507194225971e-07, + "loss": 0.5829, + "step": 7523 + }, + { + "epoch": 0.9244378916328787, + "grad_norm": 1.2139709708707587, + "learning_rate": 5.18772116216441e-07, + "loss": 0.5703, + "step": 7524 + }, + { + "epoch": 0.9245607568497358, + "grad_norm": 1.4039291714936641, + "learning_rate": 5.170961767382398e-07, + "loss": 0.575, + "step": 7525 + }, + { + "epoch": 0.9246836220665929, + "grad_norm": 1.1650057925415465, + "learning_rate": 5.154229012962702e-07, + "loss": 0.5524, + 
"step": 7526 + }, + { + "epoch": 0.92480648728345, + "grad_norm": 1.231152512181629, + "learning_rate": 5.137522901983244e-07, + "loss": 0.5529, + "step": 7527 + }, + { + "epoch": 0.9249293525003072, + "grad_norm": 1.1810067195750897, + "learning_rate": 5.120843437516981e-07, + "loss": 0.5136, + "step": 7528 + }, + { + "epoch": 0.9250522177171643, + "grad_norm": 1.0629762695212648, + "learning_rate": 5.104190622631977e-07, + "loss": 0.6027, + "step": 7529 + }, + { + "epoch": 0.9251750829340214, + "grad_norm": 1.4407960363326655, + "learning_rate": 5.087564460391431e-07, + "loss": 0.5281, + "step": 7530 + }, + { + "epoch": 0.9252979481508785, + "grad_norm": 1.2528916107707408, + "learning_rate": 5.070964953853629e-07, + "loss": 0.4544, + "step": 7531 + }, + { + "epoch": 0.9254208133677356, + "grad_norm": 1.165741975834856, + "learning_rate": 5.054392106071914e-07, + "loss": 0.5719, + "step": 7532 + }, + { + "epoch": 0.9255436785845927, + "grad_norm": 1.1799612416634437, + "learning_rate": 5.03784592009478e-07, + "loss": 0.5331, + "step": 7533 + }, + { + "epoch": 0.9256665438014499, + "grad_norm": 1.1838636265310092, + "learning_rate": 5.021326398965742e-07, + "loss": 0.6522, + "step": 7534 + }, + { + "epoch": 0.9257894090183069, + "grad_norm": 1.2072869203863958, + "learning_rate": 5.004833545723519e-07, + "loss": 0.6155, + "step": 7535 + }, + { + "epoch": 0.925912274235164, + "grad_norm": 1.07204280236886, + "learning_rate": 4.988367363401835e-07, + "loss": 0.637, + "step": 7536 + }, + { + "epoch": 0.9260351394520211, + "grad_norm": 1.1964336067246337, + "learning_rate": 4.971927855029551e-07, + "loss": 0.603, + "step": 7537 + }, + { + "epoch": 0.9261580046688782, + "grad_norm": 1.2739797270326663, + "learning_rate": 4.95551502363058e-07, + "loss": 0.6303, + "step": 7538 + }, + { + "epoch": 0.9262808698857353, + "grad_norm": 1.2058023645172338, + "learning_rate": 4.939128872223975e-07, + "loss": 0.7118, + "step": 7539 + }, + { + "epoch": 0.9264037351025924, + "grad_norm": 1.1916738175124733, + "learning_rate": 4.922769403823873e-07, + "loss": 0.4985, + "step": 7540 + }, + { + "epoch": 0.9265266003194496, + "grad_norm": 1.1716803368272342, + "learning_rate": 4.90643662143947e-07, + "loss": 0.5147, + "step": 7541 + }, + { + "epoch": 0.9266494655363067, + "grad_norm": 1.0637762926681107, + "learning_rate": 4.890130528075093e-07, + "loss": 0.4848, + "step": 7542 + }, + { + "epoch": 0.9267723307531638, + "grad_norm": 1.4779562506218884, + "learning_rate": 4.873851126730128e-07, + "loss": 0.6367, + "step": 7543 + }, + { + "epoch": 0.9268951959700209, + "grad_norm": 1.1196969609122163, + "learning_rate": 4.857598420399078e-07, + "loss": 0.6352, + "step": 7544 + }, + { + "epoch": 0.927018061186878, + "grad_norm": 1.2044920220169406, + "learning_rate": 4.841372412071504e-07, + "loss": 0.5491, + "step": 7545 + }, + { + "epoch": 0.9271409264037351, + "grad_norm": 1.2797697316159495, + "learning_rate": 4.8251731047321e-07, + "loss": 0.6921, + "step": 7546 + }, + { + "epoch": 0.9272637916205922, + "grad_norm": 1.1824413981666217, + "learning_rate": 4.809000501360616e-07, + "loss": 0.5611, + "step": 7547 + }, + { + "epoch": 0.9273866568374494, + "grad_norm": 1.3007679155829126, + "learning_rate": 4.79285460493189e-07, + "loss": 0.53, + "step": 7548 + }, + { + "epoch": 0.9275095220543065, + "grad_norm": 1.0714754860445481, + "learning_rate": 4.776735418415846e-07, + "loss": 0.6056, + "step": 7549 + }, + { + "epoch": 0.9276323872711635, + "grad_norm": 1.157306503532158, + "learning_rate": 
4.760642944777527e-07, + "loss": 0.666, + "step": 7550 + }, + { + "epoch": 0.9277552524880206, + "grad_norm": 1.134809838494938, + "learning_rate": 4.744577186977034e-07, + "loss": 0.5167, + "step": 7551 + }, + { + "epoch": 0.9278781177048777, + "grad_norm": 1.2735767777700502, + "learning_rate": 4.728538147969536e-07, + "loss": 0.4674, + "step": 7552 + }, + { + "epoch": 0.9280009829217348, + "grad_norm": 0.9252164863915822, + "learning_rate": 4.7125258307053385e-07, + "loss": 0.543, + "step": 7553 + }, + { + "epoch": 0.928123848138592, + "grad_norm": 1.0115628926180718, + "learning_rate": 4.6965402381297874e-07, + "loss": 0.4964, + "step": 7554 + }, + { + "epoch": 0.9282467133554491, + "grad_norm": 1.0329618494395612, + "learning_rate": 4.6805813731833456e-07, + "loss": 0.5304, + "step": 7555 + }, + { + "epoch": 0.9283695785723062, + "grad_norm": 1.4872740729775655, + "learning_rate": 4.664649238801516e-07, + "loss": 0.6603, + "step": 7556 + }, + { + "epoch": 0.9284924437891633, + "grad_norm": 1.1090426522164871, + "learning_rate": 4.6487438379149207e-07, + "loss": 0.5436, + "step": 7557 + }, + { + "epoch": 0.9286153090060204, + "grad_norm": 1.2503771432251478, + "learning_rate": 4.632865173449285e-07, + "loss": 0.6682, + "step": 7558 + }, + { + "epoch": 0.9287381742228775, + "grad_norm": 1.7444897759105944, + "learning_rate": 4.617013248325341e-07, + "loss": 0.5431, + "step": 7559 + }, + { + "epoch": 0.9288610394397346, + "grad_norm": 1.3451354403595412, + "learning_rate": 4.601188065458989e-07, + "loss": 0.5671, + "step": 7560 + }, + { + "epoch": 0.9289839046565918, + "grad_norm": 1.2233176594359372, + "learning_rate": 4.5853896277610995e-07, + "loss": 0.5341, + "step": 7561 + }, + { + "epoch": 0.9291067698734489, + "grad_norm": 1.1566304834793975, + "learning_rate": 4.569617938137799e-07, + "loss": 0.6052, + "step": 7562 + }, + { + "epoch": 0.929229635090306, + "grad_norm": 1.1324201132903031, + "learning_rate": 4.5538729994900994e-07, + "loss": 0.6098, + "step": 7563 + }, + { + "epoch": 0.929352500307163, + "grad_norm": 1.4421768446391816, + "learning_rate": 4.5381548147142015e-07, + "loss": 0.631, + "step": 7564 + }, + { + "epoch": 0.9294753655240201, + "grad_norm": 1.0246141307413499, + "learning_rate": 4.5224633867014086e-07, + "loss": 0.5575, + "step": 7565 + }, + { + "epoch": 0.9295982307408772, + "grad_norm": 1.3849306067603793, + "learning_rate": 4.5067987183379956e-07, + "loss": 0.5684, + "step": 7566 + }, + { + "epoch": 0.9297210959577343, + "grad_norm": 1.0946811309193745, + "learning_rate": 4.491160812505407e-07, + "loss": 0.5664, + "step": 7567 + }, + { + "epoch": 0.9298439611745914, + "grad_norm": 1.1926903813153242, + "learning_rate": 4.4755496720801094e-07, + "loss": 0.6269, + "step": 7568 + }, + { + "epoch": 0.9299668263914486, + "grad_norm": 1.2744799248873335, + "learning_rate": 4.4599652999337213e-07, + "loss": 0.612, + "step": 7569 + }, + { + "epoch": 0.9300896916083057, + "grad_norm": 1.22310957661172, + "learning_rate": 4.444407698932834e-07, + "loss": 0.5787, + "step": 7570 + }, + { + "epoch": 0.9302125568251628, + "grad_norm": 1.365443305761694, + "learning_rate": 4.428876871939208e-07, + "loss": 0.5019, + "step": 7571 + }, + { + "epoch": 0.9303354220420199, + "grad_norm": 1.4289757741233113, + "learning_rate": 4.4133728218095916e-07, + "loss": 0.6216, + "step": 7572 + }, + { + "epoch": 0.930458287258877, + "grad_norm": 1.1794713815549989, + "learning_rate": 4.3978955513959195e-07, + "loss": 0.5506, + "step": 7573 + }, + { + "epoch": 0.9305811524757341, + 
"grad_norm": 1.2651670730273994, + "learning_rate": 4.382445063545065e-07, + "loss": 0.5726, + "step": 7574 + }, + { + "epoch": 0.9307040176925913, + "grad_norm": 1.0614558040498419, + "learning_rate": 4.367021361099105e-07, + "loss": 0.4429, + "step": 7575 + }, + { + "epoch": 0.9308268829094484, + "grad_norm": 1.3182950029562455, + "learning_rate": 4.351624446895086e-07, + "loss": 0.6701, + "step": 7576 + }, + { + "epoch": 0.9309497481263055, + "grad_norm": 1.2086058691459074, + "learning_rate": 4.336254323765193e-07, + "loss": 0.4612, + "step": 7577 + }, + { + "epoch": 0.9310726133431626, + "grad_norm": 0.9901983450403038, + "learning_rate": 4.320910994536664e-07, + "loss": 0.5715, + "step": 7578 + }, + { + "epoch": 0.9311954785600196, + "grad_norm": 1.2162198472550705, + "learning_rate": 4.3055944620317754e-07, + "loss": 0.505, + "step": 7579 + }, + { + "epoch": 0.9313183437768767, + "grad_norm": 1.3017071392052155, + "learning_rate": 4.2903047290679233e-07, + "loss": 0.4979, + "step": 7580 + }, + { + "epoch": 0.9314412089937338, + "grad_norm": 1.0329859385959645, + "learning_rate": 4.2750417984575573e-07, + "loss": 0.6187, + "step": 7581 + }, + { + "epoch": 0.931564074210591, + "grad_norm": 1.2966734734007774, + "learning_rate": 4.259805673008216e-07, + "loss": 0.5345, + "step": 7582 + }, + { + "epoch": 0.9316869394274481, + "grad_norm": 1.4012156466964318, + "learning_rate": 4.2445963555224396e-07, + "loss": 0.6099, + "step": 7583 + }, + { + "epoch": 0.9318098046443052, + "grad_norm": 1.4260738848455499, + "learning_rate": 4.2294138487979083e-07, + "loss": 0.6309, + "step": 7584 + }, + { + "epoch": 0.9319326698611623, + "grad_norm": 1.380703309969176, + "learning_rate": 4.214258155627371e-07, + "loss": 0.5451, + "step": 7585 + }, + { + "epoch": 0.9320555350780194, + "grad_norm": 1.2804590354001912, + "learning_rate": 4.1991292787985636e-07, + "loss": 0.6175, + "step": 7586 + }, + { + "epoch": 0.9321784002948765, + "grad_norm": 1.0452421613208913, + "learning_rate": 4.1840272210943773e-07, + "loss": 0.5274, + "step": 7587 + }, + { + "epoch": 0.9323012655117336, + "grad_norm": 1.0481901119187438, + "learning_rate": 4.168951985292724e-07, + "loss": 0.6125, + "step": 7588 + }, + { + "epoch": 0.9324241307285908, + "grad_norm": 1.0999381458839763, + "learning_rate": 4.1539035741666344e-07, + "loss": 0.6385, + "step": 7589 + }, + { + "epoch": 0.9325469959454479, + "grad_norm": 1.2008521591209065, + "learning_rate": 4.1388819904841115e-07, + "loss": 0.6005, + "step": 7590 + }, + { + "epoch": 0.932669861162305, + "grad_norm": 1.0355107464483728, + "learning_rate": 4.123887237008311e-07, + "loss": 0.6406, + "step": 7591 + }, + { + "epoch": 0.9327927263791621, + "grad_norm": 1.1555470561010432, + "learning_rate": 4.1089193164974115e-07, + "loss": 0.6452, + "step": 7592 + }, + { + "epoch": 0.9329155915960192, + "grad_norm": 1.167440063786132, + "learning_rate": 4.0939782317046924e-07, + "loss": 0.4638, + "step": 7593 + }, + { + "epoch": 0.9330384568128762, + "grad_norm": 1.2503176472630044, + "learning_rate": 4.0790639853784227e-07, + "loss": 0.5974, + "step": 7594 + }, + { + "epoch": 0.9331613220297333, + "grad_norm": 1.3292695712140326, + "learning_rate": 4.0641765802619914e-07, + "loss": 0.5265, + "step": 7595 + }, + { + "epoch": 0.9332841872465905, + "grad_norm": 1.1973436337053034, + "learning_rate": 4.049316019093874e-07, + "loss": 0.4632, + "step": 7596 + }, + { + "epoch": 0.9334070524634476, + "grad_norm": 1.459983082533924, + "learning_rate": 4.0344823046075343e-07, + "loss": 0.5181, + 
"step": 7597 + }, + { + "epoch": 0.9335299176803047, + "grad_norm": 1.0989139699311392, + "learning_rate": 4.0196754395315726e-07, + "loss": 0.5176, + "step": 7598 + }, + { + "epoch": 0.9336527828971618, + "grad_norm": 1.2190286802492338, + "learning_rate": 4.0048954265895774e-07, + "loss": 0.566, + "step": 7599 + }, + { + "epoch": 0.9337756481140189, + "grad_norm": 1.2587262121285197, + "learning_rate": 3.990142268500274e-07, + "loss": 0.4629, + "step": 7600 + }, + { + "epoch": 0.933898513330876, + "grad_norm": 1.1185463414261367, + "learning_rate": 3.975415967977375e-07, + "loss": 0.5124, + "step": 7601 + }, + { + "epoch": 0.9340213785477331, + "grad_norm": 1.0913800550705328, + "learning_rate": 3.96071652772973e-07, + "loss": 0.6341, + "step": 7602 + }, + { + "epoch": 0.9341442437645903, + "grad_norm": 1.8892974397199394, + "learning_rate": 3.9460439504611587e-07, + "loss": 0.6697, + "step": 7603 + }, + { + "epoch": 0.9342671089814474, + "grad_norm": 1.557678586277179, + "learning_rate": 3.9313982388706206e-07, + "loss": 0.7368, + "step": 7604 + }, + { + "epoch": 0.9343899741983045, + "grad_norm": 1.239293640144346, + "learning_rate": 3.9167793956520927e-07, + "loss": 0.6569, + "step": 7605 + }, + { + "epoch": 0.9345128394151616, + "grad_norm": 1.1821572020037776, + "learning_rate": 3.902187423494591e-07, + "loss": 0.5139, + "step": 7606 + }, + { + "epoch": 0.9346357046320187, + "grad_norm": 1.1797996605664147, + "learning_rate": 3.8876223250822516e-07, + "loss": 0.5718, + "step": 7607 + }, + { + "epoch": 0.9347585698488757, + "grad_norm": 1.0455565428639864, + "learning_rate": 3.8730841030942155e-07, + "loss": 0.5012, + "step": 7608 + }, + { + "epoch": 0.9348814350657328, + "grad_norm": 1.1777442364006812, + "learning_rate": 3.858572760204693e-07, + "loss": 0.6181, + "step": 7609 + }, + { + "epoch": 0.93500430028259, + "grad_norm": 1.0631880575579615, + "learning_rate": 3.844088299082932e-07, + "loss": 0.5724, + "step": 7610 + }, + { + "epoch": 0.9351271654994471, + "grad_norm": 1.1681741945494173, + "learning_rate": 3.829630722393301e-07, + "loss": 0.6057, + "step": 7611 + }, + { + "epoch": 0.9352500307163042, + "grad_norm": 1.3375625974928524, + "learning_rate": 3.815200032795141e-07, + "loss": 0.6464, + "step": 7612 + }, + { + "epoch": 0.9353728959331613, + "grad_norm": 1.3490583168663888, + "learning_rate": 3.800796232942894e-07, + "loss": 0.6589, + "step": 7613 + }, + { + "epoch": 0.9354957611500184, + "grad_norm": 1.240808594629904, + "learning_rate": 3.78641932548604e-07, + "loss": 0.507, + "step": 7614 + }, + { + "epoch": 0.9356186263668755, + "grad_norm": 1.1593411380745904, + "learning_rate": 3.7720693130691155e-07, + "loss": 0.6044, + "step": 7615 + }, + { + "epoch": 0.9357414915837327, + "grad_norm": 1.0961455940063567, + "learning_rate": 3.7577461983317407e-07, + "loss": 0.5418, + "step": 7616 + }, + { + "epoch": 0.9358643568005898, + "grad_norm": 0.9177259773838493, + "learning_rate": 3.743449983908526e-07, + "loss": 0.5612, + "step": 7617 + }, + { + "epoch": 0.9359872220174469, + "grad_norm": 1.021622607526969, + "learning_rate": 3.7291806724291667e-07, + "loss": 0.5662, + "step": 7618 + }, + { + "epoch": 0.936110087234304, + "grad_norm": 1.1778253294059775, + "learning_rate": 3.7149382665184305e-07, + "loss": 0.5799, + "step": 7619 + }, + { + "epoch": 0.9362329524511611, + "grad_norm": 1.0887702499753598, + "learning_rate": 3.700722768796122e-07, + "loss": 0.5299, + "step": 7620 + }, + { + "epoch": 0.9363558176680182, + "grad_norm": 0.9996505715606064, + 
"learning_rate": 3.686534181877066e-07, + "loss": 0.5406, + "step": 7621 + }, + { + "epoch": 0.9364786828848753, + "grad_norm": 1.3593952345686473, + "learning_rate": 3.6723725083711745e-07, + "loss": 0.6371, + "step": 7622 + }, + { + "epoch": 0.9366015481017324, + "grad_norm": 1.2293722638301758, + "learning_rate": 3.658237750883398e-07, + "loss": 0.5092, + "step": 7623 + }, + { + "epoch": 0.9367244133185895, + "grad_norm": 1.1571976457327158, + "learning_rate": 3.644129912013705e-07, + "loss": 0.4482, + "step": 7624 + }, + { + "epoch": 0.9368472785354466, + "grad_norm": 1.3073514100536288, + "learning_rate": 3.630048994357188e-07, + "loss": 0.4755, + "step": 7625 + }, + { + "epoch": 0.9369701437523037, + "grad_norm": 1.4432515514108184, + "learning_rate": 3.615995000503891e-07, + "loss": 0.5116, + "step": 7626 + }, + { + "epoch": 0.9370930089691608, + "grad_norm": 1.018433170106163, + "learning_rate": 3.601967933039013e-07, + "loss": 0.562, + "step": 7627 + }, + { + "epoch": 0.9372158741860179, + "grad_norm": 1.2766977892408913, + "learning_rate": 3.5879677945426904e-07, + "loss": 0.6301, + "step": 7628 + }, + { + "epoch": 0.937338739402875, + "grad_norm": 1.1588956854590802, + "learning_rate": 3.573994587590163e-07, + "loss": 0.5534, + "step": 7629 + }, + { + "epoch": 0.9374616046197322, + "grad_norm": 1.0647464324773823, + "learning_rate": 3.5600483147517406e-07, + "loss": 0.5563, + "step": 7630 + }, + { + "epoch": 0.9375844698365893, + "grad_norm": 1.312398586186313, + "learning_rate": 3.5461289785927384e-07, + "loss": 0.7674, + "step": 7631 + }, + { + "epoch": 0.9377073350534464, + "grad_norm": 1.146344891638692, + "learning_rate": 3.532236581673526e-07, + "loss": 0.6437, + "step": 7632 + }, + { + "epoch": 0.9378302002703035, + "grad_norm": 0.917890302177743, + "learning_rate": 3.5183711265495077e-07, + "loss": 0.5234, + "step": 7633 + }, + { + "epoch": 0.9379530654871606, + "grad_norm": 1.0665241883988756, + "learning_rate": 3.504532615771161e-07, + "loss": 0.4835, + "step": 7634 + }, + { + "epoch": 0.9380759307040177, + "grad_norm": 1.4816109076733543, + "learning_rate": 3.490721051883966e-07, + "loss": 0.6297, + "step": 7635 + }, + { + "epoch": 0.9381987959208749, + "grad_norm": 1.3237109353149827, + "learning_rate": 3.476936437428524e-07, + "loss": 0.5734, + "step": 7636 + }, + { + "epoch": 0.9383216611377319, + "grad_norm": 1.1668981556701867, + "learning_rate": 3.46317877494034e-07, + "loss": 0.5591, + "step": 7637 + }, + { + "epoch": 0.938444526354589, + "grad_norm": 1.1057585314899907, + "learning_rate": 3.449448066950139e-07, + "loss": 0.5957, + "step": 7638 + }, + { + "epoch": 0.9385673915714461, + "grad_norm": 1.1397613496608534, + "learning_rate": 3.435744315983519e-07, + "loss": 0.7315, + "step": 7639 + }, + { + "epoch": 0.9386902567883032, + "grad_norm": 1.135656600408037, + "learning_rate": 3.422067524561262e-07, + "loss": 0.5943, + "step": 7640 + }, + { + "epoch": 0.9388131220051603, + "grad_norm": 1.2905850621865576, + "learning_rate": 3.408417695199073e-07, + "loss": 0.5918, + "step": 7641 + }, + { + "epoch": 0.9389359872220174, + "grad_norm": 1.5345375478775598, + "learning_rate": 3.39479483040776e-07, + "loss": 0.6256, + "step": 7642 + }, + { + "epoch": 0.9390588524388745, + "grad_norm": 1.1730211813462579, + "learning_rate": 3.3811989326932026e-07, + "loss": 0.5106, + "step": 7643 + }, + { + "epoch": 0.9391817176557317, + "grad_norm": 1.2883806074501003, + "learning_rate": 3.367630004556216e-07, + "loss": 0.5597, + "step": 7644 + }, + { + "epoch": 
0.9393045828725888, + "grad_norm": 1.2469242709651642, + "learning_rate": 3.354088048492754e-07, + "loss": 0.5826, + "step": 7645 + }, + { + "epoch": 0.9394274480894459, + "grad_norm": 1.5765814102391031, + "learning_rate": 3.340573066993757e-07, + "loss": 0.5022, + "step": 7646 + }, + { + "epoch": 0.939550313306303, + "grad_norm": 1.1390410728973255, + "learning_rate": 3.3270850625452377e-07, + "loss": 0.5567, + "step": 7647 + }, + { + "epoch": 0.9396731785231601, + "grad_norm": 1.1313677842451682, + "learning_rate": 3.3136240376281935e-07, + "loss": 0.5878, + "step": 7648 + }, + { + "epoch": 0.9397960437400172, + "grad_norm": 1.4379235763668676, + "learning_rate": 3.3001899947187275e-07, + "loss": 0.5683, + "step": 7649 + }, + { + "epoch": 0.9399189089568744, + "grad_norm": 1.3740783830783299, + "learning_rate": 3.28678293628793e-07, + "loss": 0.5808, + "step": 7650 + }, + { + "epoch": 0.9400417741737315, + "grad_norm": 1.0785353076026944, + "learning_rate": 3.273402864801944e-07, + "loss": 0.6104, + "step": 7651 + }, + { + "epoch": 0.9401646393905885, + "grad_norm": 1.053508305753356, + "learning_rate": 3.2600497827219524e-07, + "loss": 0.5497, + "step": 7652 + }, + { + "epoch": 0.9402875046074456, + "grad_norm": 1.4694993350700754, + "learning_rate": 3.246723692504139e-07, + "loss": 0.5211, + "step": 7653 + }, + { + "epoch": 0.9404103698243027, + "grad_norm": 1.5266780056444087, + "learning_rate": 3.2334245965997933e-07, + "loss": 0.5505, + "step": 7654 + }, + { + "epoch": 0.9405332350411598, + "grad_norm": 1.0348506383532476, + "learning_rate": 3.220152497455175e-07, + "loss": 0.5656, + "step": 7655 + }, + { + "epoch": 0.9406561002580169, + "grad_norm": 1.2284643503611357, + "learning_rate": 3.206907397511599e-07, + "loss": 0.5705, + "step": 7656 + }, + { + "epoch": 0.940778965474874, + "grad_norm": 1.2935486099028568, + "learning_rate": 3.1936892992054155e-07, + "loss": 0.5539, + "step": 7657 + }, + { + "epoch": 0.9409018306917312, + "grad_norm": 1.0840939951415647, + "learning_rate": 3.18049820496803e-07, + "loss": 0.5691, + "step": 7658 + }, + { + "epoch": 0.9410246959085883, + "grad_norm": 1.2148124782213034, + "learning_rate": 3.167334117225834e-07, + "loss": 0.5931, + "step": 7659 + }, + { + "epoch": 0.9411475611254454, + "grad_norm": 1.156851033705549, + "learning_rate": 3.154197038400275e-07, + "loss": 0.4651, + "step": 7660 + }, + { + "epoch": 0.9412704263423025, + "grad_norm": 1.3142138367922893, + "learning_rate": 3.141086970907853e-07, + "loss": 0.5899, + "step": 7661 + }, + { + "epoch": 0.9413932915591596, + "grad_norm": 1.4086110045504152, + "learning_rate": 3.1280039171600715e-07, + "loss": 0.4885, + "step": 7662 + }, + { + "epoch": 0.9415161567760167, + "grad_norm": 1.3394471574017637, + "learning_rate": 3.1149478795634736e-07, + "loss": 0.7062, + "step": 7663 + }, + { + "epoch": 0.9416390219928739, + "grad_norm": 1.276759628210885, + "learning_rate": 3.1019188605196035e-07, + "loss": 0.606, + "step": 7664 + }, + { + "epoch": 0.941761887209731, + "grad_norm": 1.3121554144422776, + "learning_rate": 3.088916862425112e-07, + "loss": 0.6494, + "step": 7665 + }, + { + "epoch": 0.941884752426588, + "grad_norm": 1.2061823434607322, + "learning_rate": 3.0759418876716183e-07, + "loss": 0.5366, + "step": 7666 + }, + { + "epoch": 0.9420076176434451, + "grad_norm": 1.1761001464353842, + "learning_rate": 3.062993938645781e-07, + "loss": 0.5473, + "step": 7667 + }, + { + "epoch": 0.9421304828603022, + "grad_norm": 1.0445555063555247, + "learning_rate": 3.0500730177292604e-07, + 
"loss": 0.4826, + "step": 7668 + }, + { + "epoch": 0.9422533480771593, + "grad_norm": 1.1286654676033918, + "learning_rate": 3.037179127298823e-07, + "loss": 0.5783, + "step": 7669 + }, + { + "epoch": 0.9423762132940164, + "grad_norm": 0.8991888517416979, + "learning_rate": 3.024312269726204e-07, + "loss": 0.5654, + "step": 7670 + }, + { + "epoch": 0.9424990785108736, + "grad_norm": 1.080597445653462, + "learning_rate": 3.0114724473781443e-07, + "loss": 0.4585, + "step": 7671 + }, + { + "epoch": 0.9426219437277307, + "grad_norm": 1.5146910984784896, + "learning_rate": 2.998659662616504e-07, + "loss": 0.5183, + "step": 7672 + }, + { + "epoch": 0.9427448089445878, + "grad_norm": 1.1604855129824596, + "learning_rate": 2.985873917798082e-07, + "loss": 0.4901, + "step": 7673 + }, + { + "epoch": 0.9428676741614449, + "grad_norm": 1.3546162155912078, + "learning_rate": 2.97311521527473e-07, + "loss": 0.6506, + "step": 7674 + }, + { + "epoch": 0.942990539378302, + "grad_norm": 1.1104357296521736, + "learning_rate": 2.9603835573933034e-07, + "loss": 0.4881, + "step": 7675 + }, + { + "epoch": 0.9431134045951591, + "grad_norm": 1.2167835179437068, + "learning_rate": 2.947678946495763e-07, + "loss": 0.5145, + "step": 7676 + }, + { + "epoch": 0.9432362698120162, + "grad_norm": 1.2089540758539854, + "learning_rate": 2.935001384919006e-07, + "loss": 0.5732, + "step": 7677 + }, + { + "epoch": 0.9433591350288734, + "grad_norm": 1.2450434970785484, + "learning_rate": 2.9223508749950003e-07, + "loss": 0.5643, + "step": 7678 + }, + { + "epoch": 0.9434820002457305, + "grad_norm": 1.3375545333780399, + "learning_rate": 2.909727419050717e-07, + "loss": 0.5929, + "step": 7679 + }, + { + "epoch": 0.9436048654625876, + "grad_norm": 1.307464360584236, + "learning_rate": 2.89713101940815e-07, + "loss": 0.6049, + "step": 7680 + }, + { + "epoch": 0.9437277306794446, + "grad_norm": 1.1471000077699096, + "learning_rate": 2.8845616783843455e-07, + "loss": 0.5083, + "step": 7681 + }, + { + "epoch": 0.9438505958963017, + "grad_norm": 1.2616424933308497, + "learning_rate": 2.872019398291337e-07, + "loss": 0.6776, + "step": 7682 + }, + { + "epoch": 0.9439734611131588, + "grad_norm": 1.2312749132120369, + "learning_rate": 2.8595041814362124e-07, + "loss": 0.6789, + "step": 7683 + }, + { + "epoch": 0.944096326330016, + "grad_norm": 1.2433560521009637, + "learning_rate": 2.8470160301210304e-07, + "loss": 0.5234, + "step": 7684 + }, + { + "epoch": 0.9442191915468731, + "grad_norm": 1.126740187826306, + "learning_rate": 2.83455494664297e-07, + "loss": 0.6146, + "step": 7685 + }, + { + "epoch": 0.9443420567637302, + "grad_norm": 1.058258586861683, + "learning_rate": 2.822120933294098e-07, + "loss": 0.5567, + "step": 7686 + }, + { + "epoch": 0.9444649219805873, + "grad_norm": 1.4750702341519464, + "learning_rate": 2.8097139923615845e-07, + "loss": 0.6626, + "step": 7687 + }, + { + "epoch": 0.9445877871974444, + "grad_norm": 1.111153597401485, + "learning_rate": 2.797334126127654e-07, + "loss": 0.5399, + "step": 7688 + }, + { + "epoch": 0.9447106524143015, + "grad_norm": 1.0396822088589652, + "learning_rate": 2.784981336869452e-07, + "loss": 0.5825, + "step": 7689 + }, + { + "epoch": 0.9448335176311586, + "grad_norm": 1.0391625286754607, + "learning_rate": 2.772655626859211e-07, + "loss": 0.5604, + "step": 7690 + }, + { + "epoch": 0.9449563828480158, + "grad_norm": 1.1313248465909034, + "learning_rate": 2.7603569983641496e-07, + "loss": 0.511, + "step": 7691 + }, + { + "epoch": 0.9450792480648729, + "grad_norm": 1.2839432594572726, + 
"learning_rate": 2.748085453646559e-07, + "loss": 0.5899, + "step": 7692 + }, + { + "epoch": 0.94520211328173, + "grad_norm": 1.2314748683971133, + "learning_rate": 2.7358409949636674e-07, + "loss": 0.5711, + "step": 7693 + }, + { + "epoch": 0.9453249784985871, + "grad_norm": 1.3251307665458316, + "learning_rate": 2.723623624567789e-07, + "loss": 0.6191, + "step": 7694 + }, + { + "epoch": 0.9454478437154441, + "grad_norm": 1.1777499791145356, + "learning_rate": 2.711433344706227e-07, + "loss": 0.5908, + "step": 7695 + }, + { + "epoch": 0.9455707089323012, + "grad_norm": 1.0237677025867713, + "learning_rate": 2.69927015762132e-07, + "loss": 0.5327, + "step": 7696 + }, + { + "epoch": 0.9456935741491583, + "grad_norm": 1.233825379201078, + "learning_rate": 2.687134065550362e-07, + "loss": 0.5611, + "step": 7697 + }, + { + "epoch": 0.9458164393660154, + "grad_norm": 1.344919927520651, + "learning_rate": 2.675025070725734e-07, + "loss": 0.6251, + "step": 7698 + }, + { + "epoch": 0.9459393045828726, + "grad_norm": 1.0519554151540325, + "learning_rate": 2.662943175374838e-07, + "loss": 0.6561, + "step": 7699 + }, + { + "epoch": 0.9460621697997297, + "grad_norm": 1.074051997762522, + "learning_rate": 2.650888381719996e-07, + "loss": 0.6206, + "step": 7700 + }, + { + "epoch": 0.9461850350165868, + "grad_norm": 1.2115747037814342, + "learning_rate": 2.6388606919786673e-07, + "loss": 0.5607, + "step": 7701 + }, + { + "epoch": 0.9463079002334439, + "grad_norm": 1.2559670442708277, + "learning_rate": 2.626860108363233e-07, + "loss": 0.5294, + "step": 7702 + }, + { + "epoch": 0.946430765450301, + "grad_norm": 1.2130607676126903, + "learning_rate": 2.614886633081143e-07, + "loss": 0.7381, + "step": 7703 + }, + { + "epoch": 0.9465536306671581, + "grad_norm": 1.202303420132463, + "learning_rate": 2.602940268334819e-07, + "loss": 0.6602, + "step": 7704 + }, + { + "epoch": 0.9466764958840153, + "grad_norm": 1.1073351252885164, + "learning_rate": 2.5910210163217376e-07, + "loss": 0.4925, + "step": 7705 + }, + { + "epoch": 0.9467993611008724, + "grad_norm": 1.0949693657888397, + "learning_rate": 2.5791288792343437e-07, + "loss": 0.7064, + "step": 7706 + }, + { + "epoch": 0.9469222263177295, + "grad_norm": 1.2402852638648325, + "learning_rate": 2.567263859260155e-07, + "loss": 0.507, + "step": 7707 + }, + { + "epoch": 0.9470450915345866, + "grad_norm": 1.1407232584373486, + "learning_rate": 2.555425958581642e-07, + "loss": 0.5431, + "step": 7708 + }, + { + "epoch": 0.9471679567514437, + "grad_norm": 1.274230711854219, + "learning_rate": 2.5436151793762964e-07, + "loss": 0.4729, + "step": 7709 + }, + { + "epoch": 0.9472908219683007, + "grad_norm": 1.1734181321872967, + "learning_rate": 2.531831523816663e-07, + "loss": 0.6083, + "step": 7710 + }, + { + "epoch": 0.9474136871851578, + "grad_norm": 1.0863077605639766, + "learning_rate": 2.520074994070243e-07, + "loss": 0.509, + "step": 7711 + }, + { + "epoch": 0.947536552402015, + "grad_norm": 1.273673464036806, + "learning_rate": 2.5083455922996044e-07, + "loss": 0.5552, + "step": 7712 + }, + { + "epoch": 0.9476594176188721, + "grad_norm": 1.0288444341910625, + "learning_rate": 2.496643320662256e-07, + "loss": 0.6034, + "step": 7713 + }, + { + "epoch": 0.9477822828357292, + "grad_norm": 1.1844803873452392, + "learning_rate": 2.484968181310793e-07, + "loss": 0.6305, + "step": 7714 + }, + { + "epoch": 0.9479051480525863, + "grad_norm": 1.0249993631932606, + "learning_rate": 2.4733201763927624e-07, + "loss": 0.5759, + "step": 7715 + }, + { + "epoch": 
0.9480280132694434, + "grad_norm": 0.9913582097930854, + "learning_rate": 2.461699308050752e-07, + "loss": 0.6718, + "step": 7716 + }, + { + "epoch": 0.9481508784863005, + "grad_norm": 1.105959546725019, + "learning_rate": 2.450105578422318e-07, + "loss": 0.5243, + "step": 7717 + }, + { + "epoch": 0.9482737437031576, + "grad_norm": 1.1951080716465345, + "learning_rate": 2.438538989640071e-07, + "loss": 0.6769, + "step": 7718 + }, + { + "epoch": 0.9483966089200148, + "grad_norm": 1.1550996990946776, + "learning_rate": 2.4269995438316093e-07, + "loss": 0.5139, + "step": 7719 + }, + { + "epoch": 0.9485194741368719, + "grad_norm": 1.4942744161779253, + "learning_rate": 2.415487243119535e-07, + "loss": 0.5157, + "step": 7720 + }, + { + "epoch": 0.948642339353729, + "grad_norm": 0.9714307014394438, + "learning_rate": 2.404002089621471e-07, + "loss": 0.6096, + "step": 7721 + }, + { + "epoch": 0.9487652045705861, + "grad_norm": 1.3642022194367578, + "learning_rate": 2.3925440854500104e-07, + "loss": 0.5703, + "step": 7722 + }, + { + "epoch": 0.9488880697874432, + "grad_norm": 1.0515903433462852, + "learning_rate": 2.3811132327128172e-07, + "loss": 0.4996, + "step": 7723 + }, + { + "epoch": 0.9490109350043003, + "grad_norm": 1.230839537686157, + "learning_rate": 2.369709533512493e-07, + "loss": 0.5716, + "step": 7724 + }, + { + "epoch": 0.9491338002211573, + "grad_norm": 1.1346337483475637, + "learning_rate": 2.3583329899466765e-07, + "loss": 0.6728, + "step": 7725 + }, + { + "epoch": 0.9492566654380145, + "grad_norm": 1.2318020765376274, + "learning_rate": 2.346983604108044e-07, + "loss": 0.5783, + "step": 7726 + }, + { + "epoch": 0.9493795306548716, + "grad_norm": 1.2400669949142467, + "learning_rate": 2.3356613780841919e-07, + "loss": 0.5432, + "step": 7727 + }, + { + "epoch": 0.9495023958717287, + "grad_norm": 1.3675173036290396, + "learning_rate": 2.3243663139578042e-07, + "loss": 0.5681, + "step": 7728 + }, + { + "epoch": 0.9496252610885858, + "grad_norm": 1.1953108988535237, + "learning_rate": 2.3130984138065026e-07, + "loss": 0.4946, + "step": 7729 + }, + { + "epoch": 0.9497481263054429, + "grad_norm": 1.2125222913589289, + "learning_rate": 2.301857679702979e-07, + "loss": 0.5587, + "step": 7730 + }, + { + "epoch": 0.9498709915223, + "grad_norm": 1.0323853121591349, + "learning_rate": 2.2906441137148793e-07, + "loss": 0.54, + "step": 7731 + }, + { + "epoch": 0.9499938567391571, + "grad_norm": 1.307333243913769, + "learning_rate": 2.2794577179048702e-07, + "loss": 0.5525, + "step": 7732 + }, + { + "epoch": 0.9501167219560143, + "grad_norm": 1.1259516507745981, + "learning_rate": 2.2682984943305894e-07, + "loss": 0.5894, + "step": 7733 + }, + { + "epoch": 0.9502395871728714, + "grad_norm": 1.3993593866404728, + "learning_rate": 2.2571664450447616e-07, + "loss": 0.575, + "step": 7734 + }, + { + "epoch": 0.9503624523897285, + "grad_norm": 1.1668009973224958, + "learning_rate": 2.2460615720949984e-07, + "loss": 0.5976, + "step": 7735 + }, + { + "epoch": 0.9504853176065856, + "grad_norm": 1.0209570869610247, + "learning_rate": 2.2349838775239828e-07, + "loss": 0.6417, + "step": 7736 + }, + { + "epoch": 0.9506081828234427, + "grad_norm": 1.0907213377045921, + "learning_rate": 2.2239333633694182e-07, + "loss": 0.6428, + "step": 7737 + }, + { + "epoch": 0.9507310480402998, + "grad_norm": 1.268297329102986, + "learning_rate": 2.2129100316639282e-07, + "loss": 0.6551, + "step": 7738 + }, + { + "epoch": 0.9508539132571568, + "grad_norm": 1.1150788543940735, + "learning_rate": 2.2019138844352249e-07, + 
"loss": 0.5866, + "step": 7739 + }, + { + "epoch": 0.950976778474014, + "grad_norm": 1.256398518975664, + "learning_rate": 2.19094492370594e-07, + "loss": 0.6194, + "step": 7740 + }, + { + "epoch": 0.9510996436908711, + "grad_norm": 0.9816137782255879, + "learning_rate": 2.1800031514937757e-07, + "loss": 0.5503, + "step": 7741 + }, + { + "epoch": 0.9512225089077282, + "grad_norm": 1.296707386615462, + "learning_rate": 2.1690885698113728e-07, + "loss": 0.566, + "step": 7742 + }, + { + "epoch": 0.9513453741245853, + "grad_norm": 1.330449368016545, + "learning_rate": 2.1582011806664248e-07, + "loss": 0.583, + "step": 7743 + }, + { + "epoch": 0.9514682393414424, + "grad_norm": 1.1708921784677875, + "learning_rate": 2.1473409860615635e-07, + "loss": 0.4582, + "step": 7744 + }, + { + "epoch": 0.9515911045582995, + "grad_norm": 1.1733241553795841, + "learning_rate": 2.1365079879944904e-07, + "loss": 0.6118, + "step": 7745 + }, + { + "epoch": 0.9517139697751567, + "grad_norm": 1.3774588336233446, + "learning_rate": 2.1257021884578286e-07, + "loss": 0.5544, + "step": 7746 + }, + { + "epoch": 0.9518368349920138, + "grad_norm": 1.120622943860527, + "learning_rate": 2.114923589439255e-07, + "loss": 0.538, + "step": 7747 + }, + { + "epoch": 0.9519597002088709, + "grad_norm": 1.8646951394917248, + "learning_rate": 2.1041721929214163e-07, + "loss": 0.5723, + "step": 7748 + }, + { + "epoch": 0.952082565425728, + "grad_norm": 1.1445923739800998, + "learning_rate": 2.0934480008819645e-07, + "loss": 0.5665, + "step": 7749 + }, + { + "epoch": 0.9522054306425851, + "grad_norm": 1.3660339368907883, + "learning_rate": 2.0827510152935546e-07, + "loss": 0.6652, + "step": 7750 + }, + { + "epoch": 0.9523282958594422, + "grad_norm": 1.0695277123321518, + "learning_rate": 2.0720812381238131e-07, + "loss": 0.5386, + "step": 7751 + }, + { + "epoch": 0.9524511610762993, + "grad_norm": 1.2304844276253297, + "learning_rate": 2.0614386713353696e-07, + "loss": 0.5952, + "step": 7752 + }, + { + "epoch": 0.9525740262931565, + "grad_norm": 1.0126325743751725, + "learning_rate": 2.0508233168858749e-07, + "loss": 0.6165, + "step": 7753 + }, + { + "epoch": 0.9526968915100135, + "grad_norm": 1.2989811539676857, + "learning_rate": 2.040235176727967e-07, + "loss": 0.5372, + "step": 7754 + }, + { + "epoch": 0.9528197567268706, + "grad_norm": 1.1930240723499068, + "learning_rate": 2.0296742528092216e-07, + "loss": 0.5509, + "step": 7755 + }, + { + "epoch": 0.9529426219437277, + "grad_norm": 1.2895261776846862, + "learning_rate": 2.0191405470722847e-07, + "loss": 0.5655, + "step": 7756 + }, + { + "epoch": 0.9530654871605848, + "grad_norm": 1.5238762000936108, + "learning_rate": 2.008634061454756e-07, + "loss": 0.5791, + "step": 7757 + }, + { + "epoch": 0.9531883523774419, + "grad_norm": 1.2362623829606096, + "learning_rate": 1.9981547978892234e-07, + "loss": 0.6902, + "step": 7758 + }, + { + "epoch": 0.953311217594299, + "grad_norm": 1.0647834371014968, + "learning_rate": 1.9877027583032947e-07, + "loss": 0.5089, + "step": 7759 + }, + { + "epoch": 0.9534340828111562, + "grad_norm": 1.098431183844803, + "learning_rate": 1.9772779446195488e-07, + "loss": 0.5215, + "step": 7760 + }, + { + "epoch": 0.9535569480280133, + "grad_norm": 1.1371855354091476, + "learning_rate": 1.966880358755585e-07, + "loss": 0.6419, + "step": 7761 + }, + { + "epoch": 0.9536798132448704, + "grad_norm": 1.0222149762098098, + "learning_rate": 1.9565100026239237e-07, + "loss": 0.5949, + "step": 7762 + }, + { + "epoch": 0.9538026784617275, + "grad_norm": 
1.3273189733477422, + "learning_rate": 1.9461668781321717e-07, + "loss": 0.5318, + "step": 7763 + }, + { + "epoch": 0.9539255436785846, + "grad_norm": 1.1809587661422711, + "learning_rate": 1.9358509871828577e-07, + "loss": 0.5696, + "step": 7764 + }, + { + "epoch": 0.9540484088954417, + "grad_norm": 1.2833313356330704, + "learning_rate": 1.925562331673514e-07, + "loss": 0.4753, + "step": 7765 + }, + { + "epoch": 0.9541712741122989, + "grad_norm": 1.180651274838312, + "learning_rate": 1.9153009134966926e-07, + "loss": 0.5781, + "step": 7766 + }, + { + "epoch": 0.954294139329156, + "grad_norm": 1.2833201169043948, + "learning_rate": 1.905066734539884e-07, + "loss": 0.4828, + "step": 7767 + }, + { + "epoch": 0.954417004546013, + "grad_norm": 1.0623706044866086, + "learning_rate": 1.894859796685633e-07, + "loss": 0.6133, + "step": 7768 + }, + { + "epoch": 0.9545398697628701, + "grad_norm": 1.462711857768134, + "learning_rate": 1.884680101811437e-07, + "loss": 0.5146, + "step": 7769 + }, + { + "epoch": 0.9546627349797272, + "grad_norm": 1.193002284134927, + "learning_rate": 1.8745276517897647e-07, + "loss": 0.6401, + "step": 7770 + }, + { + "epoch": 0.9547856001965843, + "grad_norm": 1.2366251851023828, + "learning_rate": 1.8644024484880894e-07, + "loss": 0.5428, + "step": 7771 + }, + { + "epoch": 0.9549084654134414, + "grad_norm": 1.157281241312102, + "learning_rate": 1.8543044937689213e-07, + "loss": 0.6449, + "step": 7772 + }, + { + "epoch": 0.9550313306302985, + "grad_norm": 1.1095978004926672, + "learning_rate": 1.8442337894896577e-07, + "loss": 0.5514, + "step": 7773 + }, + { + "epoch": 0.9551541958471557, + "grad_norm": 1.131583384029103, + "learning_rate": 1.8341903375027836e-07, + "loss": 0.5175, + "step": 7774 + }, + { + "epoch": 0.9552770610640128, + "grad_norm": 1.3994685950103056, + "learning_rate": 1.8241741396557044e-07, + "loss": 0.5701, + "step": 7775 + }, + { + "epoch": 0.9553999262808699, + "grad_norm": 1.149376941646617, + "learning_rate": 1.8141851977908298e-07, + "loss": 0.633, + "step": 7776 + }, + { + "epoch": 0.955522791497727, + "grad_norm": 1.128274504291997, + "learning_rate": 1.804223513745573e-07, + "loss": 0.6017, + "step": 7777 + }, + { + "epoch": 0.9556456567145841, + "grad_norm": 1.1978741058322593, + "learning_rate": 1.7942890893523022e-07, + "loss": 0.5591, + "step": 7778 + }, + { + "epoch": 0.9557685219314412, + "grad_norm": 1.2382695270097392, + "learning_rate": 1.7843819264384386e-07, + "loss": 0.557, + "step": 7779 + }, + { + "epoch": 0.9558913871482984, + "grad_norm": 1.3979740022736589, + "learning_rate": 1.7745020268262746e-07, + "loss": 0.599, + "step": 7780 + }, + { + "epoch": 0.9560142523651555, + "grad_norm": 1.0870985096580672, + "learning_rate": 1.7646493923332063e-07, + "loss": 0.6419, + "step": 7781 + }, + { + "epoch": 0.9561371175820126, + "grad_norm": 1.2025885596609573, + "learning_rate": 1.7548240247715342e-07, + "loss": 0.5946, + "step": 7782 + }, + { + "epoch": 0.9562599827988696, + "grad_norm": 1.1592519263874321, + "learning_rate": 1.745025925948579e-07, + "loss": 0.6325, + "step": 7783 + }, + { + "epoch": 0.9563828480157267, + "grad_norm": 1.467817384611979, + "learning_rate": 1.7352550976666493e-07, + "loss": 0.5258, + "step": 7784 + }, + { + "epoch": 0.9565057132325838, + "grad_norm": 1.326984590855388, + "learning_rate": 1.725511541723007e-07, + "loss": 0.6657, + "step": 7785 + }, + { + "epoch": 0.9566285784494409, + "grad_norm": 1.2537415624659347, + "learning_rate": 1.7157952599099192e-07, + "loss": 0.4361, + "step": 7786 + }, + 
{ + "epoch": 0.956751443666298, + "grad_norm": 1.2336630612375346, + "learning_rate": 1.7061062540146387e-07, + "loss": 0.5907, + "step": 7787 + }, + { + "epoch": 0.9568743088831552, + "grad_norm": 1.1893502296910492, + "learning_rate": 1.6964445258193906e-07, + "loss": 0.5646, + "step": 7788 + }, + { + "epoch": 0.9569971741000123, + "grad_norm": 1.00979372986818, + "learning_rate": 1.6868100771014027e-07, + "loss": 0.5768, + "step": 7789 + }, + { + "epoch": 0.9571200393168694, + "grad_norm": 1.4393506909322724, + "learning_rate": 1.677202909632841e-07, + "loss": 0.5672, + "step": 7790 + }, + { + "epoch": 0.9572429045337265, + "grad_norm": 1.0784113284944765, + "learning_rate": 1.6676230251809088e-07, + "loss": 0.524, + "step": 7791 + }, + { + "epoch": 0.9573657697505836, + "grad_norm": 0.9169108804801552, + "learning_rate": 1.6580704255077295e-07, + "loss": 0.7061, + "step": 7792 + }, + { + "epoch": 0.9574886349674407, + "grad_norm": 1.2076275878828486, + "learning_rate": 1.6485451123704974e-07, + "loss": 0.5377, + "step": 7793 + }, + { + "epoch": 0.9576115001842979, + "grad_norm": 0.9689551870524671, + "learning_rate": 1.6390470875212615e-07, + "loss": 0.5424, + "step": 7794 + }, + { + "epoch": 0.957734365401155, + "grad_norm": 1.1978926816427216, + "learning_rate": 1.6295763527071906e-07, + "loss": 0.601, + "step": 7795 + }, + { + "epoch": 0.9578572306180121, + "grad_norm": 1.129254135959243, + "learning_rate": 1.6201329096703076e-07, + "loss": 0.7094, + "step": 7796 + }, + { + "epoch": 0.9579800958348691, + "grad_norm": 0.9335342572751679, + "learning_rate": 1.6107167601477235e-07, + "loss": 0.5668, + "step": 7797 + }, + { + "epoch": 0.9581029610517262, + "grad_norm": 1.319627733046934, + "learning_rate": 1.6013279058714357e-07, + "loss": 0.5457, + "step": 7798 + }, + { + "epoch": 0.9582258262685833, + "grad_norm": 1.1800137571930962, + "learning_rate": 1.5919663485684965e-07, + "loss": 0.5947, + "step": 7799 + }, + { + "epoch": 0.9583486914854404, + "grad_norm": 1.2660377373128198, + "learning_rate": 1.5826320899608616e-07, + "loss": 0.5229, + "step": 7800 + }, + { + "epoch": 0.9584715567022976, + "grad_norm": 1.0904579243787162, + "learning_rate": 1.5733251317655574e-07, + "loss": 0.5415, + "step": 7801 + }, + { + "epoch": 0.9585944219191547, + "grad_norm": 1.2779831951737801, + "learning_rate": 1.5640454756945144e-07, + "loss": 0.5257, + "step": 7802 + }, + { + "epoch": 0.9587172871360118, + "grad_norm": 1.3894241237869698, + "learning_rate": 1.554793123454651e-07, + "loss": 0.5656, + "step": 7803 + }, + { + "epoch": 0.9588401523528689, + "grad_norm": 1.2782347841819308, + "learning_rate": 1.5455680767479053e-07, + "loss": 0.4775, + "step": 7804 + }, + { + "epoch": 0.958963017569726, + "grad_norm": 1.2547011304764095, + "learning_rate": 1.5363703372711368e-07, + "loss": 0.4802, + "step": 7805 + }, + { + "epoch": 0.9590858827865831, + "grad_norm": 1.5929000502170487, + "learning_rate": 1.5271999067162256e-07, + "loss": 0.5811, + "step": 7806 + }, + { + "epoch": 0.9592087480034402, + "grad_norm": 1.2630847855860625, + "learning_rate": 1.5180567867700223e-07, + "loss": 0.5442, + "step": 7807 + }, + { + "epoch": 0.9593316132202974, + "grad_norm": 1.2416497962759796, + "learning_rate": 1.5089409791143316e-07, + "loss": 0.6059, + "step": 7808 + }, + { + "epoch": 0.9594544784371545, + "grad_norm": 1.2998795044066904, + "learning_rate": 1.4998524854259454e-07, + "loss": 0.5373, + "step": 7809 + }, + { + "epoch": 0.9595773436540116, + "grad_norm": 1.231736791491449, + "learning_rate": 
1.4907913073766432e-07, + "loss": 0.4938, + "step": 7810 + }, + { + "epoch": 0.9597002088708687, + "grad_norm": 1.208974796872807, + "learning_rate": 1.4817574466331586e-07, + "loss": 0.5234, + "step": 7811 + }, + { + "epoch": 0.9598230740877257, + "grad_norm": 1.1224774242808986, + "learning_rate": 1.4727509048572118e-07, + "loss": 0.631, + "step": 7812 + }, + { + "epoch": 0.9599459393045828, + "grad_norm": 1.3679978207043557, + "learning_rate": 1.4637716837055115e-07, + "loss": 0.5635, + "step": 7813 + }, + { + "epoch": 0.96006880452144, + "grad_norm": 1.2163918254298796, + "learning_rate": 1.4548197848297194e-07, + "loss": 0.6382, + "step": 7814 + }, + { + "epoch": 0.9601916697382971, + "grad_norm": 1.0043549220685004, + "learning_rate": 1.4458952098764688e-07, + "loss": 0.5331, + "step": 7815 + }, + { + "epoch": 0.9603145349551542, + "grad_norm": 1.0330617932711146, + "learning_rate": 1.4369979604873962e-07, + "loss": 0.5411, + "step": 7816 + }, + { + "epoch": 0.9604374001720113, + "grad_norm": 1.2646790002677737, + "learning_rate": 1.4281280382990758e-07, + "loss": 0.5867, + "step": 7817 + }, + { + "epoch": 0.9605602653888684, + "grad_norm": 1.4945623973963051, + "learning_rate": 1.419285444943086e-07, + "loss": 0.5834, + "step": 7818 + }, + { + "epoch": 0.9606831306057255, + "grad_norm": 1.0294876991311295, + "learning_rate": 1.4104701820459588e-07, + "loss": 0.6015, + "step": 7819 + }, + { + "epoch": 0.9608059958225826, + "grad_norm": 1.1897915624463136, + "learning_rate": 1.4016822512292138e-07, + "loss": 0.5649, + "step": 7820 + }, + { + "epoch": 0.9609288610394398, + "grad_norm": 1.2220832203509522, + "learning_rate": 1.3929216541093083e-07, + "loss": 0.4561, + "step": 7821 + }, + { + "epoch": 0.9610517262562969, + "grad_norm": 1.0416252484187243, + "learning_rate": 1.3841883922977194e-07, + "loss": 0.5972, + "step": 7822 + }, + { + "epoch": 0.961174591473154, + "grad_norm": 1.558740683134636, + "learning_rate": 1.3754824674008792e-07, + "loss": 0.6514, + "step": 7823 + }, + { + "epoch": 0.9612974566900111, + "grad_norm": 1.1655806424345687, + "learning_rate": 1.3668038810201565e-07, + "loss": 0.553, + "step": 7824 + }, + { + "epoch": 0.9614203219068682, + "grad_norm": 1.2271037192766345, + "learning_rate": 1.3581526347519414e-07, + "loss": 0.5399, + "step": 7825 + }, + { + "epoch": 0.9615431871237252, + "grad_norm": 1.07035049010988, + "learning_rate": 1.3495287301875936e-07, + "loss": 0.5396, + "step": 7826 + }, + { + "epoch": 0.9616660523405823, + "grad_norm": 1.062820416160557, + "learning_rate": 1.3409321689133947e-07, + "loss": 0.4961, + "step": 7827 + }, + { + "epoch": 0.9617889175574394, + "grad_norm": 1.103191521914067, + "learning_rate": 1.3323629525106295e-07, + "loss": 0.6166, + "step": 7828 + }, + { + "epoch": 0.9619117827742966, + "grad_norm": 1.0478264960751524, + "learning_rate": 1.3238210825555542e-07, + "loss": 0.6083, + "step": 7829 + }, + { + "epoch": 0.9620346479911537, + "grad_norm": 1.0203360179324665, + "learning_rate": 1.3153065606193948e-07, + "loss": 0.5818, + "step": 7830 + }, + { + "epoch": 0.9621575132080108, + "grad_norm": 1.1661228962540515, + "learning_rate": 1.3068193882683488e-07, + "loss": 0.5798, + "step": 7831 + }, + { + "epoch": 0.9622803784248679, + "grad_norm": 1.2853486585145466, + "learning_rate": 1.2983595670635507e-07, + "loss": 0.5356, + "step": 7832 + }, + { + "epoch": 0.962403243641725, + "grad_norm": 1.2828433160849324, + "learning_rate": 1.2899270985611555e-07, + "loss": 0.5182, + "step": 7833 + }, + { + "epoch": 
0.9625261088585821, + "grad_norm": 1.1228234217251525, + "learning_rate": 1.281521984312256e-07, + "loss": 0.413, + "step": 7834 + }, + { + "epoch": 0.9626489740754393, + "grad_norm": 1.259596506910912, + "learning_rate": 1.2731442258629156e-07, + "loss": 0.5704, + "step": 7835 + }, + { + "epoch": 0.9627718392922964, + "grad_norm": 1.0664921966998073, + "learning_rate": 1.2647938247541345e-07, + "loss": 0.5263, + "step": 7836 + }, + { + "epoch": 0.9628947045091535, + "grad_norm": 1.0271684271424386, + "learning_rate": 1.2564707825219845e-07, + "loss": 0.6159, + "step": 7837 + }, + { + "epoch": 0.9630175697260106, + "grad_norm": 1.2240750363396535, + "learning_rate": 1.2481751006973908e-07, + "loss": 0.6253, + "step": 7838 + }, + { + "epoch": 0.9631404349428677, + "grad_norm": 1.2712395672077883, + "learning_rate": 1.2399067808062992e-07, + "loss": 0.5662, + "step": 7839 + }, + { + "epoch": 0.9632633001597248, + "grad_norm": 1.1310653029322084, + "learning_rate": 1.23166582436961e-07, + "loss": 0.4871, + "step": 7840 + }, + { + "epoch": 0.9633861653765818, + "grad_norm": 1.138013784386157, + "learning_rate": 1.2234522329031773e-07, + "loss": 0.5243, + "step": 7841 + }, + { + "epoch": 0.963509030593439, + "grad_norm": 1.6357193906220397, + "learning_rate": 1.2152660079178923e-07, + "loss": 0.5472, + "step": 7842 + }, + { + "epoch": 0.9636318958102961, + "grad_norm": 1.01002401548189, + "learning_rate": 1.2071071509194842e-07, + "loss": 0.5062, + "step": 7843 + }, + { + "epoch": 0.9637547610271532, + "grad_norm": 1.183643709581267, + "learning_rate": 1.1989756634087856e-07, + "loss": 0.534, + "step": 7844 + }, + { + "epoch": 0.9638776262440103, + "grad_norm": 1.4632212109028846, + "learning_rate": 1.1908715468815002e-07, + "loss": 0.5903, + "step": 7845 + }, + { + "epoch": 0.9640004914608674, + "grad_norm": 1.2603680441770666, + "learning_rate": 1.1827948028283353e-07, + "loss": 0.6708, + "step": 7846 + }, + { + "epoch": 0.9641233566777245, + "grad_norm": 0.9238407645808355, + "learning_rate": 1.174745432734936e-07, + "loss": 0.6333, + "step": 7847 + }, + { + "epoch": 0.9642462218945816, + "grad_norm": 1.435427910260267, + "learning_rate": 1.166723438081968e-07, + "loss": 0.5737, + "step": 7848 + }, + { + "epoch": 0.9643690871114388, + "grad_norm": 1.3445081587757026, + "learning_rate": 1.1587288203450008e-07, + "loss": 0.4705, + "step": 7849 + }, + { + "epoch": 0.9644919523282959, + "grad_norm": 1.263315501445857, + "learning_rate": 1.1507615809945915e-07, + "loss": 0.6518, + "step": 7850 + }, + { + "epoch": 0.964614817545153, + "grad_norm": 1.8432322424374505, + "learning_rate": 1.1428217214962677e-07, + "loss": 0.6743, + "step": 7851 + }, + { + "epoch": 0.9647376827620101, + "grad_norm": 1.0715337610122606, + "learning_rate": 1.1349092433105279e-07, + "loss": 0.5433, + "step": 7852 + }, + { + "epoch": 0.9648605479788672, + "grad_norm": 1.1324431929266474, + "learning_rate": 1.1270241478927912e-07, + "loss": 0.7205, + "step": 7853 + }, + { + "epoch": 0.9649834131957243, + "grad_norm": 1.2023922937365075, + "learning_rate": 1.1191664366934973e-07, + "loss": 0.5672, + "step": 7854 + }, + { + "epoch": 0.9651062784125815, + "grad_norm": 1.4412746126662541, + "learning_rate": 1.1113361111580067e-07, + "loss": 0.5859, + "step": 7855 + }, + { + "epoch": 0.9652291436294385, + "grad_norm": 1.4758271084433805, + "learning_rate": 1.1035331727266673e-07, + "loss": 0.585, + "step": 7856 + }, + { + "epoch": 0.9653520088462956, + "grad_norm": 1.3513443958017548, + "learning_rate": 1.095757622834781e-07, + 
"loss": 0.5381, + "step": 7857 + }, + { + "epoch": 0.9654748740631527, + "grad_norm": 1.0897966685427745, + "learning_rate": 1.088009462912587e-07, + "loss": 0.5588, + "step": 7858 + }, + { + "epoch": 0.9655977392800098, + "grad_norm": 1.0221207644919628, + "learning_rate": 1.0802886943853285e-07, + "loss": 0.5543, + "step": 7859 + }, + { + "epoch": 0.9657206044968669, + "grad_norm": 1.2854865583545483, + "learning_rate": 1.0725953186731863e-07, + "loss": 0.4937, + "step": 7860 + }, + { + "epoch": 0.965843469713724, + "grad_norm": 1.2437548804979761, + "learning_rate": 1.0649293371913115e-07, + "loss": 0.53, + "step": 7861 + }, + { + "epoch": 0.9659663349305811, + "grad_norm": 1.3344262165983112, + "learning_rate": 1.0572907513498097e-07, + "loss": 0.467, + "step": 7862 + }, + { + "epoch": 0.9660892001474383, + "grad_norm": 1.2797513428434228, + "learning_rate": 1.0496795625537403e-07, + "loss": 0.5429, + "step": 7863 + }, + { + "epoch": 0.9662120653642954, + "grad_norm": 1.595350582486819, + "learning_rate": 1.0420957722031333e-07, + "loss": 0.5748, + "step": 7864 + }, + { + "epoch": 0.9663349305811525, + "grad_norm": 1.0069494447636738, + "learning_rate": 1.0345393816929893e-07, + "loss": 0.5515, + "step": 7865 + }, + { + "epoch": 0.9664577957980096, + "grad_norm": 1.0030442090378906, + "learning_rate": 1.0270103924132467e-07, + "loss": 0.54, + "step": 7866 + }, + { + "epoch": 0.9665806610148667, + "grad_norm": 1.1874198432200562, + "learning_rate": 1.0195088057488311e-07, + "loss": 0.4029, + "step": 7867 + }, + { + "epoch": 0.9667035262317238, + "grad_norm": 1.0127673890376376, + "learning_rate": 1.0120346230795884e-07, + "loss": 0.5571, + "step": 7868 + }, + { + "epoch": 0.966826391448581, + "grad_norm": 1.0434956882061928, + "learning_rate": 1.0045878457803692e-07, + "loss": 0.5062, + "step": 7869 + }, + { + "epoch": 0.966949256665438, + "grad_norm": 1.0843732012955516, + "learning_rate": 9.971684752209276e-08, + "loss": 0.5623, + "step": 7870 + }, + { + "epoch": 0.9670721218822951, + "grad_norm": 0.9709289290507604, + "learning_rate": 9.897765127660386e-08, + "loss": 0.6182, + "step": 7871 + }, + { + "epoch": 0.9671949870991522, + "grad_norm": 1.15068633244589, + "learning_rate": 9.824119597753811e-08, + "loss": 0.5617, + "step": 7872 + }, + { + "epoch": 0.9673178523160093, + "grad_norm": 1.1685937495762564, + "learning_rate": 9.75074817603655e-08, + "loss": 0.6076, + "step": 7873 + }, + { + "epoch": 0.9674407175328664, + "grad_norm": 1.106623121785899, + "learning_rate": 9.677650876004307e-08, + "loss": 0.5953, + "step": 7874 + }, + { + "epoch": 0.9675635827497235, + "grad_norm": 1.149518495285776, + "learning_rate": 9.604827711103326e-08, + "loss": 0.4912, + "step": 7875 + }, + { + "epoch": 0.9676864479665807, + "grad_norm": 1.1651617580859603, + "learning_rate": 9.532278694728557e-08, + "loss": 0.5402, + "step": 7876 + }, + { + "epoch": 0.9678093131834378, + "grad_norm": 1.1016711358611102, + "learning_rate": 9.460003840225162e-08, + "loss": 0.5845, + "step": 7877 + }, + { + "epoch": 0.9679321784002949, + "grad_norm": 1.058526545480281, + "learning_rate": 9.388003160887503e-08, + "loss": 0.6663, + "step": 7878 + }, + { + "epoch": 0.968055043617152, + "grad_norm": 1.1322672398107858, + "learning_rate": 9.316276669959822e-08, + "loss": 0.5523, + "step": 7879 + }, + { + "epoch": 0.9681779088340091, + "grad_norm": 1.2170712987449457, + "learning_rate": 9.244824380635564e-08, + "loss": 0.51, + "step": 7880 + }, + { + "epoch": 0.9683007740508662, + "grad_norm": 1.138276174872761, + 
"learning_rate": 9.173646306058048e-08, + "loss": 0.5533, + "step": 7881 + }, + { + "epoch": 0.9684236392677233, + "grad_norm": 1.4696893534943545, + "learning_rate": 9.102742459319802e-08, + "loss": 0.5779, + "step": 7882 + }, + { + "epoch": 0.9685465044845805, + "grad_norm": 1.2321755547085937, + "learning_rate": 9.032112853463393e-08, + "loss": 0.602, + "step": 7883 + }, + { + "epoch": 0.9686693697014376, + "grad_norm": 1.480143059616058, + "learning_rate": 8.961757501480595e-08, + "loss": 0.477, + "step": 7884 + }, + { + "epoch": 0.9687922349182946, + "grad_norm": 1.3018568987357602, + "learning_rate": 8.891676416312722e-08, + "loss": 0.6656, + "step": 7885 + }, + { + "epoch": 0.9689151001351517, + "grad_norm": 1.3736015772222996, + "learning_rate": 8.82186961085063e-08, + "loss": 0.5592, + "step": 7886 + }, + { + "epoch": 0.9690379653520088, + "grad_norm": 1.06676897312019, + "learning_rate": 8.752337097935215e-08, + "loss": 0.6257, + "step": 7887 + }, + { + "epoch": 0.9691608305688659, + "grad_norm": 1.1760388033941362, + "learning_rate": 8.683078890356245e-08, + "loss": 0.675, + "step": 7888 + }, + { + "epoch": 0.969283695785723, + "grad_norm": 1.3283605240580882, + "learning_rate": 8.614095000853361e-08, + "loss": 0.4837, + "step": 7889 + }, + { + "epoch": 0.9694065610025802, + "grad_norm": 1.2929912986975878, + "learning_rate": 8.545385442115749e-08, + "loss": 0.521, + "step": 7890 + }, + { + "epoch": 0.9695294262194373, + "grad_norm": 1.0361170708801757, + "learning_rate": 8.476950226782131e-08, + "loss": 0.5212, + "step": 7891 + }, + { + "epoch": 0.9696522914362944, + "grad_norm": 1.5784468490084351, + "learning_rate": 8.408789367440606e-08, + "loss": 0.5681, + "step": 7892 + }, + { + "epoch": 0.9697751566531515, + "grad_norm": 1.0596577671868384, + "learning_rate": 8.340902876628809e-08, + "loss": 0.5127, + "step": 7893 + }, + { + "epoch": 0.9698980218700086, + "grad_norm": 1.098563537368093, + "learning_rate": 8.273290766834252e-08, + "loss": 0.6252, + "step": 7894 + }, + { + "epoch": 0.9700208870868657, + "grad_norm": 1.255153123232875, + "learning_rate": 8.20595305049382e-08, + "loss": 0.6063, + "step": 7895 + }, + { + "epoch": 0.9701437523037229, + "grad_norm": 1.1012402399947991, + "learning_rate": 8.138889739993604e-08, + "loss": 0.5111, + "step": 7896 + }, + { + "epoch": 0.97026661752058, + "grad_norm": 1.178007276576795, + "learning_rate": 8.072100847669572e-08, + "loss": 0.4986, + "step": 7897 + }, + { + "epoch": 0.9703894827374371, + "grad_norm": 1.0949534645993448, + "learning_rate": 8.005586385807063e-08, + "loss": 0.5295, + "step": 7898 + }, + { + "epoch": 0.9705123479542941, + "grad_norm": 1.242704729613472, + "learning_rate": 7.93934636664112e-08, + "loss": 0.5907, + "step": 7899 + }, + { + "epoch": 0.9706352131711512, + "grad_norm": 1.2234788962066618, + "learning_rate": 7.873380802356001e-08, + "loss": 0.603, + "step": 7900 + }, + { + "epoch": 0.9707580783880083, + "grad_norm": 1.3517708866066238, + "learning_rate": 7.807689705085663e-08, + "loss": 0.6634, + "step": 7901 + }, + { + "epoch": 0.9708809436048654, + "grad_norm": 1.7638370668463743, + "learning_rate": 7.742273086913609e-08, + "loss": 0.6677, + "step": 7902 + }, + { + "epoch": 0.9710038088217225, + "grad_norm": 1.3537622310126303, + "learning_rate": 7.677130959872713e-08, + "loss": 0.6169, + "step": 7903 + }, + { + "epoch": 0.9711266740385797, + "grad_norm": 1.0445942288011771, + "learning_rate": 7.612263335945724e-08, + "loss": 0.5362, + "step": 7904 + }, + { + "epoch": 0.9712495392554368, + 
"grad_norm": 1.3524656171439051, + "learning_rate": 7.547670227064263e-08, + "loss": 0.5442, + "step": 7905 + }, + { + "epoch": 0.9713724044722939, + "grad_norm": 1.084863996981803, + "learning_rate": 7.483351645109993e-08, + "loss": 0.4713, + "step": 7906 + }, + { + "epoch": 0.971495269689151, + "grad_norm": 1.202249386489926, + "learning_rate": 7.41930760191395e-08, + "loss": 0.5131, + "step": 7907 + }, + { + "epoch": 0.9716181349060081, + "grad_norm": 1.2916819666004233, + "learning_rate": 7.355538109256377e-08, + "loss": 0.5977, + "step": 7908 + }, + { + "epoch": 0.9717410001228652, + "grad_norm": 1.21514774930444, + "learning_rate": 7.292043178867558e-08, + "loss": 0.5509, + "step": 7909 + }, + { + "epoch": 0.9718638653397224, + "grad_norm": 1.2734012752189985, + "learning_rate": 7.228822822426817e-08, + "loss": 0.7077, + "step": 7910 + }, + { + "epoch": 0.9719867305565795, + "grad_norm": 1.0402355840230186, + "learning_rate": 7.165877051563186e-08, + "loss": 0.5439, + "step": 7911 + }, + { + "epoch": 0.9721095957734366, + "grad_norm": 1.1710558085608551, + "learning_rate": 7.103205877855067e-08, + "loss": 0.4638, + "step": 7912 + }, + { + "epoch": 0.9722324609902937, + "grad_norm": 1.24515789894153, + "learning_rate": 7.040809312830576e-08, + "loss": 0.5224, + "step": 7913 + }, + { + "epoch": 0.9723553262071507, + "grad_norm": 1.0618036784460894, + "learning_rate": 6.978687367966862e-08, + "loss": 0.6131, + "step": 7914 + }, + { + "epoch": 0.9724781914240078, + "grad_norm": 1.2546990018641369, + "learning_rate": 6.91684005469112e-08, + "loss": 0.7781, + "step": 7915 + }, + { + "epoch": 0.9726010566408649, + "grad_norm": 1.1669641811514635, + "learning_rate": 6.855267384379582e-08, + "loss": 0.6181, + "step": 7916 + }, + { + "epoch": 0.972723921857722, + "grad_norm": 1.1762953107896017, + "learning_rate": 6.793969368358355e-08, + "loss": 0.5334, + "step": 7917 + }, + { + "epoch": 0.9728467870745792, + "grad_norm": 1.1754956621957804, + "learning_rate": 6.732946017902586e-08, + "loss": 0.5494, + "step": 7918 + }, + { + "epoch": 0.9729696522914363, + "grad_norm": 1.2601131627612836, + "learning_rate": 6.672197344237296e-08, + "loss": 0.6458, + "step": 7919 + }, + { + "epoch": 0.9730925175082934, + "grad_norm": 1.2555201500510678, + "learning_rate": 6.611723358536547e-08, + "loss": 0.5598, + "step": 7920 + }, + { + "epoch": 0.9732153827251505, + "grad_norm": 1.0137792034608388, + "learning_rate": 6.551524071924442e-08, + "loss": 0.6303, + "step": 7921 + }, + { + "epoch": 0.9733382479420076, + "grad_norm": 1.1763139350291203, + "learning_rate": 6.491599495474288e-08, + "loss": 0.5067, + "step": 7922 + }, + { + "epoch": 0.9734611131588647, + "grad_norm": 1.2708653495468742, + "learning_rate": 6.431949640208434e-08, + "loss": 0.5651, + "step": 7923 + }, + { + "epoch": 0.9735839783757219, + "grad_norm": 1.0036587251052789, + "learning_rate": 6.372574517099439e-08, + "loss": 0.6054, + "step": 7924 + }, + { + "epoch": 0.973706843592579, + "grad_norm": 1.1031794382653075, + "learning_rate": 6.313474137068731e-08, + "loss": 0.602, + "step": 7925 + }, + { + "epoch": 0.9738297088094361, + "grad_norm": 1.2188623832368932, + "learning_rate": 6.254648510987616e-08, + "loss": 0.6484, + "step": 7926 + }, + { + "epoch": 0.9739525740262932, + "grad_norm": 1.3875938360138016, + "learning_rate": 6.196097649676768e-08, + "loss": 0.4916, + "step": 7927 + }, + { + "epoch": 0.9740754392431502, + "grad_norm": 1.0654032931635287, + "learning_rate": 6.13782156390591e-08, + "loss": 0.442, + "step": 7928 + }, + { + 
"epoch": 0.9741983044600073, + "grad_norm": 1.169409280054631, + "learning_rate": 6.079820264394797e-08, + "loss": 0.6159, + "step": 7929 + }, + { + "epoch": 0.9743211696768644, + "grad_norm": 1.0792372220021367, + "learning_rate": 6.022093761812398e-08, + "loss": 0.4995, + "step": 7930 + }, + { + "epoch": 0.9744440348937216, + "grad_norm": 1.1128586648601857, + "learning_rate": 5.964642066776882e-08, + "loss": 0.5698, + "step": 7931 + }, + { + "epoch": 0.9745669001105787, + "grad_norm": 1.2528218279451848, + "learning_rate": 5.907465189856465e-08, + "loss": 0.5263, + "step": 7932 + }, + { + "epoch": 0.9746897653274358, + "grad_norm": 1.2244366225352308, + "learning_rate": 5.8505631415682325e-08, + "loss": 0.5017, + "step": 7933 + }, + { + "epoch": 0.9748126305442929, + "grad_norm": 1.2539076609290896, + "learning_rate": 5.7939359323791465e-08, + "loss": 0.6105, + "step": 7934 + }, + { + "epoch": 0.97493549576115, + "grad_norm": 1.1489300293536928, + "learning_rate": 5.737583572705041e-08, + "loss": 0.6399, + "step": 7935 + }, + { + "epoch": 0.9750583609780071, + "grad_norm": 1.1700725300190982, + "learning_rate": 5.681506072911957e-08, + "loss": 0.6184, + "step": 7936 + }, + { + "epoch": 0.9751812261948642, + "grad_norm": 1.235799962506757, + "learning_rate": 5.6257034433148115e-08, + "loss": 0.4631, + "step": 7937 + }, + { + "epoch": 0.9753040914117214, + "grad_norm": 1.0913657431826391, + "learning_rate": 5.570175694178226e-08, + "loss": 0.5921, + "step": 7938 + }, + { + "epoch": 0.9754269566285785, + "grad_norm": 1.0760461238397525, + "learning_rate": 5.5149228357160296e-08, + "loss": 0.6713, + "step": 7939 + }, + { + "epoch": 0.9755498218454356, + "grad_norm": 1.2242991904488805, + "learning_rate": 5.459944878091761e-08, + "loss": 0.5842, + "step": 7940 + }, + { + "epoch": 0.9756726870622927, + "grad_norm": 1.2327749684788447, + "learning_rate": 5.405241831418162e-08, + "loss": 0.4756, + "step": 7941 + }, + { + "epoch": 0.9757955522791498, + "grad_norm": 1.068152234292588, + "learning_rate": 5.350813705757518e-08, + "loss": 0.603, + "step": 7942 + }, + { + "epoch": 0.9759184174960068, + "grad_norm": 1.1897083274468803, + "learning_rate": 5.2966605111214874e-08, + "loss": 0.4709, + "step": 7943 + }, + { + "epoch": 0.976041282712864, + "grad_norm": 1.313111495341666, + "learning_rate": 5.242782257471268e-08, + "loss": 0.4953, + "step": 7944 + }, + { + "epoch": 0.9761641479297211, + "grad_norm": 1.0800561987648858, + "learning_rate": 5.189178954717599e-08, + "loss": 0.6013, + "step": 7945 + }, + { + "epoch": 0.9762870131465782, + "grad_norm": 1.3389974771336113, + "learning_rate": 5.135850612720094e-08, + "loss": 0.5721, + "step": 7946 + }, + { + "epoch": 0.9764098783634353, + "grad_norm": 1.3572087239177455, + "learning_rate": 5.082797241288406e-08, + "loss": 0.671, + "step": 7947 + }, + { + "epoch": 0.9765327435802924, + "grad_norm": 1.3453947114069504, + "learning_rate": 5.030018850181228e-08, + "loss": 0.6102, + "step": 7948 + }, + { + "epoch": 0.9766556087971495, + "grad_norm": 1.3591428248387296, + "learning_rate": 4.977515449106962e-08, + "loss": 0.5486, + "step": 7949 + }, + { + "epoch": 0.9767784740140066, + "grad_norm": 0.9631579693292484, + "learning_rate": 4.925287047723048e-08, + "loss": 0.5603, + "step": 7950 + }, + { + "epoch": 0.9769013392308638, + "grad_norm": 1.2568657638678462, + "learning_rate": 4.8733336556368024e-08, + "loss": 0.514, + "step": 7951 + }, + { + "epoch": 0.9770242044477209, + "grad_norm": 1.175602890883501, + "learning_rate": 4.82165528240458e-08, + 
"loss": 0.5091, + "step": 7952 + }, + { + "epoch": 0.977147069664578, + "grad_norm": 1.0622539234544548, + "learning_rate": 4.770251937532277e-08, + "loss": 0.5107, + "step": 7953 + }, + { + "epoch": 0.9772699348814351, + "grad_norm": 1.0745401143223374, + "learning_rate": 4.719123630475164e-08, + "loss": 0.545, + "step": 7954 + }, + { + "epoch": 0.9773928000982922, + "grad_norm": 1.5821311719041475, + "learning_rate": 4.66827037063805e-08, + "loss": 0.6015, + "step": 7955 + }, + { + "epoch": 0.9775156653151493, + "grad_norm": 1.0583656333207705, + "learning_rate": 4.6176921673751204e-08, + "loss": 0.6337, + "step": 7956 + }, + { + "epoch": 0.9776385305320064, + "grad_norm": 1.1070197926275904, + "learning_rate": 4.567389029989599e-08, + "loss": 0.617, + "step": 7957 + }, + { + "epoch": 0.9777613957488634, + "grad_norm": 1.0679210824532126, + "learning_rate": 4.517360967734918e-08, + "loss": 0.5336, + "step": 7958 + }, + { + "epoch": 0.9778842609657206, + "grad_norm": 1.3410533541143352, + "learning_rate": 4.467607989812883e-08, + "loss": 0.6369, + "step": 7959 + }, + { + "epoch": 0.9780071261825777, + "grad_norm": 1.030391663776545, + "learning_rate": 4.418130105375673e-08, + "loss": 0.6517, + "step": 7960 + }, + { + "epoch": 0.9781299913994348, + "grad_norm": 0.9959157031611443, + "learning_rate": 4.368927323524174e-08, + "loss": 0.5897, + "step": 7961 + }, + { + "epoch": 0.9782528566162919, + "grad_norm": 1.3050964357223351, + "learning_rate": 4.3199996533089815e-08, + "loss": 0.5397, + "step": 7962 + }, + { + "epoch": 0.978375721833149, + "grad_norm": 0.8896088758523613, + "learning_rate": 4.271347103730061e-08, + "loss": 0.6142, + "step": 7963 + }, + { + "epoch": 0.9784985870500061, + "grad_norm": 1.2023294808831217, + "learning_rate": 4.222969683736755e-08, + "loss": 0.3741, + "step": 7964 + }, + { + "epoch": 0.9786214522668633, + "grad_norm": 1.1933642361404244, + "learning_rate": 4.1748674022276114e-08, + "loss": 0.5069, + "step": 7965 + }, + { + "epoch": 0.9787443174837204, + "grad_norm": 1.2872590561845276, + "learning_rate": 4.127040268050886e-08, + "loss": 0.5833, + "step": 7966 + }, + { + "epoch": 0.9788671827005775, + "grad_norm": 1.1323445563720687, + "learning_rate": 4.0794882900040406e-08, + "loss": 0.5304, + "step": 7967 + }, + { + "epoch": 0.9789900479174346, + "grad_norm": 1.2654883498556326, + "learning_rate": 4.032211476833914e-08, + "loss": 0.5629, + "step": 7968 + }, + { + "epoch": 0.9791129131342917, + "grad_norm": 1.2941472860593901, + "learning_rate": 3.985209837236881e-08, + "loss": 0.517, + "step": 7969 + }, + { + "epoch": 0.9792357783511488, + "grad_norm": 1.2425996545586697, + "learning_rate": 3.93848337985836e-08, + "loss": 0.5414, + "step": 7970 + }, + { + "epoch": 0.979358643568006, + "grad_norm": 1.3570834239302219, + "learning_rate": 3.892032113293642e-08, + "loss": 0.7001, + "step": 7971 + }, + { + "epoch": 0.979481508784863, + "grad_norm": 1.3417121385509694, + "learning_rate": 3.845856046086893e-08, + "loss": 0.4966, + "step": 7972 + }, + { + "epoch": 0.9796043740017201, + "grad_norm": 1.08750849704422, + "learning_rate": 3.799955186732151e-08, + "loss": 0.6133, + "step": 7973 + }, + { + "epoch": 0.9797272392185772, + "grad_norm": 1.3516878253637432, + "learning_rate": 3.7543295436723304e-08, + "loss": 0.6282, + "step": 7974 + }, + { + "epoch": 0.9798501044354343, + "grad_norm": 1.2026392993506154, + "learning_rate": 3.7089791253002156e-08, + "loss": 0.5349, + "step": 7975 + }, + { + "epoch": 0.9799729696522914, + "grad_norm": 1.7867146346854232, + 
"learning_rate": 3.6639039399574694e-08, + "loss": 0.5569, + "step": 7976 + }, + { + "epoch": 0.9800958348691485, + "grad_norm": 1.1463882722807537, + "learning_rate": 3.6191039959356245e-08, + "loss": 0.5784, + "step": 7977 + }, + { + "epoch": 0.9802187000860056, + "grad_norm": 1.5999679377963991, + "learning_rate": 3.574579301475256e-08, + "loss": 0.6132, + "step": 7978 + }, + { + "epoch": 0.9803415653028628, + "grad_norm": 1.158497689802478, + "learning_rate": 3.530329864766313e-08, + "loss": 0.6096, + "step": 7979 + }, + { + "epoch": 0.9804644305197199, + "grad_norm": 1.0769792176614135, + "learning_rate": 3.4863556939482846e-08, + "loss": 0.5246, + "step": 7980 + }, + { + "epoch": 0.980587295736577, + "grad_norm": 0.9578720939662675, + "learning_rate": 3.4426567971097e-08, + "loss": 0.6509, + "step": 7981 + }, + { + "epoch": 0.9807101609534341, + "grad_norm": 1.2122610858541727, + "learning_rate": 3.39923318228913e-08, + "loss": 0.581, + "step": 7982 + }, + { + "epoch": 0.9808330261702912, + "grad_norm": 1.3255379578308624, + "learning_rate": 3.3560848574736845e-08, + "loss": 0.5788, + "step": 7983 + }, + { + "epoch": 0.9809558913871483, + "grad_norm": 1.1930225220959974, + "learning_rate": 3.313211830600349e-08, + "loss": 0.5971, + "step": 7984 + }, + { + "epoch": 0.9810787566040055, + "grad_norm": 1.008919057693785, + "learning_rate": 3.270614109555314e-08, + "loss": 0.5792, + "step": 7985 + }, + { + "epoch": 0.9812016218208626, + "grad_norm": 1.116604457057502, + "learning_rate": 3.228291702174313e-08, + "loss": 0.5848, + "step": 7986 + }, + { + "epoch": 0.9813244870377196, + "grad_norm": 1.4768299633623003, + "learning_rate": 3.1862446162421176e-08, + "loss": 0.5909, + "step": 7987 + }, + { + "epoch": 0.9814473522545767, + "grad_norm": 0.9052940825873653, + "learning_rate": 3.144472859493042e-08, + "loss": 0.6401, + "step": 7988 + }, + { + "epoch": 0.9815702174714338, + "grad_norm": 1.3580090879637217, + "learning_rate": 3.1029764396106055e-08, + "loss": 0.676, + "step": 7989 + }, + { + "epoch": 0.9816930826882909, + "grad_norm": 1.5065730431542972, + "learning_rate": 3.061755364228036e-08, + "loss": 0.6013, + "step": 7990 + }, + { + "epoch": 0.981815947905148, + "grad_norm": 1.0822583020195302, + "learning_rate": 3.020809640927602e-08, + "loss": 0.6047, + "step": 7991 + }, + { + "epoch": 0.9819388131220051, + "grad_norm": 1.1678185295095227, + "learning_rate": 2.9801392772409453e-08, + "loss": 0.5539, + "step": 7992 + }, + { + "epoch": 0.9820616783388623, + "grad_norm": 1.3103490886477447, + "learning_rate": 2.9397442806492482e-08, + "loss": 0.5965, + "step": 7993 + }, + { + "epoch": 0.9821845435557194, + "grad_norm": 1.3160897809879666, + "learning_rate": 2.8996246585827335e-08, + "loss": 0.4586, + "step": 7994 + }, + { + "epoch": 0.9823074087725765, + "grad_norm": 1.059804929204877, + "learning_rate": 2.859780418421165e-08, + "loss": 0.5156, + "step": 7995 + }, + { + "epoch": 0.9824302739894336, + "grad_norm": 1.2021995691119096, + "learning_rate": 2.8202115674938468e-08, + "loss": 0.5525, + "step": 7996 + }, + { + "epoch": 0.9825531392062907, + "grad_norm": 1.122114966080332, + "learning_rate": 2.7809181130789562e-08, + "loss": 0.6372, + "step": 7997 + }, + { + "epoch": 0.9826760044231478, + "grad_norm": 1.3181658357819488, + "learning_rate": 2.7419000624043787e-08, + "loss": 0.6736, + "step": 7998 + }, + { + "epoch": 0.982798869640005, + "grad_norm": 1.0298937972038045, + "learning_rate": 2.7031574226472066e-08, + "loss": 0.6448, + "step": 7999 + }, + { + "epoch": 
0.9829217348568621, + "grad_norm": 1.2091202816441327, + "learning_rate": 2.6646902009339057e-08, + "loss": 0.6068, + "step": 8000 + }, + { + "epoch": 0.9830446000737191, + "grad_norm": 1.146241201450024, + "learning_rate": 2.626498404340316e-08, + "loss": 0.5511, + "step": 8001 + }, + { + "epoch": 0.9831674652905762, + "grad_norm": 1.5853153464112946, + "learning_rate": 2.5885820398916516e-08, + "loss": 0.5442, + "step": 8002 + }, + { + "epoch": 0.9832903305074333, + "grad_norm": 1.2505045186058528, + "learning_rate": 2.5509411145621665e-08, + "loss": 0.5351, + "step": 8003 + }, + { + "epoch": 0.9834131957242904, + "grad_norm": 1.3015526576547096, + "learning_rate": 2.5135756352756555e-08, + "loss": 0.4487, + "step": 8004 + }, + { + "epoch": 0.9835360609411475, + "grad_norm": 1.3422090225823184, + "learning_rate": 2.4764856089054544e-08, + "loss": 0.4901, + "step": 8005 + }, + { + "epoch": 0.9836589261580047, + "grad_norm": 1.0367906110222358, + "learning_rate": 2.4396710422739387e-08, + "loss": 0.5757, + "step": 8006 + }, + { + "epoch": 0.9837817913748618, + "grad_norm": 1.2785086612035967, + "learning_rate": 2.4031319421530252e-08, + "loss": 0.5972, + "step": 8007 + }, + { + "epoch": 0.9839046565917189, + "grad_norm": 1.1801608054858275, + "learning_rate": 2.366868315263504e-08, + "loss": 0.4894, + "step": 8008 + }, + { + "epoch": 0.984027521808576, + "grad_norm": 1.2047013576955874, + "learning_rate": 2.330880168276206e-08, + "loss": 0.5826, + "step": 8009 + }, + { + "epoch": 0.9841503870254331, + "grad_norm": 1.3211801042268811, + "learning_rate": 2.2951675078108357e-08, + "loss": 0.5803, + "step": 8010 + }, + { + "epoch": 0.9842732522422902, + "grad_norm": 1.5036403353921273, + "learning_rate": 2.2597303404363058e-08, + "loss": 0.585, + "step": 8011 + }, + { + "epoch": 0.9843961174591473, + "grad_norm": 1.0847191609711093, + "learning_rate": 2.2245686726712346e-08, + "loss": 0.6442, + "step": 8012 + }, + { + "epoch": 0.9845189826760045, + "grad_norm": 1.197202403882334, + "learning_rate": 2.1896825109834486e-08, + "loss": 0.6062, + "step": 8013 + }, + { + "epoch": 0.9846418478928616, + "grad_norm": 1.045142255260294, + "learning_rate": 2.1550718617898145e-08, + "loss": 0.5121, + "step": 8014 + }, + { + "epoch": 0.9847647131097187, + "grad_norm": 1.2950442699398432, + "learning_rate": 2.120736731456907e-08, + "loss": 0.4944, + "step": 8015 + }, + { + "epoch": 0.9848875783265757, + "grad_norm": 1.4577182925995809, + "learning_rate": 2.0866771263003403e-08, + "loss": 0.5893, + "step": 8016 + }, + { + "epoch": 0.9850104435434328, + "grad_norm": 0.9954081424058401, + "learning_rate": 2.0528930525852697e-08, + "loss": 0.6524, + "step": 8017 + }, + { + "epoch": 0.9851333087602899, + "grad_norm": 1.2474771357371692, + "learning_rate": 2.0193845165258906e-08, + "loss": 0.6423, + "step": 8018 + }, + { + "epoch": 0.985256173977147, + "grad_norm": 1.1105295620082798, + "learning_rate": 1.9861515242861062e-08, + "loss": 0.5478, + "step": 8019 + }, + { + "epoch": 0.9853790391940042, + "grad_norm": 1.7241847412940476, + "learning_rate": 1.953194081978693e-08, + "loss": 0.6548, + "step": 8020 + }, + { + "epoch": 0.9855019044108613, + "grad_norm": 1.0615741785387525, + "learning_rate": 1.9205121956661352e-08, + "loss": 0.5874, + "step": 8021 + }, + { + "epoch": 0.9856247696277184, + "grad_norm": 1.2052386574769773, + "learning_rate": 1.8881058713599577e-08, + "loss": 0.4678, + "step": 8022 + }, + { + "epoch": 0.9857476348445755, + "grad_norm": 1.352163825361923, + "learning_rate": 
1.855975115021058e-08, + "loss": 0.5425, + "step": 8023 + }, + { + "epoch": 0.9858705000614326, + "grad_norm": 1.1690527519829623, + "learning_rate": 1.824119932559709e-08, + "loss": 0.5392, + "step": 8024 + }, + { + "epoch": 0.9859933652782897, + "grad_norm": 1.101156659463525, + "learning_rate": 1.792540329835557e-08, + "loss": 0.5938, + "step": 8025 + }, + { + "epoch": 0.9861162304951469, + "grad_norm": 1.0497699161702165, + "learning_rate": 1.7612363126572883e-08, + "loss": 0.4782, + "step": 8026 + }, + { + "epoch": 0.986239095712004, + "grad_norm": 1.2807776718229729, + "learning_rate": 1.730207886783297e-08, + "loss": 0.6416, + "step": 8027 + }, + { + "epoch": 0.9863619609288611, + "grad_norm": 1.3824338193359418, + "learning_rate": 1.699455057920851e-08, + "loss": 0.6204, + "step": 8028 + }, + { + "epoch": 0.9864848261457182, + "grad_norm": 1.3701888639593096, + "learning_rate": 1.6689778317269254e-08, + "loss": 0.4473, + "step": 8029 + }, + { + "epoch": 0.9866076913625752, + "grad_norm": 1.291171584184624, + "learning_rate": 1.6387762138075358e-08, + "loss": 0.5684, + "step": 8030 + }, + { + "epoch": 0.9867305565794323, + "grad_norm": 1.4069802423509112, + "learning_rate": 1.6088502097179047e-08, + "loss": 0.6201, + "step": 8031 + }, + { + "epoch": 0.9868534217962894, + "grad_norm": 1.461091691568767, + "learning_rate": 1.5791998249629625e-08, + "loss": 0.532, + "step": 8032 + }, + { + "epoch": 0.9869762870131465, + "grad_norm": 1.4011663285862548, + "learning_rate": 1.5498250649965128e-08, + "loss": 0.494, + "step": 8033 + }, + { + "epoch": 0.9870991522300037, + "grad_norm": 1.2151867134185521, + "learning_rate": 1.520725935222067e-08, + "loss": 0.5002, + "step": 8034 + }, + { + "epoch": 0.9872220174468608, + "grad_norm": 1.1515237826510736, + "learning_rate": 1.49190244099201e-08, + "loss": 0.4859, + "step": 8035 + }, + { + "epoch": 0.9873448826637179, + "grad_norm": 1.03410070250484, + "learning_rate": 1.4633545876084342e-08, + "loss": 0.5838, + "step": 8036 + }, + { + "epoch": 0.987467747880575, + "grad_norm": 1.3148321917928756, + "learning_rate": 1.4350823803224721e-08, + "loss": 0.6352, + "step": 8037 + }, + { + "epoch": 0.9875906130974321, + "grad_norm": 1.0877545873442767, + "learning_rate": 1.4070858243344641e-08, + "loss": 0.5652, + "step": 8038 + }, + { + "epoch": 0.9877134783142892, + "grad_norm": 1.4480697564781828, + "learning_rate": 1.3793649247942909e-08, + "loss": 0.5722, + "step": 8039 + }, + { + "epoch": 0.9878363435311464, + "grad_norm": 1.1001578773628062, + "learning_rate": 1.3519196868010398e-08, + "loss": 0.5908, + "step": 8040 + }, + { + "epoch": 0.9879592087480035, + "grad_norm": 1.1270821010951588, + "learning_rate": 1.3247501154031727e-08, + "loss": 0.5378, + "step": 8041 + }, + { + "epoch": 0.9880820739648606, + "grad_norm": 1.2983532840943826, + "learning_rate": 1.297856215598192e-08, + "loss": 0.5569, + "step": 8042 + }, + { + "epoch": 0.9882049391817177, + "grad_norm": 0.946385960171128, + "learning_rate": 1.2712379923331407e-08, + "loss": 0.5166, + "step": 8043 + }, + { + "epoch": 0.9883278043985748, + "grad_norm": 1.1529525483906802, + "learning_rate": 1.2448954505042686e-08, + "loss": 0.5381, + "step": 8044 + }, + { + "epoch": 0.9884506696154318, + "grad_norm": 1.2930677968355404, + "learning_rate": 1.2188285949571998e-08, + "loss": 0.6993, + "step": 8045 + }, + { + "epoch": 0.9885735348322889, + "grad_norm": 1.279952365652824, + "learning_rate": 1.1930374304865988e-08, + "loss": 0.4988, + "step": 8046 + }, + { + "epoch": 0.988696400049146, + 
"grad_norm": 1.255779286030602, + "learning_rate": 1.1675219618366706e-08, + "loss": 0.6135, + "step": 8047 + }, + { + "epoch": 0.9888192652660032, + "grad_norm": 1.0888947647310458, + "learning_rate": 1.1422821937008276e-08, + "loss": 0.6865, + "step": 8048 + }, + { + "epoch": 0.9889421304828603, + "grad_norm": 1.3028248344196969, + "learning_rate": 1.1173181307216896e-08, + "loss": 0.6437, + "step": 8049 + }, + { + "epoch": 0.9890649956997174, + "grad_norm": 1.0345401796797176, + "learning_rate": 1.0926297774912497e-08, + "loss": 0.463, + "step": 8050 + }, + { + "epoch": 0.9891878609165745, + "grad_norm": 1.0782471415112382, + "learning_rate": 1.0682171385508755e-08, + "loss": 0.5508, + "step": 8051 + }, + { + "epoch": 0.9893107261334316, + "grad_norm": 1.3700187691455787, + "learning_rate": 1.0440802183911414e-08, + "loss": 0.5773, + "step": 8052 + }, + { + "epoch": 0.9894335913502887, + "grad_norm": 1.3831200546951379, + "learning_rate": 1.0202190214516626e-08, + "loss": 0.5857, + "step": 8053 + }, + { + "epoch": 0.9895564565671459, + "grad_norm": 1.1532807383274042, + "learning_rate": 9.966335521215953e-09, + "loss": 0.4818, + "step": 8054 + }, + { + "epoch": 0.989679321784003, + "grad_norm": 1.7486969928729637, + "learning_rate": 9.733238147394685e-09, + "loss": 0.7503, + "step": 8055 + }, + { + "epoch": 0.9898021870008601, + "grad_norm": 1.0789812101709106, + "learning_rate": 9.502898135930194e-09, + "loss": 0.6643, + "step": 8056 + }, + { + "epoch": 0.9899250522177172, + "grad_norm": 1.256241760451359, + "learning_rate": 9.275315529188588e-09, + "loss": 0.5815, + "step": 8057 + }, + { + "epoch": 0.9900479174345743, + "grad_norm": 0.9913749750296283, + "learning_rate": 9.050490369036379e-09, + "loss": 0.6511, + "step": 8058 + }, + { + "epoch": 0.9901707826514313, + "grad_norm": 1.193731879382614, + "learning_rate": 8.828422696825488e-09, + "loss": 0.5444, + "step": 8059 + }, + { + "epoch": 0.9902936478682884, + "grad_norm": 1.1522595968898106, + "learning_rate": 8.609112553406573e-09, + "loss": 0.553, + "step": 8060 + }, + { + "epoch": 0.9904165130851456, + "grad_norm": 1.1922913072739427, + "learning_rate": 8.392559979117365e-09, + "loss": 0.6307, + "step": 8061 + }, + { + "epoch": 0.9905393783020027, + "grad_norm": 1.0885113901487542, + "learning_rate": 8.178765013792665e-09, + "loss": 0.5182, + "step": 8062 + }, + { + "epoch": 0.9906622435188598, + "grad_norm": 1.059366582473548, + "learning_rate": 7.967727696761019e-09, + "loss": 0.4716, + "step": 8063 + }, + { + "epoch": 0.9907851087357169, + "grad_norm": 1.1331354244885037, + "learning_rate": 7.759448066836373e-09, + "loss": 0.7602, + "step": 8064 + }, + { + "epoch": 0.990907973952574, + "grad_norm": 1.2511727757546656, + "learning_rate": 7.553926162334745e-09, + "loss": 0.5693, + "step": 8065 + }, + { + "epoch": 0.9910308391694311, + "grad_norm": 1.0254268663095505, + "learning_rate": 7.351162021059232e-09, + "loss": 0.576, + "step": 8066 + }, + { + "epoch": 0.9911537043862882, + "grad_norm": 1.2346048819449105, + "learning_rate": 7.151155680304999e-09, + "loss": 0.5562, + "step": 8067 + }, + { + "epoch": 0.9912765696031454, + "grad_norm": 1.1295040279289164, + "learning_rate": 6.953907176864283e-09, + "loss": 0.532, + "step": 8068 + }, + { + "epoch": 0.9913994348200025, + "grad_norm": 1.3487594159123066, + "learning_rate": 6.759416547019725e-09, + "loss": 0.594, + "step": 8069 + }, + { + "epoch": 0.9915223000368596, + "grad_norm": 1.141790709939659, + "learning_rate": 6.567683826546045e-09, + "loss": 0.5862, + "step": 8070 + 
}, + { + "epoch": 0.9916451652537167, + "grad_norm": 1.0129926502007005, + "learning_rate": 6.37870905071003e-09, + "loss": 0.5676, + "step": 8071 + }, + { + "epoch": 0.9917680304705738, + "grad_norm": 1.2319202432729983, + "learning_rate": 6.192492254273874e-09, + "loss": 0.5399, + "step": 8072 + }, + { + "epoch": 0.9918908956874309, + "grad_norm": 1.358428666072059, + "learning_rate": 6.009033471491842e-09, + "loss": 0.5432, + "step": 8073 + }, + { + "epoch": 0.992013760904288, + "grad_norm": 1.2703686594197823, + "learning_rate": 5.828332736106945e-09, + "loss": 0.591, + "step": 8074 + }, + { + "epoch": 0.9921366261211451, + "grad_norm": 1.2566480489320797, + "learning_rate": 5.650390081359258e-09, + "loss": 0.5864, + "step": 8075 + }, + { + "epoch": 0.9922594913380022, + "grad_norm": 1.4061430826771173, + "learning_rate": 5.4752055399825975e-09, + "loss": 0.6404, + "step": 8076 + }, + { + "epoch": 0.9923823565548593, + "grad_norm": 1.1273950435925322, + "learning_rate": 5.302779144197856e-09, + "loss": 0.5398, + "step": 8077 + }, + { + "epoch": 0.9925052217717164, + "grad_norm": 1.149868913475122, + "learning_rate": 5.1331109257229945e-09, + "loss": 0.5924, + "step": 8078 + }, + { + "epoch": 0.9926280869885735, + "grad_norm": 0.894895228330755, + "learning_rate": 4.966200915766383e-09, + "loss": 0.6267, + "step": 8079 + }, + { + "epoch": 0.9927509522054306, + "grad_norm": 1.3657698253822583, + "learning_rate": 4.802049145031795e-09, + "loss": 0.6386, + "step": 8080 + }, + { + "epoch": 0.9928738174222878, + "grad_norm": 1.3283900160039688, + "learning_rate": 4.640655643713409e-09, + "loss": 0.5925, + "step": 8081 + }, + { + "epoch": 0.9929966826391449, + "grad_norm": 1.2846436957105956, + "learning_rate": 4.482020441497481e-09, + "loss": 0.6135, + "step": 8082 + }, + { + "epoch": 0.993119547856002, + "grad_norm": 1.229760704485686, + "learning_rate": 4.326143567564e-09, + "loss": 0.561, + "step": 8083 + }, + { + "epoch": 0.9932424130728591, + "grad_norm": 1.2561293313153818, + "learning_rate": 4.173025050586699e-09, + "loss": 0.5363, + "step": 8084 + }, + { + "epoch": 0.9933652782897162, + "grad_norm": 1.2379941230131533, + "learning_rate": 4.022664918729713e-09, + "loss": 0.4974, + "step": 8085 + }, + { + "epoch": 0.9934881435065733, + "grad_norm": 1.1089662326132168, + "learning_rate": 3.875063199650919e-09, + "loss": 0.6001, + "step": 8086 + }, + { + "epoch": 0.9936110087234304, + "grad_norm": 0.9836847990763243, + "learning_rate": 3.730219920501932e-09, + "loss": 0.5979, + "step": 8087 + }, + { + "epoch": 0.9937338739402876, + "grad_norm": 1.3118670908057515, + "learning_rate": 3.5881351079247725e-09, + "loss": 0.603, + "step": 8088 + }, + { + "epoch": 0.9938567391571446, + "grad_norm": 1.1592653635999126, + "learning_rate": 3.448808788053537e-09, + "loss": 0.5075, + "step": 8089 + }, + { + "epoch": 0.9939796043740017, + "grad_norm": 1.5547443906199319, + "learning_rate": 3.312240986519388e-09, + "loss": 0.6175, + "step": 8090 + }, + { + "epoch": 0.9941024695908588, + "grad_norm": 1.2245728276802659, + "learning_rate": 3.1784317284405675e-09, + "loss": 0.5673, + "step": 8091 + }, + { + "epoch": 0.9942253348077159, + "grad_norm": 1.121501165908033, + "learning_rate": 3.0473810384323843e-09, + "loss": 0.5212, + "step": 8092 + }, + { + "epoch": 0.994348200024573, + "grad_norm": 1.4546621802910542, + "learning_rate": 2.9190889406005562e-09, + "loss": 0.7227, + "step": 8093 + }, + { + "epoch": 0.9944710652414301, + "grad_norm": 1.206607802667171, + "learning_rate": 
2.7935554585412083e-09, + "loss": 0.5336, + "step": 8094 + }, + { + "epoch": 0.9945939304582873, + "grad_norm": 1.1905174298427603, + "learning_rate": 2.6707806153475347e-09, + "loss": 0.5954, + "step": 8095 + }, + { + "epoch": 0.9947167956751444, + "grad_norm": 1.3660613361689262, + "learning_rate": 2.5507644336014713e-09, + "loss": 0.4851, + "step": 8096 + }, + { + "epoch": 0.9948396608920015, + "grad_norm": 1.515088390666289, + "learning_rate": 2.4335069353820238e-09, + "loss": 0.5198, + "step": 8097 + }, + { + "epoch": 0.9949625261088586, + "grad_norm": 1.2389341530846003, + "learning_rate": 2.3190081422569398e-09, + "loss": 0.6061, + "step": 8098 + }, + { + "epoch": 0.9950853913257157, + "grad_norm": 1.3230517596742473, + "learning_rate": 2.2072680752843745e-09, + "loss": 0.5587, + "step": 8099 + }, + { + "epoch": 0.9952082565425728, + "grad_norm": 1.273574959961994, + "learning_rate": 2.0982867550228822e-09, + "loss": 0.6437, + "step": 8100 + }, + { + "epoch": 0.99533112175943, + "grad_norm": 1.3290024029323153, + "learning_rate": 1.9920642015164305e-09, + "loss": 0.6371, + "step": 8101 + }, + { + "epoch": 0.9954539869762871, + "grad_norm": 1.2081423261001714, + "learning_rate": 1.8886004343043885e-09, + "loss": 0.6055, + "step": 8102 + }, + { + "epoch": 0.9955768521931441, + "grad_norm": 1.2185807134160314, + "learning_rate": 1.7878954724165342e-09, + "loss": 0.5698, + "step": 8103 + }, + { + "epoch": 0.9956997174100012, + "grad_norm": 1.1682743303340581, + "learning_rate": 1.6899493343797146e-09, + "loss": 0.5024, + "step": 8104 + }, + { + "epoch": 0.9958225826268583, + "grad_norm": 1.644492469946242, + "learning_rate": 1.5947620382095185e-09, + "loss": 0.6892, + "step": 8105 + }, + { + "epoch": 0.9959454478437154, + "grad_norm": 1.1627091033293284, + "learning_rate": 1.5023336014152734e-09, + "loss": 0.6729, + "step": 8106 + }, + { + "epoch": 0.9960683130605725, + "grad_norm": 0.9518923304789875, + "learning_rate": 1.412664040996714e-09, + "loss": 0.5794, + "step": 8107 + }, + { + "epoch": 0.9961911782774296, + "grad_norm": 1.1649882565587246, + "learning_rate": 1.325753373448979e-09, + "loss": 0.5122, + "step": 8108 + }, + { + "epoch": 0.9963140434942868, + "grad_norm": 1.4891176846848169, + "learning_rate": 1.2416016147609454e-09, + "loss": 0.5772, + "step": 8109 + }, + { + "epoch": 0.9964369087111439, + "grad_norm": 1.1507961867299497, + "learning_rate": 1.160208780408567e-09, + "loss": 0.5582, + "step": 8110 + }, + { + "epoch": 0.996559773928001, + "grad_norm": 1.3381867488167045, + "learning_rate": 1.0815748853648666e-09, + "loss": 0.6928, + "step": 8111 + }, + { + "epoch": 0.9966826391448581, + "grad_norm": 1.2678745470480035, + "learning_rate": 1.00569994409494e-09, + "loss": 0.4518, + "step": 8112 + }, + { + "epoch": 0.9968055043617152, + "grad_norm": 1.0056908690486248, + "learning_rate": 9.325839705542904e-10, + "loss": 0.5258, + "step": 8113 + }, + { + "epoch": 0.9969283695785723, + "grad_norm": 2.6414259662547233, + "learning_rate": 8.622269781921599e-10, + "loss": 0.6567, + "step": 8114 + }, + { + "epoch": 0.9970512347954295, + "grad_norm": 1.230066034200606, + "learning_rate": 7.946289799515282e-10, + "loss": 0.6774, + "step": 8115 + }, + { + "epoch": 0.9971741000122866, + "grad_norm": 1.2614608407967711, + "learning_rate": 7.297899882641179e-10, + "loss": 0.5443, + "step": 8116 + }, + { + "epoch": 0.9972969652291437, + "grad_norm": 1.2970510495321734, + "learning_rate": 6.677100150587201e-10, + "loss": 0.4728, + "step": 8117 + }, + { + "epoch": 0.9974198304460007, + 
"grad_norm": 1.2892208367019529, + "learning_rate": 6.083890717545337e-10, + "loss": 0.4951, + "step": 8118 + }, + { + "epoch": 0.9975426956628578, + "grad_norm": 1.2145295710459174, + "learning_rate": 5.518271692628308e-10, + "loss": 0.5616, + "step": 8119 + }, + { + "epoch": 0.9976655608797149, + "grad_norm": 1.4451372381912968, + "learning_rate": 4.980243179869559e-10, + "loss": 0.6648, + "step": 8120 + }, + { + "epoch": 0.997788426096572, + "grad_norm": 1.5163099524470016, + "learning_rate": 4.4698052782399244e-10, + "loss": 0.5214, + "step": 8121 + }, + { + "epoch": 0.9979112913134291, + "grad_norm": 1.5005740637742002, + "learning_rate": 3.986958081647618e-10, + "loss": 0.6044, + "step": 8122 + }, + { + "epoch": 0.9980341565302863, + "grad_norm": 1.1093454080536318, + "learning_rate": 3.5317016788882773e-10, + "loss": 0.5659, + "step": 8123 + }, + { + "epoch": 0.9981570217471434, + "grad_norm": 1.3680888011948018, + "learning_rate": 3.1040361536949223e-10, + "loss": 0.5435, + "step": 8124 + }, + { + "epoch": 0.9982798869640005, + "grad_norm": 1.1747927808944483, + "learning_rate": 2.703961584771264e-10, + "loss": 0.5416, + "step": 8125 + }, + { + "epoch": 0.9984027521808576, + "grad_norm": 1.571899306684621, + "learning_rate": 2.331478045691782e-10, + "loss": 0.6808, + "step": 8126 + }, + { + "epoch": 0.9985256173977147, + "grad_norm": 1.1213493827740544, + "learning_rate": 1.986585604951685e-10, + "loss": 0.5923, + "step": 8127 + }, + { + "epoch": 0.9986484826145718, + "grad_norm": 1.3563344396482295, + "learning_rate": 1.6692843260168734e-10, + "loss": 0.6233, + "step": 8128 + }, + { + "epoch": 0.998771347831429, + "grad_norm": 1.0785520360116365, + "learning_rate": 1.3795742672406687e-10, + "loss": 0.5568, + "step": 8129 + }, + { + "epoch": 0.9988942130482861, + "grad_norm": 1.2934007494087418, + "learning_rate": 1.1174554819137761e-10, + "loss": 0.5272, + "step": 8130 + }, + { + "epoch": 0.9990170782651432, + "grad_norm": 1.0959046373814003, + "learning_rate": 8.82928018264284e-11, + "loss": 0.547, + "step": 8131 + }, + { + "epoch": 0.9991399434820002, + "grad_norm": 1.2850966928887688, + "learning_rate": 6.759919194077036e-11, + "loss": 0.5341, + "step": 8132 + }, + { + "epoch": 0.9992628086988573, + "grad_norm": 1.1637352576145872, + "learning_rate": 4.966472234302355e-11, + "loss": 0.6085, + "step": 8133 + }, + { + "epoch": 0.9993856739157144, + "grad_norm": 1.125407046195776, + "learning_rate": 3.4489396332215705e-11, + "loss": 0.597, + "step": 8134 + }, + { + "epoch": 0.9995085391325715, + "grad_norm": 1.7983791458558973, + "learning_rate": 2.2073216697782174e-11, + "loss": 0.6456, + "step": 8135 + }, + { + "epoch": 0.9996314043494287, + "grad_norm": 1.0156589430648115, + "learning_rate": 1.2416185724561935e-11, + "loss": 0.4818, + "step": 8136 + }, + { + "epoch": 0.9997542695662858, + "grad_norm": 1.011594477518567, + "learning_rate": 5.518305189466944e-12, + "loss": 0.5449, + "step": 8137 + }, + { + "epoch": 0.9998771347831429, + "grad_norm": 1.1716177249184163, + "learning_rate": 1.3795763614821155e-12, + "loss": 0.5594, + "step": 8138 + }, + { + "epoch": 1.0, + "grad_norm": 1.2461582729332843, + "learning_rate": 0.0, + "loss": 0.5178, + "step": 8139 + } + ], + "logging_steps": 1, + "max_steps": 8139, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + 
}, + "attributes": {} + } + }, + "total_flos": 168053621296128.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}